In [1]:
from sklearn.preprocessing import Normalizer
import tensorflow as tf
from numpy import array
from numpy import asarray
from numpy import zeros
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf

import warnings
warnings.filterwarnings("ignore")
import gc

import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import math
import lightgbm as lgb
from tqdm.notebook import tqdm
tqdm.pandas()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import scipy
import scipy.sparse
from scipy.sparse import hstack

from sklearn.metrics import mean_squared_log_error
import pickle
import regex as re
import os

!pip install -q pyyaml h5py  # Required to save models in HDF5 format

In [2]:
train_path = '../input/mercari-data-processed/train_processed.pkl'
test_path = '../input/mercari-data-processed/test_processed.pkl'

train_df = pd.read_pickle(train_path)

### Split original train data into sub-train data and test data

In [3]:
X = train_df.drop(['train_id', 'category_name', 'price', 'log_prices'], axis='columns')
y = train_df['log_prices']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1302134, 14) (1302134,)
(144682, 14) (144682,)


### Data Vectorizer

In [5]:
def vectorize_data(col_data, count_vectorizer=None):
    """
    Description:
    This funciton will count vectorize the input column data.

    Input: dataframe column
    Output: one-hot encoded values and the fitted vectorizer
    """
    if count_vectorizer==None:
        count_vectorizer = CountVectorizer()
        count_vectorizer.fit(col_data)
    ohe_data = count_vectorizer.transform(col_data)
    return ohe_data, count_vectorizer

In [6]:
def glove_vectorize_text_data(col):
    """
    Description:
    This funciton will glove vectorize (100 dimensions) the input text data

    Input: dataframe column
    Output: glove vectorized text data
    """
    glove_path = '../input/glove-100d/glove.6B.100d.txt'
    embeddings_index = dict()
    with open(glove_path) as f:
        for line in f:
            w_coef = line.split()
            word = w_coef[0]
            coefs = np.asarray(w_coef[1:], dtype='float32')
            embeddings_index[word]=coefs
    print(len(embeddings_index))

    # Avg Word2Vec
    # compute Avg word2vec for each description.
    avg_w2v_vectors = [] # the tfidf-w2v for each sentence is stored in this list
    for sentence in tqdm(col): # for each sentence
        vector = np.zeros(100) # as word vectors are of zero length
        word_count=0 # num of words with a valid vector in the sentence
        for word in str(sentence).split(): # for each word in a sentence
            if word in embeddings_index.keys():
                vec = embeddings_index[word] # getting the vector for each word
                vector += vec
                word_count += 1
        if word_count != 0:
            vector /= word_count
        avg_w2v_vectors.append(vector)

    return np.array(avg_w2v_vectors)

### Train data

In [7]:
general_cat_ohe_train, general_cat_vectorizer_train = vectorize_data(X_train['general_cat'].values.astype('U'))
subcat_1_ohe_train, subcat_1_vectorizer_train = vectorize_data(X_train['subcat_1'].values.astype('U'))
subcat_2_ohe_train, subcat_2_vectorizer_train = vectorize_data(X_train['subcat_2'].values.astype('U'))
brand_name_ohe_train, brand_name_vectorizer_train = vectorize_data(X_train['brand_name'].values.astype('U'))
item_name_ohe_train, item_name_vectorizer_train = vectorize_data(X_train['name'].values.astype('U'))

In [8]:
avg_w2v_vectors_train = glove_vectorize_text_data(X_train['item_description'].values)
avg_w2v_vectors_train.shape

400001


  0%|          | 0/1302134 [00:00<?, ?it/s]

(1302134, 100)

In [9]:
X_train_stack = hstack((general_cat_ohe_train, subcat_1_ohe_train, subcat_2_ohe_train, brand_name_ohe_train, item_name_ohe_train, avg_w2v_vectors_train, 
               X_train['desc_neg'].values.reshape(-1,1), 
               X_train['desc_neu'].values.reshape(-1,1),
               X_train['desc_pos'].values.reshape(-1,1),
               X_train['scaler_desc_len'].values.reshape(-1,1),
               X_train['scaler_name_len'].values.reshape(-1,1), 
               X_train['item_condition_id'].values.reshape(-1,1), 
               X_train['is_expensive'].values.reshape(-1,1), 
               X_train['shipping'].values.reshape(-1,1))).tocsr()

print("="*50)
print("Final Data matrix:")
print(X_train_stack.shape, y.shape)
print("="*50)

X_train_size = X_train_stack.data.nbytes + X_train_stack.indptr.nbytes + X_train_stack.indices.nbytes
print("Size of X_tr in memory =", int(X_train_size/1024/1024),"MB")

Final Data matrix:
(1302134, 103395) (1446816,)
Size of X_tr in memory = 1722 MB


### Test Data

In [10]:
general_cat_ohe_test, general_cat_vectorizer_test = vectorize_data(X_test['general_cat'].values.astype('U'), general_cat_vectorizer_train)
subcat_1_ohe_test, subcat_1_vector_test = vectorize_data(X_test['subcat_1'].values.astype('U'), subcat_1_vectorizer_train)
subcat_2_ohe_test, subcat_2_vectorizer_test = vectorize_data(X_test['subcat_2'].values.astype('U'), subcat_2_vectorizer_train)
brand_name_ohe_test, brand_name_vectorizer_test = vectorize_data(X_test['brand_name'].values.astype('U'), brand_name_vectorizer_train)
item_name_ohe_test, item_name_vectorizer_test = vectorize_data(X_test['name'].values.astype('U'), item_name_vectorizer_train)

In [11]:
avg_w2v_vectors_test = glove_vectorize_text_data(X_test['item_description'].values)
avg_w2v_vectors_test.shape

400001


  0%|          | 0/144682 [00:00<?, ?it/s]

(144682, 100)

In [12]:
X_test_stack = hstack((general_cat_ohe_test, subcat_1_ohe_test, subcat_2_ohe_test, brand_name_ohe_test, item_name_ohe_test, avg_w2v_vectors_test, 
               X_test['desc_neg'].values.reshape(-1,1), 
               X_test['desc_neu'].values.reshape(-1,1),
               X_test['desc_pos'].values.reshape(-1,1),
               X_test['scaler_desc_len'].values.reshape(-1,1),
               X_test['scaler_name_len'].values.reshape(-1,1), 
               X_test['item_condition_id'].values.reshape(-1,1), 
               X_test['is_expensive'].values.reshape(-1,1), 
               X_test['shipping'].values.reshape(-1,1))).tocsr()

print("="*50)
print("Final Data matrix:")
print(X_test_stack.shape, y_test.shape)
print("="*50)

X_test_stack_size = X_test_stack.data.nbytes + X_test_stack.indptr.nbytes + X_test_stack.indices.nbytes
print("Size of X_te in memory =", int(X_test_stack_size/1024/1024),"MB")

Final Data matrix:
(144682, 103395) (144682,)
Size of X_te in memory = 191 MB


In [13]:
scipy.sparse.save_npz('./mercari_mlp_train_input.npz', X_train_stack)

In [14]:
scipy.sparse.save_npz('./mercari_mlp_test_input.npz', X_test_stack)

In [15]:
scipy.sparse.save_npz('./mercari_mlp_train_labels.npz', scipy.sparse.csr_matrix(y_train).reshape(-1,1))

In [16]:
scipy.sparse.save_npz('./mercari_mlp_test_labels.npz', scipy.sparse.csr_matrix(y_test).reshape(-1,1))

### Split sub-train data into train input and val input

In [17]:
# y_train = np.asarray(y_train)
# y_test = np.asarray(y_test)

In [18]:
# X_tr, X_val, y_tr, y_val = train_test_split(X_train_stack, y_train, test_size=0.10, random_state=42)
# print(X_tr.shape, y_tr.shape)
# print(X_val.shape, y_val.shape)

In [19]:
# scipy.sparse.save_npz('./mercari_mlp_train_input.npz', X_tr)

In [20]:
# scipy.sparse.save_npz('./mercari_mlp_val_input.npz', X_val)

In [21]:
# scipy.sparse.save_npz('./mercari_mlp_test_input.npz', X_te)

#### Converting label to sparse matrix and reshape from (dim1, dim2) to (dim2, dim1) as it's diffirent shape as numpy labels format

In [22]:
# scipy.sparse.save_npz('./mercari_mlp_train_labels.npz', scipy.sparse.csr_matrix(y_tr).reshape(-1,1))

In [23]:
# scipy.sparse.save_npz('./mercari_mlp_val_labels.npz', scipy.sparse.csr_matrix(y_val).reshape(-1,1))

In [24]:
# scipy.sparse.save_npz('./mercari_mlp_test_labels.npz', scipy.sparse.csr_matrix(y_test).reshape(-1,1))