# Dataset & Libraries 

In [None]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import re 
from wordcloud import WordCloud
import nltk
nltk.download('stopwords')
from tqdm import tqdm
import unicodedata
import gensim
import sklearn.model_selection as sms
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
import numpy as np
import time
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier

In [None]:
# Load the raw data. The train/test descriptions and the training labels are
# re-indexed by their 'Id' column so they can later be aligned on it; the
# category lookup table keeps its default index.
train_df = pd.read_json("data/train.json")
train_df = train_df.set_index('Id')

test_df = pd.read_json("data/test.json")
test_df = test_df.set_index('Id')

train_label = pd.read_csv("data/train_label.csv")
train_label = train_label.set_index('Id')

categories_df = pd.read_csv("data/categories_string.csv")

In [None]:
# Attach the labels to the training descriptions: a column-wise concat aligns
# the two frames on their shared 'Id' index.
data_train = pd.concat([train_df, train_label], axis="columns")

# I - Cleaning 

In [None]:
# CleanText comes from the project-local `clean` module (not shown here). A
# single instance is created and reused to clean both the training and the
# test descriptions in the next cells.
from clean import CleanText
ct = CleanText()

In [None]:
# Clean the "description" column of the training frame. The return value is
# not assigned and later cells read data_train["description_cleaned"], so the
# helper presumably writes that column in place — confirm in clean.py.
ct.clean_df_column(data_train, "description","description_cleaned")

In [None]:
# Same cleaning step for the test frame. Later cells read
# test_df["description_cleaned"], so the helper presumably writes that column
# in place — confirm in clean.py.
ct.clean_df_column(test_df, "description","description_cleaned")

# II - Vectorization: Word2Vec

## 1) WordEmbedding

In [None]:
# Hyperparameters forwarded to both Word2Vec models trained below (through the
# `args` dict given to WordEmbedding).
# NOTE(review): the meanings noted here follow gensim's Word2Vec parameter
# names; confirm against the WordEmbedding wrapper.
features_dimension = 300  # passed as `size` (presumably the embedding dimension)
min_count = 1  # passed as `min_count` (presumably the minimum word frequency kept)
window = 5  # passed as `window` (presumably the context window size)
hs = 0  # passed as `hs` (presumably 0 = no hierarchical softmax)
negative = 10  # passed as `negative` (presumably the number of negative samples)

In [None]:
# Tokenise every cleaned description by splitting on single spaces. The
# result is one list of tokens per document, for the full training set and
# for the test set.
array_token = [text.split(" ") for text in data_train["description_cleaned"]]
test_array_token = [text.split(" ") for text in test_df["description_cleaned"]]

In [None]:
from word_embedding import WordEmbedding

# Train two Word2Vec models on the tokenised full training corpus. Both runs
# share every hyperparameter except `sg` (1 for the skip-gram run, 0 for the
# CBOW run). `train()` returns the model and the training time, which is
# printed in minutes before the model is saved to disk.

# --- Skip-gram ---
we_sg = WordEmbedding(
    word_embedding_type="word2vec",
    args=dict(
        sentences=array_token,
        sg=1,
        hs=hs,
        negative=negative,
        min_count=min_count,
        size=features_dimension,
        window=window,
        iter=10,
    ),
)
model_sg, training_time_sg = we_sg.train()
print("Model Skip-gram trained in %.2f minutes"%(training_time_sg/60))
model_sg.save("model_sg_100k")

# --- CBOW ---
we_cbow = WordEmbedding(
    word_embedding_type="word2vec",
    args=dict(
        sentences=array_token,
        sg=0,
        hs=hs,
        negative=negative,
        min_count=min_count,
        size=features_dimension,
        window=window,
        iter=10,
    ),
)
model_cbow, training_time_cbow = we_cbow.train()
print("Model CBOW trained in %.2f minutes"%(training_time_cbow/60))
model_cbow.save("model_cbow_100k")



In [None]:
# Hold out 10% of the labelled data for validation (fixed seed so the split is
# reproducible), then tokenise each split by splitting on single spaces.
datatrain, datavalid = sms.train_test_split(
    data_train,
    test_size=0.1,
    random_state=42,
)
train_array_token = [text.split(" ") for text in datatrain["description_cleaned"]]
valid_array_token = [text.split(" ") for text in datavalid["description_cleaned"]]
test_array_token = [text.split(" ") for text in test_df["description_cleaned"]]

### a) CBOW

In [None]:
X_embedded_train_cbow, embedded_conversion_train_time_cbow = WordEmbedding.get_matrix_features_means(array_token, model_cbow)
X_embedded_valid_cbow, embedded_conversion_valid_time_cbow = WordEmbedding.get_matrix_features_means(valid_array_token, model_cbow)
X_embedded_test_cbow, embedded_conversion_test_time_cbow = WordEmbedding.get_matrix_features_means(test_array_token, model_cbow)

### b) Skipgram

In [None]:
X_embedded_train_sg, embedded_conversion_train_time_sg = WordEmbedding.get_matrix_features_means(array_token, model_sg)
X_embedded_valid_sg, embedded_conversion_valid_time_sg = WordEmbedding.get_matrix_features_means(valid_array_token, model_sg)
X_embedded_test_sg, embedded_conversion_test_time_sg = WordEmbedding.get_matrix_features_means(test_array_token, model_sg)

In [None]:
X_train = X_embedded_train_sg
Y_train=datatrain.Category.values
X_valid = X_embedded_valid_sg
Y_valid=datavalid.Category.values
X_test =X_embedded_test_sg

X_train_cbow = X_embedded_train_cbow
Y_train_cbow = data_train.Category.values
X_valid_cbow = X_embedded_valid_cbow
Y_valid_cbow = datavalid.Category.values

## 2) Models

### a) Logistic regression

In [None]:
# Grid search for logistic regression on the skip-gram features: 10-fold
# cross-validation over four values of C, selecting by macro-averaged F1.
grid = {"C": [0.1, 1, 5, 10]}
logreg = LogisticRegression()
logreg_cv = GridSearchCV(
    estimator=logreg,
    param_grid=grid,
    cv=10,
    scoring="f1_macro",
)

In [None]:
# Fit the grid search on the skip-gram training features and report the
# wall-clock duration in seconds (`temps` = elapsed time).
ts = time.time()
logreg_cv.fit(X_train,Y_train)
te=time.time()
temps=te-ts
print("Time =", temps)

In [None]:
# Predict the validation labels with the fitted grid search (by default it
# predicts with the best estimator refit on the whole training set).
predictions = logreg_cv.predict(X_valid)

In [None]:
# Report the macro-averaged F1 on the hold-out split and the selected
# hyperparameters. The score is printed explicitly: a notebook only echoes a
# cell's last expression, so a bare f1_score(...) call followed by a print
# would compute the score and silently discard it.
print(f1_score(Y_valid, predictions, average="macro"))
print(logreg_cv.best_params_)

In [None]:
# Persist the fitted grid-search object. When the cells are run in order,
# logreg_cv is the search fitted on the skip-gram features at this point.
with open('lr_gs_w2v_skipgram.pkl', 'wb') as fid:
    pickle.dump(logreg_cv, fid) 

In [None]:
# Persist only the cross-validation results (cv_results_) of the same
# search in a separate pickle file.
with open('lr_gs_w2v_skipgram_results.pkl', 'wb') as fid:
    pickle.dump(logreg_cv.cv_results_, fid) 

In [None]:
# Grid search for logistic regression on the CBOW features. This rebinds
# `grid`, `logreg` and `logreg_cv`, replacing the skip-gram search objects.
# The search is scored with macro-F1, like the skip-gram search and like the
# metric reported on the validation split; without an explicit `scoring` the
# classifier's default score (accuracy) would drive model selection.
grid = {"C": [0.1, 1, 5, 10]}
logreg = LogisticRegression()
logreg_cv = GridSearchCV(logreg, grid, cv=10, scoring="f1_macro")

In [None]:
# Fit the grid search on the CBOW training features and report the
# wall-clock duration in seconds (`temps` = elapsed time).
ts = time.time()
logreg_cv.fit(X_train_cbow,Y_train_cbow)
te=time.time()
temps=te-ts
print("Time =", temps)

In [None]:
# Predict the validation labels from the CBOW features (this overwrites the
# earlier `predictions`) and compute the macro-averaged F1. The score is the
# cell's last expression, so the notebook displays it.
predictions = logreg_cv.predict(X_valid_cbow)
f1_score(Y_valid_cbow,predictions,average="macro")

In [None]:
# Persist the fitted grid-search object. When the cells are run in order,
# logreg_cv is the search fitted on the CBOW features at this point.
with open('lr_gs_w2v_cbow.pkl', 'wb') as fid:
    pickle.dump(logreg_cv, fid) 

In [None]:
# Persist only the cross-validation results (cv_results_) of the same
# search in a separate pickle file.
with open('lr_gs_w2v_cbow_results.pkl', 'wb') as fid:
    pickle.dump(logreg_cv.cv_results_, fid) 

### b) MLP

In [None]:
# Grid search for a multi-layer perceptron on the skip-gram features.
# The base estimator is created here: the cell previously referenced an
# undefined `mlp`, which raised NameError.
mlp = MLPClassifier()
# Two candidate architectures, each with a single hidden layer (128 or 256 units).
param_grid = {"hidden_layer_sizes": [128, 256]}
# 3-fold CV on all available cores; scored with macro-F1 to match the other
# grid searches and the metric reported on the validation split.
mlp_grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid,
                               cv=3, n_jobs=-1, verbose=2, scoring="f1_macro")
mlp_grid_search = mlp_grid_search.fit(X_train, Y_train)
print(mlp_grid_search.best_params_)

In [None]:
# Predict the validation labels with the fitted MLP search. predict() takes
# only the feature matrix; passing the labels as a second positional argument
# raises TypeError.
mlp_valid = mlp_grid_search.predict(X_valid)
# Macro-averaged F1 against the labels that belong to X_valid. It is the
# cell's last expression, so the notebook displays it.
f1_score(Y_valid, mlp_valid, average="macro")

### c) Random Forest

In [None]:
# Grid search for a random forest on the skip-gram features: two forest
# sizes, 3-fold cross-validation on all cores, selecting by macro-F1.
# `param_grid` is rebound here, replacing the MLP grid.
rf = RandomForestClassifier(random_state=0)
param_grid = {'n_estimators': [500, 800]}
rf_grid_search = GridSearchCV(
    rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring="f1_macro",
)

# Time the fit and report the wall-clock duration in seconds.
ts = time.time()
rf_grid_search.fit(X_train, Y_train)
te = time.time()
temps = te - ts
print("Time =", temps)

In [None]:
# Persist the random-forest grid-search object fitted on the skip-gram features.
with open('rf_gs_w2v_skipgram.pkl', 'wb') as fid:
    pickle.dump(rf_grid_search, fid) 

In [None]:
# Predict with the fitted grid search, not with `rf`: GridSearchCV fits clones
# of the estimator it is given, so `rf` itself is never fitted. predict() also
# takes only the feature matrix, not the labels.
rf_prediction = rf_grid_search.predict(X_valid)
# Macro-averaged F1 on the validation split. It is the cell's last
# expression, so the notebook displays it.
f1_score(Y_valid, rf_prediction, average="macro")