# Dataset & Libraries 

In [None]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import re 
import nltk
nltk.download('stopwords')
from tqdm import tqdm
import unicodedata
import gensim
import sklearn.model_selection as sms
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import numpy as np
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier

In [None]:
# Load the raw inputs from the working directory. The three per-sample
# tables are indexed by their "Id" column so they can be aligned later.
train_df = pd.read_json("train.json")
train_df = train_df.set_index(keys="Id")

test_df = pd.read_json("test.json")
test_df = test_df.set_index(keys="Id")

train_label = pd.read_csv("train_label.csv")
train_label = train_label.set_index(keys="Id")

# Category lookup table; kept with its default integer index.
categories_df = pd.read_csv("categories_string.csv")

In [None]:
# Put the training features and their labels side by side in one frame;
# rows are aligned on the shared "Id" index.
data_train = pd.concat(
    objs=[train_df, train_label],
    axis="columns",
)

# I - Cleaning 

In [None]:
# CleanText comes from the project-local `clean` module (not shown here).
from clean import CleanText
# One cleaner instance, reused for both the train and the test frames.
ct = CleanText()

In [None]:
# Clean the "description" text of the training frame. The return value is
# discarded and "description_cleaned" is read from the frame later, so the
# helper presumably writes that column in place -- confirm in clean.py.
ct.clean_df_column(data_train, "description","description_cleaned")

In [None]:
# Same cleaning step applied to the test frame, with the same source and
# target column names as for the training frame (presumably in place --
# confirm in clean.py).
ct.clean_df_column(test_df, "description","description_cleaned")

# II - Vectorization: TF-IDF

In [None]:
# Hold out 10% of the labelled data for validation (fixed seed for
# reproducibility). The split has to be taken from `data_train`: that is the
# frame the cleaning step was run on (so it is the one expected to hold
# "description_cleaned") and the one the labels were joined into.
# `train_df` is a separate object that has neither.
# NOTE(review): assumes the label column in train_label.csv is "Category".
datatrain, datavalid = sms.train_test_split(
    data_train, test_size=0.1, random_state=42
)

# Learn the TF-IDF vocabulary on the training split only, so that no
# validation text influences the features.
transformer = TfidfVectorizer().fit(datatrain["description_cleaned"].values)
print("NB features: %d" % (len(transformer.vocabulary_)))

# Project both splits into the same TF-IDF feature space.
X_train = transformer.transform(datatrain["description_cleaned"].values)
X_valid = transformer.transform(datavalid["description_cleaned"].values)

# Target labels for each split.
Y_train = datatrain["Category"].values
Y_valid = datavalid["Category"].values

# Last expression of the cell: shows the training matrix summary.
X_train

## Models

### 1) Logistic regression

In [None]:
# Candidate values for the logistic-regression parameter C.
grid = {"C": [0.1, 1, 5, 10]}

# 10-fold cross-validated grid search, model selection on macro-averaged F1.
logreg = LogisticRegression()
logreg_cv = GridSearchCV(
    estimator=logreg,
    param_grid=grid,
    scoring="f1_macro",
    cv=10,
)

# Time the whole search (wall clock).
start_time = time.time()
logreg_cv.fit(X_train, Y_train)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Score the tuned logistic regression on the held-out validation split.
lr_predictions = logreg_cv.predict(X_valid)
f1_score(
    y_true=Y_valid,
    y_pred=lr_predictions,
    average="macro",
)

### 2) MLP

In [None]:
# MLP with hidden_layer_sizes=256; seeded so the run is repeatable.
mlp = MLPClassifier(
    hidden_layer_sizes=256,
    random_state=0,
)

# Time the training (wall clock). `fit` hands back the estimator, which is
# kept under `model_mlp` for the evaluation cell.
start_time = time.time()
model_mlp = mlp.fit(X_train, Y_train)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Score the MLP on the held-out validation split. Predictions must be made
# on X_valid so they line up with Y_valid; no X_test matrix is built in this
# notebook, and the score must read the `predictions_mlp` variable assigned
# on the line above.
predictions_mlp = model_mlp.predict(X_valid)
f1_score(Y_valid, predictions_mlp, average="macro")

### 3) Random Forest

In [None]:
# Forest sizes to compare.
param_grid = {"n_estimators": [500, 800]}

# Seeded random forest tuned with a 3-fold grid search on macro-averaged F1;
# n_jobs=-1 runs the search in parallel and verbose=2 prints progress.
rf = RandomForestClassifier(random_state=0)
rf_grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Time the whole search (wall clock).
ts = time.time()
rf_grid_search.fit(X_train, Y_train)
te = time.time()
temps = te - ts
print("Time =", temps)

In [None]:
# Score the tuned random forest on the held-out validation split. The fitted
# object is `rf_grid_search` (a GridSearchCV predicts with its refitted best
# estimator); predictions are made on X_valid so they line up with Y_valid,
# and the score reads the `predictions_rf` variable assigned on the line above.
predictions_rf = rf_grid_search.predict(X_valid)
f1_score(Y_valid, predictions_rf, average="macro")