# Test of the Naive Bayes classifiers

## I. On the four categories

In [197]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

In [20]:
# Load the dataset
dataset = pd.read_csv('datasets/new_train.csv', index_col=0)
X = dataset.drop('label',axis=1).values
y = dataset['label'].values

In [21]:
# Standardize features
sc = StandardScaler()
X = sc.fit_transform(X)

In [22]:
# Split the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [23]:
# Fit Random Forest Classification to the Training set
classifier = GaussianNB()
classifier.fit(X_train, y_train);

In [24]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
accuracy_score(y_test, y_pred)

0.3948942959712804

The Bayesian classifier is obviously worse than anything else so far. No need to go any further.

## II. Binary classification

We saw during our data visualization that the label 2 might be easier to seperate from others. Therefore, we will try to proceed to a binary classification on the label 2 on the one hand, and all other labels on the other hand.

In [198]:
#Dataset opening
dataset = pd.read_csv('../datasets/new_train.csv',index_col=0)
train_modif=dataset.copy()
train_modif.label.replace(to_replace={1:0, 3:0}, inplace=True)

train_modif.head(3)

Unnamed: 0_level_0,org,tld,ccs,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body,label,day,month,hour
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,coursera,org,0,multipart/alternative,23,188,0,1,38,136818,0,Thu,Mar,1.95
1,google,com,0,multipart/alternative,1,6,0,0,44,2467,0,Fri,Jan,5.333333
2,iiitd,ac.in,1,multipart/mixed,0,1,1,0,78,2809449,2,Mon,Aug,10.9


We still have to encode the string columns with one hot encoder !

In [199]:
from sklearn.preprocessing import OneHotEncoder
#making a copy of the former dataset
train_encoded=train_modif.copy()

#Removing anoying features like day and month

train_encoded.drop(['day','month'],axis=1,inplace=True)

columns=['org','tld','mail_type']
train_cat = train_encoded[columns]
columns+=['label']
train_num = train_encoded.drop(columns,axis=1)
y=train_encoded['label'].values

X_train_cat = train_cat.values

encoder=OneHotEncoder(handle_unknown='ignore',sparse=False)
X_encoded=encoder.fit_transform(X_train_cat)

X_fully_encoded=np.concatenate((X_encoded,train_num.values),axis=1)

In [201]:
# Split the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X_fully_encoded, y, test_size = 0.2, random_state = 0)

In [202]:
# Standardize features
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [203]:
# Fit Random Forest Classification to the Training set
classifier = GaussianNB()
classifier.fit(X_train, y_train);

In [207]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
accuracy_score(y_test, y_pred)
# print(y_test[:58],y_pred[:58])

0.9465496609493419

We just reached quite an amazing precision with a binary classifier ! Hopefully, we will have better performances with other models on the data that has not label 1 with our binary classifier (meaning the occurrences formerly labeled as 0, 1 or 3)

## III. Classification on the remaining data
### III. A. Without dimensionality reduction
#### III.A.a) KNN classification

In [209]:
#We must retrieve predictions on the whole training set
X = sc.transform(X_fully_encoded)
y_predicted = classifier.predict(X)

dataset['binary_label']=y_predicted
dataset.head(10)

Unnamed: 0_level_0,org,tld,ccs,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body,label,day,month,hour,binary_label
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,coursera,org,0,multipart/alternative,23,188,0,1,38,136818,0,Thu,Mar,1.95,0
1,google,com,0,multipart/alternative,1,6,0,0,44,2467,0,Fri,Jan,5.333333,0
2,iiitd,ac.in,1,multipart/mixed,0,1,1,0,78,2809449,2,Mon,Aug,10.9,2
3,na,na,0,multipart/alternative,4,43,0,0,61,13775,0,Fri,Mar,3.766667,2
4,linkedin,com,0,multipart/alternative,4,26,0,0,29,22601,1,Thu,Jun,3.85,0
5,iiitd,ac.in,0,multipart/alternative,0,28,1,0,37,15848,2,Mon,Oct,21.216667,2
6,flipkartletters,com,0,text/html,42,97,1,0,88,68612,3,Mon,Jan,2.466667,0
7,amazon,com,0,multipart/alternative,9,74,1,1,82,41354,0,Tue,Jan,3.333333,0
8,classroom,google.com,0,multipart/alternative,4,10,0,0,101,4294,0,Thu,Aug,11.75,0
9,iiitd,ac.in,3,multipart/alternative,0,0,1,0,38,2004,2,Tue,May,13.716667,2


Now, we should only keep the lines that have been labeled 0 by the binary classifier and only then, drop the column 'binary_label'

In [210]:
new_train = dataset[dataset['binary_label']==0].drop('binary_label',axis=1)
new_train.head(3)

Unnamed: 0_level_0,org,tld,ccs,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body,label,day,month,hour
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,coursera,org,0,multipart/alternative,23,188,0,1,38,136818,0,Thu,Mar,1.95
1,google,com,0,multipart/alternative,1,6,0,0,44,2467,0,Fri,Jan,5.333333
4,linkedin,com,0,multipart/alternative,4,26,0,0,29,22601,1,Thu,Jun,3.85


It is time for a new hot encoding of the categorical data.

In [211]:
encoder2=OneHotEncoder(handle_unknown='ignore',sparse=False)
# Removing anoying features like day and month
new_train2 = new_train.drop(['day','month'],axis=1)

X_cat = new_train2[['org','tld','mail_type']].values
X_num = new_train2.drop(['org','tld','mail_type','label'],axis=1).values

In [212]:
X_cat_encoded = encoder2.fit_transform(X_cat)
X=np.concatenate((X_cat_encoded, X_num),axis=1)

y = new_train['label'].values

Now we can try the knn classifier on this new data

In [213]:
from sklearn.neighbors import KNeighborsClassifier as knn

In [214]:
# Split the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [215]:
# Fit Random Forest Classification to the Training set
classifier = knn(n_neighbors=3)
classifier.fit(X_train, y_train);

In [216]:
# Predicting the Test set results
y_pred1 = classifier.predict(X_test)
accuracy_score(y_test, y_pred1)

0.658820417878868

#### III.A.b) XGBoost

In [91]:
from xgboost import XGBClassifier

# Fit XGBoost to the Training set
classifier = XGBClassifier()
classifier.fit(X_train, y_train);

In [92]:
import pickle as pk

with open('../classifiers/xgboost.txt','wb') as fichier:
    pickler=pk.Pickler(fichier)
    pickler.dump(classifier)

In [93]:
# Predict the Test set results
y_pred1 = classifier.predict(X_test)
accuracy_score(y_test, y_pred)

0.8664374504099445

#### III.A.c) Random Forest

In [115]:
from sklearn.ensemble import RandomForestClassifier

# Fit Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train);

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [116]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
accuracy_score(y_test, y_pred)

0.9243586352816715

### III.B. With dimensionality reduction
#### III.B.a) KNN classification

In [109]:
# Dimensionality reduction
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=3)
X_train2 = lda.fit_transform(X_train, y_train)
X_test2 = lda.transform(X_test);



In [96]:
# Fit Random Forest Classification to the Training set
classifier = knn(n_neighbors=3)
classifier.fit(X_train2, y_train);

In [97]:
# Predicting the Test set results
y_pred2 = classifier.predict(X_test2)
accuracy_score(y_test, y_pred2)

0.89896852684475

Let's try it out with the PCA algorithm

In [105]:
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
X_train3 = pca.fit_transform(X_train)
X_test3 = pca.transform(X_test)

In [106]:
classifier = knn(n_neighbors=3)
classifier.fit(X_train, y_train);

In [107]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
accuracy_score(y_test, y_pred)

0.918540068764877

#### III.B.b) XGBoost

With the lda dimensionality reduction

In [110]:
# Fit XGBoost to the Training set
classifier = XGBClassifier()
classifier.fit(X_train2, y_train);

In [112]:
# Predict the Test set results
y_pred = classifier.predict(X_test2)
accuracy_score(y_test, y_pred)

0.8976461253636604

With PCA

In [113]:
# Fit XGBoost to the Training set
classifier = XGBClassifier()
classifier.fit(X_train3, y_train);

In [112]:
# Predict the Test set results
y_pred = classifier.predict(X_test3)
accuracy_score(y_test, y_pred)

0.8976461253636604

#### III.B.c) Random Forest

With LDA

In [117]:
# Fit Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier.fit(X_train2, y_train);

In [118]:
# Predicting the Test set results
y_pred = classifier.predict(X_test2)
accuracy_score(y_test, y_pred)

0.9034646918804549

With PCA

In [120]:
# Fit Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier.fit(X_train3, y_train);

In [121]:
# Predicting the Test set results
y_pred = classifier.predict(X_test3)
accuracy_score(y_test, y_pred)

0.8534779158952658

## IV. Conclusion

It turns out that the best estimator is a naive bayes binary classifier combined with a random forest on the remaining data that was not classified as label 2. We must see how it goes with the test data set.

### IV. A. Binary classification first

In [178]:
# Load the dataset
df_train = pd.read_csv('../datasets/new_train.csv', index_col=0)
df_test = pd.read_csv('../datasets/new_test.csv',index_col=0)

train_modif = df_train.copy()
train_modif.label.replace(to_replace={1:0, 3:0, 2:1}, inplace=True)

In [179]:
train_encoded=train_modif.copy()
test_encoded=df_test.copy()

#Removing anoying features like day and month
train_encoded.drop(['day','month'],axis=1,inplace=True)
test_encoded.drop(['day','month'],axis=1,inplace=True)

#Seperating categorical values from others
train_cat = train_encoded[['org','tld','mail_type']]
train_num = train_encoded.drop(['org','tld','mail_type','label'],axis=1)
test_cat = test_encoded[['org','tld','mail_type']]
test_num = test_encoded.drop(['org','tld','mail_type'],axis=1)

#Saving the labels somewhere
y_train=train_encoded['label'].values

#Switching to arrays
X_train_cat = train_cat.values
X_test_cat = test_cat.values

encoder=OneHotEncoder(handle_unknown='ignore',sparse=False)
X_train_encoded = encoder.fit_transform(X_train_cat)
X_test_encoded = encoder.transform(X_test_cat)

X_train_fully_encoded=np.concatenate((X_train_encoded,train_num.values),axis=1)
X_test_fully_encoded=np.concatenate((X_test_encoded,test_num.values),axis=1)

In [180]:
# Standardize features
sc = StandardScaler()
X_train = sc.fit_transform(X_train_fully_encoded)
X_test = sc.fit_transform(X_test_fully_encoded)

In [181]:
# Fit Binary classification Classification to the Training set
classifier = GaussianNB()
classifier.fit(X_train, y_train);

In [182]:
# Predicting the train set results
y_pred_train = classifier.predict(X_train)
df_train['binary_label']=y_pred_train
#Predicting the test set
y_pred_test = classifier.predict(X_test)
df_test['binary_label']=y_pred_test

### IV. B. Final classification

Right after we split the dataset in two parts we must classify the remaining part of the dataset

In [183]:
new_train = df_train[df_train['binary_label']==0].drop('binary_label',axis=1)
new_test = df_test[df_test['binary_label']==0].drop('binary_label',axis=1)
df_test.head(3)

Unnamed: 0_level_0,org,tld,ccs,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body,day,month,hour,binary_label
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,iiitd,ac.in,4,multipart/alternative,1,15,1,0,50,9733,Sun,Aug,0.95,0
1,usief,org.in,0,multipart/alternative,0,45,0,0,50,127388,Tue,Oct,1.983333,0
2,entropay,com,0,multipart/alternative,1,6,1,0,40,4003,Tue,Nov,12.033333,0


It is time for a new hot encoding of the categorical data.

In [184]:
# Removing anoying features like day and month
new_train2 = new_train.drop(['day','month'],axis=1)
new_test2 = new_test.drop(['day','month'],axis=1)

X_train_cat = new_train2[['org','tld','mail_type']].values
X_train_num = new_train2.drop(['org','tld','mail_type','label'],axis=1).values

X_test_cat = new_test2[['org','tld','mail_type']].values
X_test_num = new_test2.drop(['org','tld','mail_type'],axis=1).values

In [185]:
encoder=OneHotEncoder(handle_unknown='ignore',sparse=False)
X_train_cat_encoded = encoder.fit_transform(X_train_cat)
X_train=np.concatenate((X_train_cat_encoded, X_train_num),axis=1)

X_test_cat_encoded = encoder.transform(X_test_cat)
X_test=np.concatenate((X_test_cat_encoded, X_test_num),axis=1)

y_train = new_train2['label'].values

In [186]:
# Fit Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train);

In [187]:
# Predict the Test set results
y_pred2 = classifier.predict(X_test)
#Upload every label on the data set
indice1,indice2=0,0
longueur1,longueur2=len(list(y_pred_test)),len(list(y_pred2))

res=longueur1*[-1]

while (indice1<longueur1) and (indice2<longueur2):
    if y_pred_test[indice1]==0:
        res[indice1]=y_pred2[indice2]
        indice2+=1
        indice1+=1
    else:
        res[indice1]=2
        indice1+=1

In [196]:
# Save results to submission file
y_pred = pd.DataFrame(res, columns=['label'])
y_pred.to_csv("../datasets/binary_classification_forest.csv", index=True, index_label='Id')

In [195]:
df_test['label']=y_pred
print(len([x for x in y_pred_test if x==0]))

10730
