In [4]:
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder

# Data processing

In [5]:
# Read the training CSV (its first column is used as the index), then
# restrict the frame to the three feature columns plus the target column.
selected_columns = ['org', 'tld', 'mail_type', 'label']
df_train = pd.read_csv('../datasets/new_train.csv', index_col=0)
df_train = df_train[selected_columns]

We use a one-hot encoder to transform the categorical features into numeric indicator columns.

In [None]:
# One-hot encode the categorical feature columns into a dense matrix.
#
# The dense array is obtained with `.toarray()` on the encoder's default
# sparse output instead of the constructor flag `sparse=False`: that flag was
# renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4, so it
# raises a TypeError on current releases (and `sparse_output` raises on older
# ones). Converting explicitly works with every version.
encoder = OneHotEncoder(handle_unknown='ignore')

# Features: every column except the target; target: the 'label' column.
X = df_train.drop('label', axis=1).values
y = df_train['label'].values

# NOTE(review): the encoder is fitted on all rows, i.e. before the
# train/test split performed further down.
X_encoded = encoder.fit_transform(X).toarray()

Split the dataset into a training set and a held-out test set, so that overfitting can be detected

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
split_parts = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

Then reduce the dimensionality of the encoded features with LDA (Linear Discriminant Analysis)

In [10]:
# Learn the LDA projection from the training split only, then map both
# splits onto the 3 discriminant components.
lda = LDA(n_components=3)
lda.fit(X_train, y_train)
X_train = lda.transform(X_train)
X_test = lda.transform(X_test)



# Prediction

In [15]:
# Train XGBoost on the LDA-projected training split, using the
# hyper-parameters reported by the earlier grid search.
xgb_params = {
    'colsample_bytree': 0.8,
    'gamma': 0.5,
    'max_depth': 4,
    'min_child_weight': 1,
    'subsample': 0.8,
}
classifier = XGBClassifier(**xgb_params)
# Trailing semicolon suppresses the estimator repr in the cell output.
classifier.fit(X_train, y_train);

In [16]:
# Predict labels for the held-out split and display the fraction that
# matches the true labels.
y_pred = classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9040686078978859

In [17]:
# Confusion matrix of the held-out predictions
# (rows: true labels, columns: predicted labels).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[1837,   32,   39,  182],
       [  11,  690,    0,    0],
       [  25,    0,  991,    0],
       [ 182,    9,    1, 1015]], dtype=int64)

In [18]:
# Apply 10-fold cross-validation of the classifier on the training split.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)

# A notebook cell only displays its LAST expression, so bare
# `accuracies.mean()` / `accuracies.std()` lines are evaluated and silently
# discarded. Print the summary explicitly so it actually appears.
print("CV accuracy: mean={:.4f}, std={:.4f}".format(accuracies.mean(), accuracies.std()))

# Per-fold scores, shown as the cell's rich output.
accuracies

array([0.89730808, 0.90877368, 0.91425723, 0.91226321, 0.8998006 ,
       0.90827517, 0.90024938, 0.91521197, 0.90469062, 0.90809191])

According to a previous grid search, the best parameters were the following:
{'colsample_bytree': 0.8, 'gamma': 0.5, 'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.8}

However, without running a new grid search, nothing tells us that they are still the best. Anyway, it is still worth a shot...

# Prediction on the real test set

In [25]:
# Rebuild the training matrices from the complete labelled frame
# (no hold-out this time): features are every column except 'label'.
dataset_train = df_train.drop(columns='label')
X_train = dataset_train.values
y_train = df_train['label'].values

# Read the test CSV (first column as index) and keep the same three
# feature columns, in the same order as the training features.
dataset_test = pd.read_csv('../datasets/new_test.csv', index_col=0)
test_features = ['org', 'tld', 'mail_type']
X_test = dataset_test[test_features].values

In [26]:
# Encode categorical features as dense one-hot matrices.
#
# `.toarray()` replaces the constructor flag `sparse=False`, which was
# renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4 (it raises
# a TypeError on current releases). Densifying explicitly works on every
# version; a dense array is required by the scaler applied afterwards.
encoder = OneHotEncoder(handle_unknown='ignore')
# Categories are learned from the training rows only ...
X_train = encoder.fit_transform(X_train).toarray()
# ... and categories unseen during fit are encoded as all-zero columns
# (handle_unknown='ignore') instead of raising.
X_test = encoder.transform(X_test).toarray()

In [27]:
# Standardize features: the scaler's statistics are estimated on the
# training matrix only and then applied unchanged to both matrices.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [28]:
# Train an XGBoost classifier on the full, standardized training set.
# NOTE(review): this model is built with the library's default
# hyper-parameters, not the tuned values passed to the classifier in the
# validation section above -- confirm this is intended.
classifier = XGBClassifier()
# No trailing semicolon: the fitted estimator's repr is the cell output.
classifier.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [29]:
# Predict the Test set results
# X_test here is the encoded and standardized feature matrix built from
# new_test.csv in the preceding cells (assuming the cells ran in order);
# no true labels are loaded for it in this notebook, so nothing is scored.
y_pred = classifier.predict(X_test)

In [30]:
# Save results to submission file: one 'label' column plus the row index,
# written out under the header 'Id'.
# NOTE(review): the index is the default 0..n-1 range created here, not the
# index read from the test CSV -- confirm that matches the expected ids.
submission_path = "../datasets/xgboost_enhanced.csv"
y_pred = pd.DataFrame(data=y_pred, columns=['label'])
y_pred.to_csv(submission_path, index=True, index_label='Id')