# Test of XGBoost Model

In [1]:
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

In [2]:
# Load the dataset
# The first CSV column is used as the row index. The 'label' column is the
# target; every remaining column is treated as a feature.
dataset = pd.read_csv('datasets/new_train.csv', index_col=0)
y = dataset['label'].values
X = dataset.drop(columns=['label']).values

In [None]:
# Standardize features
# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train/test split performed in the next cell, so its statistics also include
# rows that later end up in the test set.
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)

In [31]:
# Split the dataset into the Training set (80%) and Test set (20%).
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit the scaler on the training rows only and apply that same transform to the
# test rows, so no test-set statistics leak into the features used for training.
# Standardization is an affine per-column transform, so the result is the same
# whether or not X was already standardized as a whole beforehand.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Run the following cell to apply LDA on the dataset

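Optional: skip this cell to keep all features. This cell and the PCA cell below both overwrite `X_train` and `X_test` in place, so run at most one of them.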

In [35]:
# Apply LDA
# LDA yields at most min(n_features, n_classes - 1) components. Cap the
# requested 3 components at that limit, because recent scikit-learn versions
# raise a ValueError when n_components is larger.
n_lda_components = min(3, X_train.shape[1], len(set(y_train)) - 1)
lda = LDA(n_components=n_lda_components)
# Fit on the training split only, then project both splits with the fitted model.
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

[[ 1.13600633 -0.43873608]
 [-0.61014017  0.03926328]
 [ 1.98930485 -1.24496062]
 ...
 [-0.20070735 -0.35223934]
 [-0.62181632  1.33441627]
 [-0.90426524 -2.14384704]]




# Run the following cell to apply PCA on the dataset

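Optional: skip this cell to keep all features. This cell and the LDA cell above both overwrite `X_train` and `X_test` in place, so run at most one of them.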

In [63]:
# Apply PCA
# Learn a 10-component projection from the training split and reuse the fitted
# projection on the test split. Both arrays are overwritten in place.
pca = PCA(n_components=10)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
# Per-component share of the variance, kept for inspection (not displayed here).
explained_variance = pca.explained_variance_ratio_

# Prediction

In [32]:
# Fit XGBoost to the Training set
# These hyperparameters match the best combination printed by the grid search
# section of this notebook.
classifier = XGBClassifier(
    max_depth=5,
    min_child_weight=1,
    gamma=1,
    subsample=0.6,
    colsample_bytree=0.8,
)
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=1,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.6, verbosity=1)

In [33]:
# Predict the Test set results
y_pred = classifier.predict(X_test)
# Last expression of the cell, so the accuracy on the held-out split is displayed.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.919425608296769

The model seems to be more accurate without applying any dimensionality reduction...

In [34]:
# Make the Confusion Matrix
# Rows correspond to the true labels, columns to the predicted labels.
cm = confusion_matrix(y_test, y_pred)
# Bare last expression so the matrix is actually displayed as the cell output.
cm

In [35]:
# Apply k-Fold Cross Validation
# 10-fold cross-validation of the classifier on the training split.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
print(accuracies)
# Only the last expression of a cell is displayed, so the summary statistics
# are printed explicitly instead of being evaluated and discarded.
print("Mean accuracy:", accuracies.mean())
print("Std of accuracy:", accuracies.std())

[0.89930209 0.92023928 0.93117207 0.9201995  0.91122195 0.919202
 0.92269327 0.92418953 0.92169576 0.92069825]


# Apply Grid Search to XGBoost

In [23]:
# Load the dataset
# The whole training file is used here (no hold-out split); the grid search
# below evaluates candidates by cross-validation instead.
dataset = pd.read_csv('datasets/new_train.csv', index_col=0)
y_train = dataset['label'].values
X_train = dataset.drop(columns=['label']).values

In [24]:
# Standardize features
# NOTE(review): the scaler is fitted on all training rows before the
# cross-validation folds are formed by the grid search below, so each fold's
# validation rows contribute to the scaling statistics.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)

In [25]:
# Fit XGBoost to the Training set
# Default hyperparameters; this estimator is the one handed to the grid search below.
classifier = XGBClassifier()
classifier.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [27]:
# Apply Grid Search to find the best model and the best parameters
# Only gamma takes several values in this grid; every other hyperparameter is
# pinned to a single value.
parameters = {
    'min_child_weight': [1],
    'gamma': [0.5, 1, 1.5],
    'subsample': [0.6],
    'colsample_bytree': [0.8],
    'max_depth': [5],
}
# 10-fold CV scored by accuracy, using all available cores.
grid_search = GridSearchCV(classifier, parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(grid_search.best_score_)
print(grid_search.best_params_)

0.9218463322648699
{'colsample_bytree': 0.8, 'gamma': 1, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.6}


0.9044121312678849
{'colsample_bytree': 0.8, 'gamma': 1.5, 'max_depth': 4, 'min_child_weight': 1, 'subsample': 0.6}

0.9218463322648699
{'colsample_bytree': 0.8, 'gamma': 1, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.6}

# Prediction on Test set

In [46]:
# Load the dataset
# Both files use their first column as the row index.
dataset_train = pd.read_csv('datasets/new_train.csv', index_col=0)
dataset_test = pd.read_csv('datasets/new_test.csv', index_col=0)
# Training data: 'label' is the target, every other column is a feature.
y_train = dataset_train['label'].values
X_train = dataset_train.drop(columns=['label']).values
# Test data: all columns are used as features.
X_test = dataset_test.values

In [47]:
# Standardize features
# Fit the scaler on the training data only, then apply the same fitted
# transform to the test data. Re-fitting on the test set would scale the two
# sets with different statistics, so the model would see test features on a
# different scale from the one it was trained on.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [48]:
# Fit XGBoost to the Training set
# Final model, trained on the full training file with the hyperparameters that
# the grid search section printed as best.
classifier = XGBClassifier(
    max_depth=5,
    min_child_weight=1,
    gamma=1,
    subsample=0.6,
    colsample_bytree=0.8,
)
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=1,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.6, verbosity=1)

In [49]:
# Predict the Test set results
# No true labels are loaded for this test file, so the predictions are only
# saved to a submission file (next cell), not scored here.
y_pred = classifier.predict(X_test)

In [50]:
# Save results to submission file
# A separate name is used for the DataFrame so that y_pred keeps holding the raw
# prediction array and this cell can be re-run without changing its input.
submission = pd.DataFrame(y_pred, columns=['label'])
# NOTE(review): 'Id' is the positional 0..n-1 index of the predictions, not the
# index read from new_test.csv; confirm this is the expected submission format.
submission.to_csv("datasets/xgboost_submission.csv", index=True, index_label='Id')