In [1]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

### Data Preprocess & Feature Engineering

In [2]:
x_train = pd.read_csv("data/train.csv", header=None)
x_test = pd.read_csv("data/test.csv", header=None)
y_train = pd.read_csv("data/trainLabels.csv", header=None)
print(x_train.shape)
print(x_test.shape)
y_train

(1000, 40)
(9000, 40)


Unnamed: 0,0
0,1
1,0
2,0
3,1
4,0
...,...
995,0
996,1
997,1
998,0


In [3]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 40 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       1000 non-null   float64
 1   1       1000 non-null   float64
 2   2       1000 non-null   float64
 3   3       1000 non-null   float64
 4   4       1000 non-null   float64
 5   5       1000 non-null   float64
 6   6       1000 non-null   float64
 7   7       1000 non-null   float64
 8   8       1000 non-null   float64
 9   9       1000 non-null   float64
 10  10      1000 non-null   float64
 11  11      1000 non-null   float64
 12  12      1000 non-null   float64
 13  13      1000 non-null   float64
 14  14      1000 non-null   float64
 15  15      1000 non-null   float64
 16  16      1000 non-null   float64
 17  17      1000 non-null   float64
 18  18      1000 non-null   float64
 19  19      1000 non-null   float64
 20  20      1000 non-null   float64
 21  21      1000 non-null   float64
 22  2

### Logistic Regression

In [4]:
penalty = ["l1", "l2"]
C = [0.001, 0.01, 0.1, 1, 10]

param_grid = dict(penalty=penalty, C=C)
lr = LogisticRegression()
grid_search = GridSearchCV(lr, param_grid, scoring="accuracy", cv=5, n_jobs=-1, verbose=1)
grid_result = grid_search.fit(x_train, y_train)

print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Accuracy: 0.830000 using {'C': 0.01, 'penalty': 'l2'}


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\chen2\anaconda3\envs\MachineLearning\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\chen2\anaconda3\envs\MachineLearning\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\chen2\anaconda3\envs\MachineLearning\lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\chen2\anaconda3\envs\Mach

In [5]:
lr_best = LogisticRegression(penalty=grid_result.best_params_["penalty"],
                             C=grid_result.best_params_["C"])
lr_best.fit(x_train, y_train)
y_pred_lr = lr_best.predict(x_test)

submission = {'Id': np.arange(1, y_pred_lr.shape[0]+1),
             'Solution': y_pred_lr}
submission = pd.DataFrame(submission)
submission.to_csv('output/lr_submission.csv', index=False)

  y = column_or_1d(y, warn=True)


### Random Forest

In [6]:
n_estimators = [100, 200, 300, 400, 500]
max_depth = [10, 20, 30, 40]

param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)
rfc = RandomForestClassifier()
grid_search = GridSearchCV(rfc, param_grid, scoring="accuracy", cv=5, n_jobs=-1, verbose=1)
grid_result = grid_search.fit(x_train, y_train)

print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  return fit_method(estimator, *args, **kwargs)


Best Accuracy: 0.876000 using {'max_depth': 20, 'n_estimators': 200}


In [7]:
rfc_best = RandomForestClassifier(n_estimators=grid_result.best_params_["n_estimators"],
                                 max_depth=grid_result.best_params_["max_depth"])
rfc_best.fit(x_train, y_train)
y_pred_rfc = rfc_best.predict(x_test)

submission = {'Id': np.arange(1, y_pred_rfc.shape[0]+1),
             'Solution': y_pred_rfc}
submission = pd.DataFrame(submission)
submission.to_csv('output/rfc_submission.csv', index=False)

  return fit_method(estimator, *args, **kwargs)


### Gradient Boosting Machine

In [8]:
learning_rate = [0.001, 0.01, 0.1, 1, 10]
n_estimators = [100, 200, 300, 400, 500]

param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)
gbc = GradientBoostingClassifier()
grid_search = GridSearchCV(gbc, param_grid, scoring="accuracy", cv=5, n_jobs=-1, verbose=1)
grid_result = grid_search.fit(x_train, y_train)

print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 5 folds for each of 25 candidates, totalling 125 fits


  y = column_or_1d(y, warn=True)


Best Accuracy: 0.876000 using {'learning_rate': 0.1, 'n_estimators': 300}


In [9]:
gbc_best = GradientBoostingClassifier(learning_rate=grid_result.best_params_["learning_rate"],
                                     n_estimators=grid_result.best_params_["n_estimators"])
gbc_best.fit(x_train, y_train)
y_pred_gbc = gbc_best.predict(x_test)

submission = {'Id': np.arange(1, y_pred_gbc.shape[0]+1),
             'Solution': y_pred_gbc}
submission = pd.DataFrame(submission)
submission.to_csv('output/gbm_submission.csv', index=False)

  y = column_or_1d(y, warn=True)


### K-Nearest Neighbours

In [10]:
n_neighbors = list(range(1,51))
param_grid = dict(n_neighbors=n_neighbors)
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, scoring="accuracy", cv=5, n_jobs=-1, verbose=1)
grid_result = grid_search.fit(x_train.values, y_train)

print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Accuracy: 0.906000 using {'n_neighbors': 3}


  return self._fit(X, y)


In [11]:
knn_best = KNeighborsClassifier(n_neighbors=grid_result.best_params_["n_neighbors"])
knn_best.fit(x_train, y_train)
y_pred_knn = knn_best.predict(x_test.values)

submission = {'Id': np.arange(1, y_pred_knn.shape[0]+1),
             'Solution': y_pred_knn}
submission = pd.DataFrame(submission)
submission.to_csv('output/knn_submission.csv', index=False)

  return self._fit(X, y)
