In [84]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [87]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings scikit-learn emits while fitting
# (e.g. about label shape or convergence); consider narrowing the filter.
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics, linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [86]:
# Build the paths of the Kaggle training files from the shared data directory.
data_path = 'data/'
train_features_csv = data_path + 'kaggle/train.csv'
train_labels_csv = data_path + 'kaggle/trainLabels.csv'

# header=None: the first row is read as data and columns get integer labels.
df_X = pd.read_csv(train_features_csv, header=None)
df_Y = pd.read_csv(train_labels_csv, header=None)

# Report (rows, columns) for the features, then for the labels.
for frame in (df_X, df_Y):
    print(frame.shape)

(1000, 40)
(1000, 1)


In [88]:
# Rescale every feature column with a default MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows of df_X, i.e. before the
# train/test split, so held-out rows influence the scaling parameters.
scaler = MinMaxScaler()
scaler.fit(df_X)
train_X = scaler.transform(df_X)

In [90]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# The labels are passed as a 1-D Series (the first column of df_Y) because
# scikit-learn classifiers expect y of shape (n_samples,). Passing the (n, 1)
# DataFrame makes every later fit() raise a DataConversionWarning.
labels = df_Y.iloc[:, 0]
x_train, x_test, y_train, y_test = train_test_split(
    train_X, labels, test_size=0.2, random_state=3
)

In [91]:
# Baseline: logistic regression with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and training are chained.
clf = linear_model.LogisticRegression().fit(x_train, y_train)

# Score the held-out split by plain accuracy.
y_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  0.76


In [92]:
# Single decision tree with default hyper-parameters.
# random_state is pinned because the tree permutes candidate features at each
# split; without a seed the reported accuracy can change between runs.
clf = DecisionTreeClassifier(random_state=3)
clf.fit(x_train, y_train)

# Score the held-out split by plain accuracy.
y_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  0.7


In [93]:
# Random forest of 100 shallow trees (depth capped at 4).
# random_state is pinned because bootstrap sampling and per-split feature
# sampling are random; without a seed the accuracy varies from run to run.
clf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=3)
clf.fit(x_train, y_train)

# Score the held-out split by plain accuracy.
y_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  0.765


In [94]:
# Gradient boosting with default hyper-parameters.
# random_state is pinned so the fitted trees (and therefore the accuracy and
# the feature importances read from `clf` afterwards) are reproducible.
clf = GradientBoostingClassifier(random_state=3)
clf.fit(x_train, y_train)

# Score the held-out split by plain accuracy.
y_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  0.83


In [95]:
# Per-feature importance scores of the model currently bound to `clf`.
default_importances = clf.feature_importances_
print("Feature importance: ", default_importances)

Feature importance:  [1.65479498e-04 7.18462760e-03 5.84844251e-03 3.59442403e-03
 3.51619257e-02 1.10645168e-03 2.98707526e-02 2.67553774e-02
 3.98892309e-05 2.54760084e-03 4.16536003e-04 5.24529474e-03
 1.96770469e-01 2.12123367e-03 2.03022433e-01 0.00000000e+00
 1.94362463e-03 1.19973773e-03 8.15486244e-02 2.22248457e-03
 3.52721786e-03 2.26477289e-03 6.08490051e-03 1.98399190e-02
 1.63281269e-03 2.24060477e-03 2.88155733e-03 3.26841976e-04
 2.52199636e-02 6.66898060e-02 2.10437672e-03 6.41257841e-04
 5.38330382e-02 1.30130358e-03 3.27802235e-02 6.01250172e-04
 7.58961906e-02 3.15565478e-03 6.47379995e-03 8.57390996e-02]


In [96]:
# Hyper-parameter grid to search over.
n_estimators = [100, 200, 300]
max_depth = [1, 3, 5]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

# Build the search object from a fresh, seeded estimator instead of whatever
# `clf` happens to hold, so this cell does not depend on which model cell ran
# last. n_jobs=-1 runs the fits in parallel on every CPU core. cv is spelled
# out because GridSearchCV's default fold count differs between scikit-learn
# versions (3 before 0.22, 5 since).
grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=3),
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search.
grid_result = grid_search.fit(x_train, y_train)

# 5-fold cross-validation x 9 parameter combinations = 45 model fits
# (plus one final refit of the best combination on all of x_train).

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   12.0s finished


In [97]:
# Report the best mean cross-validated accuracy and the parameters behind it.
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print("Best Accuracy: %f using %s" % (best_score, best_params))

Best Accuracy: 0.877500 using {'max_depth': 5, 'n_estimators': 300}


In [98]:
# Display the winning parameter combination from the grid search.
grid_result.best_params_

{'max_depth': 5, 'n_estimators': 300}

In [99]:
# Rebuild the model with the best parameters found by the grid search.
# best_params_ is unpacked directly, so any parameter added to the grid is
# picked up here without editing this cell. random_state is pinned so the
# accuracy and feature importances below are reproducible.
clf_bestparam = GradientBoostingClassifier(random_state=3, **grid_result.best_params_)

# Train the model.
clf_bestparam.fit(x_train, y_train)

# Predict the held-out split and score it by plain accuracy.
y_pred = clf_bestparam.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  0.85


In [100]:
# Per-feature importance scores of the tuned gradient-boosting model.
tuned_importances = clf_bestparam.feature_importances_
print("Feature importance: ", tuned_importances)

Feature importance:  [0.00086101 0.00566878 0.00584024 0.00226461 0.05027552 0.00117055
 0.04458526 0.03203769 0.00586582 0.00050154 0.0042355  0.00710974
 0.18494549 0.00546941 0.16733433 0.0048617  0.00118039 0.00216623
 0.06890654 0.00244738 0.00303303 0.00378544 0.02144369 0.02040132
 0.00886217 0.00227246 0.00215015 0.00244416 0.02412176 0.06571474
 0.00392207 0.00128536 0.06754413 0.00232717 0.03213986 0.00213068
 0.06293017 0.00277395 0.0140043  0.05898566]


In [111]:
# Pair each feature's column position ('n') with its importance in the tuned
# model. The position range is taken from the training matrix instead of a
# hard-coded 40, so the cell follows the data. If clf_bestparam was fitted on a
# different number of columns, the length mismatch still raises here.
feature_select = pd.DataFrame({
    'n': range(x_train.shape[1]),
    'importance': clf_bestparam.feature_importances_,
})

# Show the features ranked from most to least important.
feature_select.sort_values(by='importance', ascending=False).reset_index(drop=True)

Unnamed: 0,n,importance
0,12,0.184945
1,14,0.167334
2,18,0.068907
3,32,0.067544
4,29,0.065715
5,36,0.06293
6,39,0.058986
7,4,0.050276
8,6,0.044585
9,34,0.03214


In [102]:
# Column positions of the 15 most important features, best first.
ranked_features = feature_select.sort_values(by='importance', ascending=False)
ranked_features['n'].head(15).tolist()

[12, 14, 18, 32, 29, 36, 39, 4, 6, 34, 7, 28, 22, 23, 38]

In [116]:
# Wrap both splits in DataFrames so columns can be selected by label with .loc.
x_train, x_test = pd.DataFrame(x_train), pd.DataFrame(x_test)

In [117]:
# Preview the training rows restricted to the 15 most important features.
top15_columns = list(feature_select.sort_values(by='importance', ascending=False).head(15).n)
x_train.loc[:, top15_columns]

Unnamed: 0,12,14,18,32,29,36,39,4,6,34,7,28,22,23,38
0,0.525029,0.416239,0.438073,0.597192,0.461479,0.611539,0.367315,0.449262,0.501852,0.552116,0.651291,0.398383,0.207878,0.439660,0.579843
1,0.622779,0.637566,0.778018,0.402877,0.493787,0.453617,0.772609,0.402819,0.732003,0.671263,0.727535,0.731926,0.521032,0.467842,0.394208
2,0.467955,0.275274,0.716980,0.667514,0.293919,0.680291,0.367072,0.567225,0.562425,0.340789,0.597609,0.429346,0.288164,0.558187,0.628669
3,0.468074,0.713115,0.530756,0.192826,0.383598,0.281435,0.271813,0.834908,0.507783,0.591903,0.233444,0.345726,0.855855,0.980768,0.366234
4,0.634029,0.731452,0.700371,0.454994,0.542052,0.507236,0.607679,0.422131,0.642625,0.646505,0.380671,0.702546,0.596023,0.651870,0.309606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.287402,0.302205,0.375960,0.676879,0.484784,0.289470,0.707572,0.261336,0.607160,0.402472,0.486995,0.478537,0.454404,0.219819,0.492605
796,0.499497,0.347351,0.675696,0.241406,0.598253,0.705361,0.698995,0.333039,0.793403,0.547409,0.557903,0.607365,0.208320,0.488037,0.448770
797,0.586226,0.684861,0.610791,0.372219,0.463622,0.548865,0.682585,0.479372,0.614495,0.623503,0.600521,0.555866,0.380741,0.489161,0.447643
798,0.687049,0.579489,0.619293,0.440572,0.401906,0.482336,0.595779,0.386592,0.689207,0.843836,0.555752,0.755211,0.539558,0.579198,0.520670


In [118]:
# Retrain with the best grid-search parameters on the 15 most important
# features only. The column list is computed once and reused for fitting and
# prediction, instead of re-sorting feature_select for every call.
top_features = (
    feature_select
    .sort_values(by='importance', ascending=False)
    .head(15)['n']
    .tolist()
)

# NOTE: this rebinds clf_bestparam to the 15-feature model. feature_select was
# built from the earlier full-width model, so do not rebuild feature_select
# after running this cell. random_state is pinned for reproducible results.
clf_bestparam = GradientBoostingClassifier(random_state=3, **grid_result.best_params_)

# Train the model on the selected columns.
clf_bestparam.fit(x_train.loc[:, top_features], y_train)

# Predict the held-out split (same columns) and score it by plain accuracy.
y_pred = clf_bestparam.predict(x_test.loc[:, top_features])
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

Accuracy:  0.885


In [119]:
# Read the Kaggle test features; header=None means the first row is data and
# columns get integer labels.
test_features_csv = data_path + 'kaggle/test.csv'
df_test = pd.read_csv(test_features_csv, header=None)

# Report (rows, columns).
print(df_test.shape)

(9000, 40)


In [120]:
# Scale the test features with the scaler that was already fitted on the
# training data, so they go through exactly the transformation the model was
# trained on. Fitting a fresh MinMaxScaler on the test set would use the test
# set's own per-column min/max and shift every feature relative to training.
# NOTE: run once per load of df_test; re-running rescales already-scaled values.
df_test = pd.DataFrame(scaler.transform(df_test))

In [121]:
# Predict the test set using the same 15 top-ranked feature columns.
submission_columns = list(feature_select.sort_values(by='importance', ascending=False).head(15).n)
y_pred = clf_bestparam.predict(df_test.loc[:, submission_columns])

In [122]:
# Sanity-check that there is one prediction per test row.
# (The bare `y_pred` statement was removed: it displayed nothing because it was
# not the cell's last expression.)
print(y_pred.shape)

(9000,)


In [123]:
# Assemble the submission table: 1-based row ids alongside the predicted label.
# The id range is derived from the number of predictions instead of a
# hard-coded 9000, so it always matches y_pred.
final = pd.DataFrame({'Id': range(1, len(y_pred) + 1), 'Solution': y_pred})

In [124]:
# Display the submission table (columns: Id, Solution).
final

Unnamed: 0,Id,Solution
0,1,1
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
8995,8996,1
8996,8997,1
8997,8998,1
8998,8999,0
