![](https://i.imgur.com/jMbXRzf.png)


In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
train_data = pd.read_csv('./data/train_data.csv')
test_x = pd.read_csv('./data/test_features.csv')
train_x = train_data.drop('poi', axis = 1)
train_y = train_data['poi'].map(lambda x:1 if (x==True) else 0)

In [2]:
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)

(113, 21)
(113,)
(33, 21)


In [3]:
# 對 name, email_address 做特徵雜湊
name = test_x['name']

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
train_x['name'] = LabelEncoder().fit_transform(train_x['name'])
train_x['name'] = train_x['name'].map(lambda x:hash(x)%10)
test_x['name'] = LabelEncoder().fit_transform(test_x['name'])
test_x['name'] = test_x['name'].map(lambda x:hash(x)%10)

In [4]:
# drop loan_advance
deferred_income = train_x['deferred_income']
deferred_income_test = test_x['deferred_income']

train_x = train_x.drop(['loan_advances', 'email_address','deferred_income'],axis=1)
test_x = test_x.drop(['loan_advances', 'email_address','deferred_income'],axis=1)

for col in train_x.columns:
    median = train_x[col][train_x[col].notnull()].median()
    train_x[col] = train_x[col].fillna(median)
    train_x[col] = train_x[col].map(lambda x:np.log(x) if x > 0 else 0)
    median_test = test_x[col][test_x[col].notnull()].median()
    test_x[col] = test_x[col].fillna(median_test)
    test_x[col] = test_x[col].map(lambda x:np.log(x) if x > 0 else 0)
    
train_x['deferred_income'] = deferred_income
test_x['deferred_income'] = deferred_income_test

In [5]:
# deferred_income
median = train_x['deferred_income'][train_x['deferred_income'].notnull()].median()
train_x['deferred_income'] = train_x['deferred_income'].fillna(median)
train_x['deferred_income'] = train_x['deferred_income'].map(lambda x:-np.log(-x) if x < 0 else 0)

median = test_x['deferred_income'][test_x['deferred_income'].notnull()].median()
test_x['deferred_income'] = test_x['deferred_income'].fillna(median)
test_x['deferred_income'] = test_x['deferred_income'].map(lambda x:-np.log(-x) if x < 0 else 0)

# for c in train_x.columns:
#     train_x[c] = MinMaxScaler().fit_transform(train_x[c].values.reshape(-1,1))
#     test_x[c] = MinMaxScaler().fit_transform(test_x[c].values.reshape(-1,1))

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score

In [7]:
# Decision Tree
max_depth = [3,5,7,10,15,20]
clf_D = DecisionTreeClassifier()
param_grid = dict(max_depth=max_depth)
grid_search = GridSearchCV(clf_D, param_grid, n_jobs=-1, verbose=1)
grid_result = grid_search.fit(train_x, train_y)
print(f'best score:{grid_result.best_score_}\nbest params:{grid_result.best_params_}')

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


best score:0.8495575221238938
best params:{'max_depth': 3}


[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    2.5s finished


In [8]:
clf_D = grid_result.best_estimator_
clf_D.fit(train_x,train_y)
pred_y_D = clf_D.predict_proba(test_x)[:,1]
pred_y = pd.DataFrame({'name':name, 'poi':pred_y_D})
pred_y.to_csv('./data/pred_y_DecisionTree.csv', index=False)
print(f'cross_val_score:{cross_val_score(clf_D,train_x,train_y,cv=5).mean()}')

cross_val_score:0.8138339920948617


In [9]:
# RandomForest
max_depth = [3,5,7,10,15,20]
n_estimator = [10,50,100,300,500]
param_grid = dict(n_estimators=n_estimator,max_depth=max_depth)
clf_R = RandomForestClassifier()
grid_search = GridSearchCV(clf_R, param_grid, n_jobs=-1, verbose=1)
grid_result = grid_search.fit(train_x,train_y)
print(f'best score:{grid_result.best_score_}\nbest params:{grid_result.best_params_}')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 30 candidates, totalling 90 fits
best score:0.911504424778761
best params:{'max_depth': 10, 'n_estimators': 300}


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    2.5s finished


In [10]:
clf_R = grid_result.best_estimator_
clf_R.fit(train_x, train_y)
pred_y_R = clf_R.predict_proba(test_x)[:,1]
pred_y = pd.DataFrame({'name':name, 'poi':pred_y_R})
pred_y.to_csv('./data/pred_y_RandomForest.csv', index=False)
print(f'cross_val_score:{cross_val_score(clf_R,train_x,train_y,cv=5).mean()}')

cross_val_score:0.8936758893280633


In [11]:
# GradientBoosting
max_depth = [3,5,7,10,15,20]
n_estimator = [10,50,100,300,500]
param_grid = dict(n_estimators=n_estimator,max_depth=max_depth)
clf_G = GradientBoostingClassifier()
grid_search = GridSearchCV(clf_G, param_grid, n_jobs=-1, verbose=1)
grid_result = grid_search.fit(train_x,train_y)
print(f'best score:{grid_result.best_score_}\nbest params:{grid_result.best_params_}')

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


best score:0.8761061946902655
best params:{'max_depth': 3, 'n_estimators': 10}


[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    1.0s finished


In [12]:
clf_G = grid_result.best_estimator_
clf_G.fit(train_x, train_y)
pred_y_G = clf_G.predict_proba(test_x)[:,1]
pred_y = pd.DataFrame({'name':name, 'poi':pred_y_G})
pred_y.to_csv('./data/pred_y_GradientBoost.csv', index=False)
print(f'cross_val_score:{cross_val_score(clf_G,train_x,train_y,cv=5).mean()}')

cross_val_score:0.849802371541502


In [13]:
# blending
RF_weight = 0.74285 / (0.74285 + 0.76428)
GB_weight = 0.76428 / (0.74285 + 0.76428)
pred_y_bl = pred_y_R*RF_weight + pred_y_G*GB_weight
pred_y = pd.DataFrame({'name':name, 'poi':pred_y_bl})
pred_y.to_csv('./data/pred_y_Blending.csv', index=False)

In [14]:
# stacking
from mlxtend.classifier import StackingClassifier

meta_estimator = GradientBoostingClassifier(tol=100, subsample=0.70, n_estimators=50, 
                                           max_features='sqrt', max_depth=4, learning_rate=0.3)

stacking = StackingClassifier(classifiers=[clf_R,clf_G], meta_classifier=meta_estimator)
for clf in [clf_R, clf_G, stacking]:
    print(f'score:{cross_val_score(clf, train_x, train_y,cv=5).mean()}')

score:0.8849802371541502
score:0.849802371541502
score:0.8845849802371542


In [15]:
stacking.fit(train_x, train_y)
pred_y_stacking = stacking.predict_proba(test_x)[:,1]
pred_y = pd.DataFrame({'name':name, 'poi':pred_y_stacking})
pred_y.to_csv('./data/pred_y_stacking.csv', index=False)