In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from imblearn.under_sampling import RandomUnderSampler

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier

from scipy.stats import randint
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

from sklearn.decomposition import PCA

In [23]:
train = pd.read_csv('train.csv')

split = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
for train_index, test_index in split.split(train, train["target"].copy()):
    new_train = train.loc[train_index]
    test = train.loc[test_index]

In [24]:
new_train['target'].value_counts()

1.0    128
0.0     72
Name: target, dtype: int64

In [25]:
test['target'].value_counts()

1.0    32
0.0    18
Name: target, dtype: int64

In [26]:
new_train_label = new_train['target'].copy()
test_label = test['target'].copy()

In [27]:
def myRandomUnderSampler(X):
    y = X['target'].copy()
    X_resampled, y_resampled = RandomUnderSampler().fit_sample(X, y)
    
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
    for train_index, test_index in split.split(X_resampled, y_resampled):
        train_resampled, test_resampled = X_resampled[train_index], X_resampled[test_index]
    
    train_resampled = pd.DataFrame(data=train_resampled, index=train_resampled[:,0], columns=X.columns)
    train_resampled_label = train_resampled['target'].copy()
    test_resampled = pd.DataFrame(data=test_resampled, index=test_resampled[:,0], columns=X.columns)
    test_resampled_label = test_resampled['target'].copy()
    
    train_resampled = train_resampled.drop(['target', 'id'], axis=1)
    test_resampled = test_resampled.drop(['target', 'id'], axis=1)
    
    return train_resampled, train_resampled_label, test_resampled, test_resampled_label

def myScaler(dataframe):
    index = dataframe.index
    dataframe = pd.DataFrame(scale(dataframe)).set_index(index)
    return dataframe

def mySelector(X0, y0, X1, y1, X2, y2, X3, y3, X4, y4):
    param_grid = {'C' :[0.001, 0.01, 0.1, 1, 10, 100, 1000] }
    lg_clf = LogisticRegression(penalty='l1')
    grid_search = GridSearchCV(lg_clf, param_grid, cv=10, scoring='roc_auc',  n_jobs=-1)
    grid_search.fit(X0, y0)
    
    lg_clf_model = grid_search.best_estimator_
    coef = lg_clf_model.coef_[0]

    new_features0 = []

    for i in range(len(coef)):
        if coef[i] != 0:
            new_features0.append(i)
    X0 = X0[new_features0]
    
    grid_search.fit(X1, y1)
    lg_clf_model = grid_search.best_estimator_
    coef = lg_clf_model.coef_[0]
    new_features1 = []
    for i in range(len(coef)):
        if coef[i] != 0:
            new_features1.append(i)
    X1 = X1[new_features1]
    
    grid_search.fit(X2, y2)
    lg_clf_model = grid_search.best_estimator_
    coef = lg_clf_model.coef_[0]
    new_features2 = []
    for i in range(len(coef)):
        if coef[i] != 0:
            new_features2.append(i)
    X2 = X2[new_features2]
    
    grid_search.fit(X3, y3)
    lg_clf_model = grid_search.best_estimator_
    coef = lg_clf_model.coef_[0]
    new_features3 = []
    for i in range(len(coef)):
        if coef[i] != 0:
            new_features3.append(i)
    X3 = X3[new_features3]
    
    grid_search.fit(X4, y4)
    lg_clf_model = grid_search.best_estimator_
    coef = lg_clf_model.coef_[0]
    new_features4 = []
    for i in range(len(coef)):
        if coef[i] != 0:
            new_features4.append(i)
    X4 = X4[new_features4]
    
    common_features = list(set(new_features0) & set(new_features1) & set(new_features2) & set(new_features3) & set(new_features4))
    
    X0 = X0[common_features]
    X1 = X1[common_features]
    X2 = X2[common_features]
    X3 = X3[common_features]
    X4 = X4[common_features]
    
    
    return X0, X1, X2, X3, X4, common_features

In [28]:
train_resamples = []
train_label_resamples = []
test_resamples = []
test_label_resamples = []

for i in range(5):
    tmp = myRandomUnderSampler(new_train)
    train_resamples.append(tmp[0])
    train_label_resamples.append(tmp[1])
    test_resamples.append(tmp[2])
    test_label_resamples.append(tmp[3])
    
    train_resamples[i] = myScaler(train_resamples[i])
    test_resamples[i] = myScaler(test_resamples[i])
    
    globals()['train_resampled%s' %i] = train_resamples[i]
    globals()['train_resampled_label%s' %i] = train_label_resamples[i]
    globals()['test_resampled%s' %i] = test_resamples[i]
    globals()['test_resampled_label%s' %i] = test_label_resamples[i]

In [29]:
myselc = mySelector(train_resampled0, train_resampled_label0, 
                   train_resampled1, train_resampled_label1,
                   train_resampled2, train_resampled_label2,
                   train_resampled3, train_resampled_label3,
                   train_resampled4, train_resampled_label4)



In [30]:
myselc[5]

[24, 65, 117, 33]

In [31]:
new_train_index = new_train.index
new_train = new_train.drop(['id', 'target'], axis=1)
new_train = new_train.values[:,myselc[5]]
new_train = pd.DataFrame(data = new_train, index = new_train_index, columns = myselc[5])

test_index = test.index
test = test.drop(['id', 'target'], axis=1)
test = test.values[:,myselc[5]]
test = pd.DataFrame(data = test, index = test_index, columns = myselc[5])

# Model

In [34]:
param_grid = [
    {'max_depth' : [1,2,3,4,5,6,7,8,9,10],
    'n_estimators' : [100,200,300,400,500,600,700,800,900,1000]}    
]

rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(new_train, new_train_label)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [35]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print((mean_score), params)

0.7121634615384616 {'max_depth': 1, 'n_estimators': 100}
0.7562591575091574 {'max_depth': 1, 'n_estimators': 200}
0.7354395604395605 {'max_depth': 1, 'n_estimators': 300}
0.7466300366300367 {'max_depth': 1, 'n_estimators': 400}
0.7408676739926741 {'max_depth': 1, 'n_estimators': 500}
0.747614468864469 {'max_depth': 1, 'n_estimators': 600}
0.7456272893772895 {'max_depth': 1, 'n_estimators': 700}
0.7411423992673991 {'max_depth': 1, 'n_estimators': 800}
0.7444642857142858 {'max_depth': 1, 'n_estimators': 900}
0.7399793956043957 {'max_depth': 1, 'n_estimators': 1000}
0.7521634615384616 {'max_depth': 2, 'n_estimators': 100}
0.7648534798534798 {'max_depth': 2, 'n_estimators': 200}
0.7565407509157508 {'max_depth': 2, 'n_estimators': 300}
0.7635439560439561 {'max_depth': 2, 'n_estimators': 400}
0.7646428571428571 {'max_depth': 2, 'n_estimators': 500}
0.7651213369963369 {'max_depth': 2, 'n_estimators': 600}
0.7627129120879121 {'max_depth': 2, 'n_estimators': 700}
0.7637225274725274 {'max_depth'

In [36]:
grid_search.best_params_

{'max_depth': 2, 'n_estimators': 900}

In [38]:
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(test)

print("Result")
print()
print(confusion_matrix(test_label, y_pred))
print('accuracy:', accuracy_score(test_label, y_pred))
print('precision:', precision_score(test_label, y_pred))
print('recall:', recall_score(test_label, y_pred))

Result

[[ 4 14]
 [ 0 32]]
accuracy: 0.72
precision: 0.6956521739130435
recall: 1.0


In [43]:
roc_auc_score(test_label, y_pred)

0.6111111111111112

# Submission

In [41]:
testset = pd.read_csv('test.csv')

testset_index = testset['id']
testset = testset.drop('id', axis=1)

testset = pd.DataFrame(scale(testset))

testset = testset.set_index(testset_index)
del testset.index.name


testset = testset.values[:,myselc[5]]
testset = pd.DataFrame(data = testset, index = testset_index, columns = myselc[5])

d={'id': testset_index, 'target': best_rf.predict_proba(testset)[:, [1]].flatten()}

submission = pd.DataFrame(data=d)
submission.to_csv('submission/underSampling_lasso_rf2.csv', encoding='utf-8', index=False)

In [42]:
tmp = pd.read_csv('submission/underSampling_lasso_rf2.csv')
tmp

Unnamed: 0,id,target
0,250,0.696096
1,251,0.790772
2,252,0.546409
3,253,0.841171
4,254,0.667779
5,255,0.439447
6,256,0.538930
7,257,0.616853
8,258,0.828613
9,259,0.487411


#### Score : 0.718