In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from imblearn.under_sampling import RandomUnderSampler

from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier

from scipy.stats import randint
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

from sklearn.decomposition import PCA

In [4]:
train = pd.read_csv('train.csv')

split = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
for train_index, test_index in split.split(train, train["target"].copy()):
    new_train = train.loc[train_index]
    test = train.loc[test_index]

In [6]:
new_train['target'].value_counts()

1.0    128
0.0     72
Name: target, dtype: int64

In [7]:
test['target'].value_counts()

1.0    32
0.0    18
Name: target, dtype: int64

In [None]:
new_train_label = new_train['target'].copy()
new_train = new_train.drop(['target', 'id'], axis=1)

test_label = test['target'].copy()
test = test.drop(['target', 'id'], axis=1)

In [8]:
def myRandomUnderSampler(X):
    y = X['target'].copy()
    X_resampled, y_resampled = RandomUnderSampler().fit_sample(X, y)
    
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
    for train_index, test_index in split.split(X_resampled, y_resampled):
        train_resampled, test_resampled = X_resampled[train_index], X_resampled[test_index]
    
    train_resampled = pd.DataFrame(data=train_resampled, index=train_resampled[:,0], columns=X.columns)
    train_resampled_label = train_resampled['target'].copy()
    test_resampled = pd.DataFrame(data=test_resampled, index=test_resampled[:,0], columns=X.columns)
    test_resampled_label = test_resampled['target'].copy()
    
    train_resampled = train_resampled.drop(['target', 'id'], axis=1)
    test_resampled = test_resampled.drop(['target', 'id'], axis=1)
    
    return train_resampled, train_resampled_label, test_resampled, test_resampled_label

def myScaler(dataframe):
    index = dataframe.index
    dataframe = pd.DataFrame(scale(dataframe)).set_index(index)
    return dataframe

def mySelector(X0, y0, X1, y1, X2, y2, X3, y3, X4, y4):
    param_grid = {'C' :[0.001, 0.01, 0.1, 1, 10, 100, 1000] }
    lg_clf = LogisticRegression(penalty='l1')
    grid_search = GridSearchCV(lg_clf, param_grid, cv=10, scoring='roc_auc',  n_jobs=-1)
    grid_search.fit(X0, y0)
    
    lg_clf_model = grid_search.best_estimator_
    coef = lg_clf_model.coef_[0]

    new_features0 = []

    for i in range(len(coef)):
        if coef[i] != 0:
            new_features0.append(i)
    X0 = X0[new_features0]
    
    grid_search.fit(X1, y1)
    lg_clf_model = grid_search.best_estimator_
    coef = lg_clf_model.coef_[0]
    new_features1 = []
    for i in range(len(coef)):
        if coef[i] != 0:
            new_features1.append(i)
    X1 = X1[new_features1]
    
    grid_search.fit(X2, y2)
    lg_clf_model = grid_search.best_estimator_
    coef = lg_clf_model.coef_[0]
    new_features2 = []
    for i in range(len(coef)):
        if coef[i] != 0:
            new_features2.append(i)
    X2 = X2[new_features2]
    
    grid_search.fit(X3, y3)
    lg_clf_model = grid_search.best_estimator_
    coef = lg_clf_model.coef_[0]
    new_features3 = []
    for i in range(len(coef)):
        if coef[i] != 0:
            new_features3.append(i)
    X3 = X3[new_features3]
    
    grid_search.fit(X4, y4)
    lg_clf_model = grid_search.best_estimator_
    coef = lg_clf_model.coef_[0]
    new_features4 = []
    for i in range(len(coef)):
        if coef[i] != 0:
            new_features4.append(i)
    X4 = X4[new_features4]
    
    common_features = list(set(new_features0) & set(new_features1) & set(new_features2) & set(new_features3) & set(new_features4))
    
    X0 = X0[common_features]
    X1 = X1[common_features]
    X2 = X2[common_features]
    X3 = X3[common_features]
    X4 = X4[common_features]
    
    
    return X0, X1, X2, X3, X4, common_features

In [9]:
train_resamples = []
train_label_resamples = []
test_resamples = []
test_label_resamples = []

for i in range(5):
    tmp = myRandomUnderSampler(new_train)
    train_resamples.append(tmp[0])
    train_label_resamples.append(tmp[1])
    test_resamples.append(tmp[2])
    test_label_resamples.append(tmp[3])
    
    train_resamples[i] = myScaler(train_resamples[i])
    test_resamples[i] = myScaler(test_resamples[i])
    
    globals()['train_resampled%s' %i] = train_resamples[i]
    globals()['train_resampled_label%s' %i] = train_label_resamples[i]
    globals()['test_resampled%s' %i] = test_resamples[i]
    globals()['test_resampled_label%s' %i] = test_label_resamples[i]

In [10]:
myselc = mySelector(train_resampled0, train_resampled_label0, 
                   train_resampled1, train_resampled_label1,
                   train_resampled2, train_resampled_label2,
                   train_resampled3, train_resampled_label3,
                   train_resampled4, train_resampled_label4)



In [11]:
train_resampled0 = myselc[0]
train_resampled1 = myselc[1]
train_resampled2 = myselc[2]
train_resampled3 = myselc[3]
train_resampled4 = myselc[4]


test_resampled0 = test_resampled0[myselc[5]]
test_resampled1 = test_resampled1[myselc[5]]
test_resampled2 = test_resampled2[myselc[5]]
test_resampled3 = test_resampled3[myselc[5]]
test_resampled4 = test_resampled4[myselc[5]]


# Model

In [12]:
param_grid = [
    {'max_depth' : [1,2,3,4,5,6,7,8,9,10],
    'n_estimators' : [100,200,300,400,500,600,700,800,900,1000]}    
]
### 0
rf0 = RandomForestClassifier()
grid_search = GridSearchCV(rf0, param_grid, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(train_resampled0, train_resampled_label0)
best_rf0 = grid_search.best_estimator_




In [14]:
### 1
rf1 = RandomForestClassifier()
grid_search = GridSearchCV(rf1, param_grid, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(train_resampled1, train_resampled_label1)
best_rf1 = grid_search.best_estimator_




In [16]:
voting_clf = VotingClassifier(
    estimators=[('0', best_rf0), ('1', best_rf1)],
    voting='soft')


for clf in (best_rf0, best_rf1, voting_clf):
    clf.fit(new_train, new_train_label)
    y_pred = clf.predict(test)
    print(clf.__class__.__name__, roc_auc_score(test_label, y_pred))

RandomForestClassifier 0.5
RandomForestClassifier 0.5
VotingClassifier 0.5


In [17]:
y_pred_voter = voting_clf.predict(test)

print("Result of Voter")
print()
print(confusion_matrix(test_label, y_pred_voter))
print('accuracy:', accuracy_score(test_label, y_pred_voter))
print('precision:', precision_score(test_label, y_pred_voter))
print('recall:', recall_score(test_label, y_pred_voter))

Result of Voter

[[ 0 18]
 [ 0 32]]
accuracy: 0.64
precision: 0.64
recall: 1.0


In [18]:
param_grid = [
    {'max_depth' : [1,2,3,4,5,6,7,8,9,10],
    'n_estimators' : [100,200,300,400,500,600,700,800,900,1000]}    
]
### 0
rf0 = RandomForestClassifier()
grid_search = GridSearchCV(rf0, param_grid, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(train_resampled0, train_resampled_label0)
best_rf0 = grid_search.best_estimator_

### 1
rf1 = RandomForestClassifier()
grid_search = GridSearchCV(rf1, param_grid, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(train_resampled1, train_resampled_label1)
best_rf1 = grid_search.best_estimator_

### 2
rf2 = RandomForestClassifier()
grid_search = GridSearchCV(rf2, param_grid, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(train_resampled2, train_resampled_label2)
best_rf2 = grid_search.best_estimator_

### 3
rf3 = RandomForestClassifier()
grid_search = GridSearchCV(rf3, param_grid, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(train_resampled3, train_resampled_label3)
best_rf3 = grid_search.best_estimator_

### 4
rf4 = RandomForestClassifier()
grid_search = GridSearchCV(rf4, param_grid, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(train_resampled4, train_resampled_label4)
best_rf4 = grid_search.best_estimator_


### voting
voting_clf = VotingClassifier(
    estimators=[('rf0', rf0), ('rf1', rf1),('rf2', rf2),('rf3', rf3),('rf4', rf4),],
    voting='soft')

voting_clf.fit(new_train, new_train_label)

y_pred_voter = voting_clf.predict(test)

print("Result of Voter")
print()
print(confusion_matrix(test_label, y_pred_voter))
print('accuracy:', accuracy_score(test_label, y_pred_voter))
print('precision:', precision_score(test_label, y_pred_voter))
print('recall:', recall_score(test_label, y_pred_voter))



Result of Voter

[[ 4 14]
 [ 1 31]]
accuracy: 0.7
precision: 0.6888888888888889
recall: 0.96875


In [19]:
roc_auc_score(test_label, y_pred_voter)

0.5954861111111112

In [20]:
### voting
voting_clf = VotingClassifier(
    estimators=[('rf0', best_rf0), ('rf1', best_rf1),('rf2', best_rf2),('rf3', best_rf3),('rf4', best_rf4),],
    voting='soft')

voting_clf.fit(new_train, new_train_label)

y_pred_voter = voting_clf.predict(test)

print("Result of Voter")
print()
print(confusion_matrix(test_label, y_pred_voter))
print('accuracy:', accuracy_score(test_label, y_pred_voter))
print('precision:', precision_score(test_label, y_pred_voter))
print('recall:', recall_score(test_label, y_pred_voter))

Result of Voter

[[ 0 18]
 [ 0 32]]
accuracy: 0.64
precision: 0.64
recall: 1.0


In [22]:
for clf in (best_rf0, best_rf1, best_rf2, best_rf3, best_rf4, voting_clf):
    clf.fit(new_train, new_train_label)
    y_pred = clf.predict(test)
    print(clf.__class__.__name__, accuracy_score(test_label, y_pred))

RandomForestClassifier 0.66
RandomForestClassifier 0.64
RandomForestClassifier 0.64
RandomForestClassifier 0.64
RandomForestClassifier 0.64
VotingClassifier 0.64


In [30]:
### voting
voting_clf = VotingClassifier(
    estimators=[('rf0', rf0), ('rf1', rf1),('rf2', rf2),('rf3', rf3),('rf4', rf4),],
    voting='soft')

voting_clf.fit(new_train, new_train_label)

y_pred_voter = voting_clf.predict(test)

print("Result of Voter")
print()
print(confusion_matrix(test_label, y_pred_voter))
print('accuracy:', accuracy_score(test_label, y_pred_voter))
print('precision:', precision_score(test_label, y_pred_voter))
print('recall:', recall_score(test_label, y_pred_voter))

Result of Voter

[[ 4 14]
 [ 2 30]]
accuracy: 0.68
precision: 0.6818181818181818
recall: 0.9375




In [31]:
for clf in (best_rf0, best_rf1, best_rf2, best_rf3, best_rf4, voting_clf):
    clf.fit(new_train, new_train_label)
    y_pred = clf.predict(test)
    print(clf.__class__.__name__, accuracy_score(test_label, y_pred))

RandomForestClassifier 0.64
RandomForestClassifier 0.64
RandomForestClassifier 0.64
RandomForestClassifier 0.64
RandomForestClassifier 0.64
VotingClassifier 0.68




# Submission

In [28]:
testset = pd.read_csv('test.csv')

testset_index = testset['id']
testset = testset.drop('id', axis=1)

testset = pd.DataFrame(scale(testset))

testset = testset.set_index(testset_index)
del testset.index.name

# testset = testset[myselc[5]]

d={'id': testset_index, 'target': voting_clf.predict_proba(testset)[:, [1]].flatten()}

submission = pd.DataFrame(data=d)
submission.to_csv('submission/underSampling_lasso_rf_voter1.csv', encoding='utf-8', index=False)


In [29]:
tmp = pd.read_csv('submission/underSampling_lasso_rf_voter1.csv')
tmp

Unnamed: 0,id,target
0,250,0.627952
1,251,0.640802
2,252,0.653273
3,253,0.681680
4,254,0.652733
5,255,0.631556
6,256,0.623819
7,257,0.632681
8,258,0.649049
9,259,0.626046


In [32]:
testset = pd.read_csv('test.csv')

testset_index = testset['id']
testset = testset.drop('id', axis=1)

testset = pd.DataFrame(scale(testset))

testset = testset.set_index(testset_index)
del testset.index.name

# testset = testset[myselc[5]]

d={'id': testset_index, 'target': voting_clf.predict_proba(testset)[:, [1]].flatten()}

submission = pd.DataFrame(data=d)
submission.to_csv('submission/underSampling_lasso_rf_voter1.csv', encoding='utf-8', index=False)


In [33]:
tmp = pd.read_csv('submission/underSampling_lasso_rf_voter1.csv')
tmp

Unnamed: 0,id,target
0,250,0.56
1,251,0.62
2,252,0.52
3,253,0.62
4,254,0.54
5,255,0.62
6,256,0.50
7,257,0.66
8,258,0.52
9,259,0.60
