In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict

from sklearn.linear_model import LogisticRegression

from scipy.stats import randint
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

from sklearn.decomposition import PCA

In [5]:
train = pd.read_csv('train.csv')
train_label = train['target']

X_train, X_test, y_train, y_test = train_test_split(train, train_label, test_size=0.2)

X_train = X_train.drop(['id', 'target'], axis=1)
X_test = X_test.drop(['id', 'target'], axis=1)

X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

var=[99, 95, 90, 85, 80, 75, 70, 65, 60, 55, 50, 45, 40, 35, 30, 25, 20, 15, 10,9,8,7,6,5,4,3,2,1]
X_train_reduceds = []
X_test_reduceds = []
for i in var:
    globals()['pca%s' %i] = PCA(n_components=i/100)
    globals()['X_train_reduced%s' %i] = globals()['pca%s' %i].fit_transform(X_train)
    X_train_reduceds.append(globals()['X_train_reduced%s' %i])
    
    globals()['X_test_reduced%s' %i] = globals()['pca%s' %i].transform(X_test)
    X_test_reduceds.append(globals()['X_test_reduced%s' %i])
    
    print('pca'+str(i), globals()['pca%s' %i].n_components_)

pca99 177
pca95 144
pca90 121
pca85 106
pca80 93
pca75 82
pca70 73
pca65 65
pca60 58
pca55 51
pca50 45
pca45 39
pca40 33
pca35 28
pca30 24
pca25 19
pca20 15
pca15 11
pca10 7
pca9 6
pca8 6
pca7 5
pca6 4
pca5 4
pca4 3
pca3 2
pca2 2
pca1 1


In [6]:
lasso = LogisticRegression(class_weight='balanced', penalty='l1', C=0.1, solver='liblinear')

output = []

for i in range(len(X_train_reduceds)):
    lasso.fit(X_train_reduceds[i], y_train)
    y_pred = lasso.predict(X_test_reduceds[i])
    
    print('pca', var[i])
    print(confusion_matrix(y_test, y_pred))
    print(' accuracy:',accuracy_score(y_test, y_pred))
    print('precision:', precision_score(y_test, y_pred))
    print('recall:', recall_score(y_test, y_pred))
    print('roc_auc:', roc_auc_score(y_test, y_pred))
    print()
    
    output.append(('pca'+str(var[i]),roc_auc_score(y_test, y_pred) ))

pca 99
[[13  5]
 [14 18]]
 accuracy: 0.62
precision: 0.782608695652174
recall: 0.5625
roc_auc: 0.642361111111111

pca 95
[[13  5]
 [14 18]]
 accuracy: 0.62
precision: 0.782608695652174
recall: 0.5625
roc_auc: 0.642361111111111

pca 90
[[13  5]
 [14 18]]
 accuracy: 0.62
precision: 0.782608695652174
recall: 0.5625
roc_auc: 0.642361111111111

pca 85
[[13  5]
 [14 18]]
 accuracy: 0.62
precision: 0.782608695652174
recall: 0.5625
roc_auc: 0.642361111111111

pca 80
[[13  5]
 [15 17]]
 accuracy: 0.6
precision: 0.7727272727272727
recall: 0.53125
roc_auc: 0.626736111111111

pca 75
[[13  5]
 [13 19]]
 accuracy: 0.64
precision: 0.7916666666666666
recall: 0.59375
roc_auc: 0.6579861111111112

pca 70
[[13  5]
 [15 17]]
 accuracy: 0.6
precision: 0.7727272727272727
recall: 0.53125
roc_auc: 0.626736111111111

pca 65
[[13  5]
 [15 17]]
 accuracy: 0.6
precision: 0.7727272727272727
recall: 0.53125
roc_auc: 0.626736111111111

pca 60
[[13  5]
 [14 18]]
 accuracy: 0.62
precision: 0.782608695652174
recall: 0.5

In [7]:
output

[('pca99', 0.642361111111111),
 ('pca95', 0.642361111111111),
 ('pca90', 0.642361111111111),
 ('pca85', 0.642361111111111),
 ('pca80', 0.626736111111111),
 ('pca75', 0.6579861111111112),
 ('pca70', 0.626736111111111),
 ('pca65', 0.626736111111111),
 ('pca60', 0.642361111111111),
 ('pca55', 0.6736111111111112),
 ('pca50', 0.6579861111111112),
 ('pca45', 0.7048611111111112),
 ('pca40', 0.6458333333333334),
 ('pca35', 0.6145833333333334),
 ('pca30', 0.5989583333333334),
 ('pca25', 0.5868055555555556),
 ('pca20', 0.5434027777777778),
 ('pca15', 0.5434027777777778),
 ('pca10', 0.5),
 ('pca9', 0.5434027777777778),
 ('pca8', 0.5434027777777778),
 ('pca7', 0.5555555555555556),
 ('pca6', 0.5277777777777778),
 ('pca5', 0.5277777777777778),
 ('pca4', 0.5711805555555556),
 ('pca3', 0.5555555555555556),
 ('pca2', 0.5555555555555556),
 ('pca1', 0.6145833333333334)]

# Submission

In [8]:
train = pd.read_csv('train.csv')
train_label = train['target']
train = train.drop(['id', 'target'], axis=1)
train = StandardScaler().fit_transform(train)
train = pca65.transform(train)
lasso.fit(train, train_label)

test = pd.read_csv('test.csv').drop('id', axis=1)
test = StandardScaler().fit_transform(test)
test = pca65.transform(test)

submission = pd.read_csv('sample_submission.csv')
submission['target'] = lasso.predict_proba(test)[:,1]
submission.to_csv('submission/Lasso_PCA65.csv', index=False)

#### score : 0.674