In [67]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [68]:
# load data
train = pd.read_csv('../data/train_after_encode.csv')
test = pd.read_csv('../data/test_after_encode.csv')
protein_id = pd.read_csv('../data/protein_id.csv')
# define X,y
X_train = train.drop('y', axis=1)
y_train = train['y']
# create validation sets
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, test_size = 0.3)

# one vs rest

## attempt 1: raw X_train

In [6]:
params = {
    'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'penalty':['None','l1','l2','elasticnet'],
    'C':[100,10,1,0.1,0.01,0.001],
    'l1_ratio':[0,0.2,0.4,0.6,0.8,1]
}

In [112]:
# define model
ovr = LogisticRegression(multi_class='ovr')
# gridsearch
clfs = GridSearchCV(ovr, params, cv=10)
# fit model
clfs.fit(X_train, y_train)

In [135]:
#best score
clfs.best_score_

0.7018577920342154

In [136]:
#best parameter
clfs.best_params_

{'C': 10, 'l1_ratio': 0, 'penalty': 'l2', 'solver': 'lbfgs'}

In [137]:
#best estimator
best_ovr = clfs.best_estimator_

In [140]:
# save the best model
pd.to_pickle(best_ovr, '../best models/LR.pickle')

## section 2: consider feature importance

In [87]:
# save the coefficient of the best model
#best_clf_coef_t = pd.DataFrame(best_ovr.coef_)

In [90]:
# intepretation: the '0' column is the feature importance contributing to outcome 0, and so on
#best_clf_coef = best_clf_coef_t.T

0      0.493490
1      0.000000
2      0.000000
3      0.265601
4     -1.311055
         ...   
532   -0.428684
533   -0.839252
534    0.155636
535   -0.008009
536   -1.127341
Name: 0, Length: 537, dtype: float64

In [105]:
# # set index:
# # indices of X_train
# X_train_indices = X_train.columns
# best_clf_coef['index'] = X_train_indices
# # set index
# best_clf_coef.set_index('index')

In [102]:
# # save feature importance
# best_clf_coef.to_csv('logistic_reg_feature_importance.csv')

In [111]:
# # read feature importance
# fea_imp = pd.read_csv('logistic_reg_feature_importance.csv')
# fea_imp = fea_imp.drop('index.1', axis=1)
# fea_imp.set_index('index')

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0.493490,-0.208556,-0.775681,0.653828,-0.462872,0.392743,-1.411365,-1.452972e+00,0.362914,1.192880e-02,-1.034955e+00,-0.905222,-7.495101e-01,-6.238662e-01
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000e+00
5,0.265601,-0.075891,-0.785881,0.467792,-0.641171,-2.095073,-1.004156,2.111834e+00,-0.657020,4.971688e-01,-2.298225e-01,-0.210855,-1.385206e-01,-4.412683e-01
6,-1.311055,1.190044,-0.084539,-0.282648,-0.008957,-0.001383,-0.000268,-3.683999e-07,-0.000202,-1.316757e-04,-2.519637e-03,-0.001110,-2.260507e-05,-1.140655e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2156,-0.428684,-0.189888,-0.003814,0.708720,-0.052409,-0.001954,-0.000280,-8.155174e-04,0.000345,-9.617542e-04,-6.059177e-03,-0.000084,-1.286416e-04,-8.020559e-04
2157,-0.839252,1.170903,-0.000197,-0.000188,-0.233157,-0.000057,-0.000082,-7.341154e-07,-0.000003,-8.714626e-07,4.886900e-07,0.000001,1.449317e-07,-3.741966e-07
2163,0.155636,-0.126616,-0.190837,-0.015470,-0.001869,-0.007866,-0.000103,4.915722e-07,-0.059973,-1.923618e-02,-2.284416e-03,-0.003865,-9.773096e-05,-7.684161e-05
2169,-0.008009,-0.046761,-0.000080,0.105646,-0.003297,-0.000011,-0.164721,7.255349e-08,-0.002513,-5.952695e-05,-2.567012e-06,0.000001,2.084624e-07,3.008105e-07


In [None]:
# # write an array to score important features for each 
# fea_num = 14

In [52]:
# # X_train with features that have negative importance removed
# X_train_pos_imp = X_train[imp_no_neg_ind]
# X_val_pos_imp = X_val[imp_no_neg_ind]

In [55]:
# make predictions
# y_pred_val = best_ovr_2.predict(X_val_pos_imp)
# y_pred_val

array([ 9,  1,  0,  0,  1,  1,  5,  2,  0,  0,  0,  0,  0,  1,  1,  0,  4,
        1,  1,  6,  5,  0,  2,  0,  0,  0,  3,  1,  6,  1,  0,  1,  0,  4,
        0,  2,  4,  1,  1,  1,  0,  0,  6,  3,  1,  0,  0,  0,  0,  5,  0,
        3,  1,  2,  0,  3,  0,  3,  1,  3,  0,  3,  8,  0,  0,  0,  0,  0,
        1,  3,  0,  4,  0,  0,  0,  1,  3,  9,  0,  1,  1,  2,  1,  1,  0,
        1,  3,  1,  0,  1,  2,  0,  0,  0,  0,  0,  0,  7,  4,  7,  3,  0,
        1,  0,  0,  0,  1,  2,  5,  8,  8,  1,  1,  0,  5,  0,  0,  1,  0,
        0,  3,  0,  1,  2,  0,  6,  0,  0,  7,  0,  1,  1,  0,  2,  3,  4,
        2,  0,  6,  0,  0,  0,  1,  0,  0,  1,  2,  1,  2,  0,  3,  0,  0,
        1,  0,  1,  0,  1,  3,  0,  4,  0,  1,  0,  5,  0,  0,  0,  0,  0,
        0,  0,  1,  1,  0,  3,  0,  0,  1,  0,  0,  4,  0,  1,  8,  0,  5,
        1,  0,  0,  0,  0,  0,  8,  0,  7,  1,  1,  1,  1,  0,  2,  8,  0,
        2,  0,  1,  6,  0,  6,  1,  1,  8,  0,  3,  0,  0,  1,  5,  0,  1,
        6,  0,  0, 11,  1

In [56]:
# accuracy score
#accuracy_score(y_val, y_pred_val)

0.694980694980695

# one vs one(not good)

In [None]:
# # define model
# ovo = svm.SVC(decision_function_shape='ovo')
# # fit model
# ovo.fit(X_train, y_train)

In [None]:
# # make predictions
# y_pred_val = ovo.predict(X_val)
# y_pred_val

In [None]:
# # accuracy score
# accuracy_score(y_val, y_pred_val)

## section 3: save the submission file

In [12]:
# creating the result vector
y_pred_test = best_ovr.predict(test).astype(int)

In [13]:
y_pred_test

array([ 1,  0,  0,  3,  5,  1,  0,  0,  5,  2,  4,  1,  3,  1,  0,  0,  0,
        5,  0,  0,  0,  3,  1,  0,  7,  0,  1,  0,  1,  0,  0,  0,  0,  0,
        0,  1,  2,  1, 10,  1,  0,  0,  0,  0,  0,  1,  1,  7,  0,  0,  6,
        0,  5,  0,  5,  0,  1,  4,  1,  0,  1,  0,  0,  3,  1,  0,  4,  5,
        0,  1,  0,  5,  0,  1,  0,  3,  1,  1,  0,  0,  3,  0,  1,  0,  0,
        0,  2,  0,  3,  2,  0,  0,  5,  0,  0,  3,  3,  3,  3,  1,  0,  0,
        1,  0,  3,  4,  4,  0,  1,  1,  6,  0,  3,  4,  0,  0,  6,  0,  0,
        3,  0,  0,  1,  0,  0,  0,  0,  2,  0,  1,  0,  1,  0,  5,  4,  3,
        1,  2,  1,  1,  0,  8,  5,  0,  2,  8,  5,  3,  6,  3,  1,  1, 14,
        2,  0,  1,  1,  0,  0,  0,  0,  2,  0,  1,  0,  0,  0,  7,  0,  0,
        3,  0,  5,  3,  6,  0,  1,  3,  0,  6,  0,  7,  0,  1,  0,  3,  0,
        1,  0,  6,  0,  1,  0,  1,  4,  3,  1,  0,  7,  2,  2,  0,  2,  0,
        2,  0,  3,  1,  4,  0,  0,  0,  2,  0,  8,  0,  5,  0,  0,  0,  1,
        7,  1,  1,  8,  2

In [14]:
# create result
res = {'key':protein_id['0'], 'label':y_pred_test}
res = pd.DataFrame(res)
# set id to be index
res = res.set_index('key')
# sort by index
res = res.sort_index()

In [15]:
res

Unnamed: 0_level_0,label
key,Unnamed: 1_level_1
P234062,2
P234081,0
P234086,2
P234087,0
P234094,0
...,...
P240380,0
P240407,2
P240440,1
P240462,2


In [16]:
res.to_csv('../result/submission_logistic.csv')