In [87]:
import csv
import os
import pickle as pkl
import random

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_curve, auc, mean_squared_error,roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [7]:
def read_from_pkl(path):
    """Load and return the object stored in the pickle file at ``path``.

    Args:
        path: Location of the pickle file; it is opened in binary read mode.

    Returns:
        Whatever object was serialized into the file.

    Note:
        Unpickling can execute arbitrary code, so only use this on files
        from a trusted source.
    """
    with open(path, mode='rb') as source:
        return pkl.load(source)

In [8]:
def save_as_pkl(data,path):
    """Serialize ``data`` to ``path`` with the highest available pickle protocol.

    Args:
        data: Any picklable object.
        path: Destination file; it is created or overwritten in binary mode.

    Prints a short confirmation line once the file has been written.
    """
    with open(path, mode='wb') as sink:
        pkl.dump(data, sink, pkl.HIGHEST_PROTOCOL)
    print(path, 'saved..')

In [9]:
def save_as_csv(result,path):
    """Write prediction rows to a CSV file with an ``id,Predicted`` header.

    Args:
        result: Iterable of rows, each an ``(id, prediction)`` pair.
        path: Destination file; it is created or overwritten as UTF-8 text.

    Prints a short confirmation line once the file has been written.
    """
    headers = ['id','Predicted']
    # newline='' lets the csv module control line endings itself; without it,
    # platforms that translate '\n' (e.g. Windows) emit '\r\r\n', which shows
    # up as a blank line after every row.
    with open(path, 'w', encoding = 'utf8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(result)
    print(path,'saved...')

In [10]:
# Load the two pickled training feature tables. Each pickle is wrapped in a
# DataFrame and transposed so that samples ('train-0', 'train-1', ...) are rows.
train_data1 = read_from_pkl('./data/train_19_features.pkl')
train_data2 = read_from_pkl('./data/train_13_features.pkl')
train_data1 = pd.DataFrame(train_data1).T
train_data2 = pd.DataFrame(train_data2).T
# Column 0 of the 19-feature table holds the target. Cast it to int right here:
# the cast used to live in a later cell that is now disabled, so on a fresh
# kernel the labels would otherwise keep whatever dtype the transposed frame
# has (it looks like object dtype — confirm), which scikit-learn's
# classification metrics and estimators do not accept as binary labels.
labels = train_data1.iloc[:,0].astype(int)

In [119]:
# Feature matrix: every column of the 13-feature table from position 3 onward.
# NOTE(review): labels are taken from train_data1 while features come from
# train_data2 — this presumes both tables list samples in the same order; confirm.
# Earlier variant, kept for reference, that combined both feature tables:
#   train1 = train_data1.iloc[:,3:]
#   train2 = train_data2.iloc[:,2:]
#   samples = pd.concat([train1,train2],axis=1,ignore_index=True)
samples = train_data2.iloc[:, 3:]
samples.head(5)

Unnamed: 0,3,4,5,6,7,8,9,10,11,12,13,14
train-0,20.7752,0.00887762,0.504525,1.70529,6355.5,18.4085,0.252094,0.322956,0.357477,0.0,230.316,0.0177552
train-1,11.1955,0.0049507,0.269746,0.915855,5785.5,9.28548,0.415673,0.0553785,0.0912004,0.264945,1381.89,0.108107
train-2,20.7846,0.00892449,0.505882,1.73502,4902.0,18.4105,0.252614,0.32299,0.357548,0.0,230.316,0.017849
train-3,0.253102,0.000106213,0.00488729,0.025359,314247.0,0.105516,41.0383,0.00011231,0.0178255,0.0839517,684268.0,18.3549
train-4,40.6251,0.0175439,1.0,3.37639,3249.0,36.3094,0.245246,0.637007,0.637007,0.0,115.158,0.0175439


In [111]:
# samples = samples.drop([5,9,10,11,12,13,14,15,17,18,19,21,22,23,26],axis=1) # rm all+in
# samples = samples.drop([5,17,18,19,20,21,22,26],axis=1) # rm in
#samples = samples.drop([17,18,19,20,21,22],axis=1) # rm in

In [120]:
# Sanity check: (number of samples, number of feature columns).
samples.shape

(360000, 12)

In [113]:
'''
samples = samples.astype(float)
samples[[0,1,2,3,4,5,6,7,8]] = samples[[0,1,2,3,4,5,6,7,8]].astype(int)
labels = labels.astype(int)
print('x:',samples.head(3))
print('y:',labels.head(3))
'''

x:          0   1   2   3   4   5   6   7   8         9   ...     13    14  \
train-0  57   5  57   2   2   0  59   7  57  0.000000  ...  114.0  59.0   
train-1  57   5  57  12   4  12  69   9  69  0.825197  ...  684.0  80.0   
train-2  57   5  57   2   2   0  59   7  57  0.000000  ...  114.0  59.0   

               15        16        23        24        25        26  \
train-0  0.000000  0.000000  0.252094  0.322956  0.357477  0.000000   
train-1  0.340852  0.002924  0.415673  0.055379  0.091200  0.264945   
train-2  0.000000  0.000000  0.252614  0.322990  0.357548  0.000000   

                  27        28  
train-0   230.315789  0.017755  
train-1  1381.894737  0.108107  
train-2   230.315789  0.017849  

[3 rows x 23 columns]
y: train-0    1
train-1    1
train-2    1
Name: 0, dtype: int64


In [47]:
# feature selection
#from sklearn.feature_selection import VarianceThreshold
#sel = VarianceThreshold(threshold=(.6 * (1 - .6)))
#sel.fit_transform(samples)

array([[5.70000000e+01, 5.00000000e+00, 5.70000000e+01, ...,
        2.52093830e-01, 2.30315789e+02, 1.77552315e-02],
       [5.70000000e+01, 5.00000000e+00, 5.70000000e+01, ...,
        4.15672665e-01, 1.38189474e+03, 1.08106876e-01],
       [5.70000000e+01, 5.00000000e+00, 5.70000000e+01, ...,
        2.52614066e-01, 2.30315789e+02, 1.78489703e-02],
       ...,
       [3.70200000e+03, 1.95000000e+02, 3.70000000e+03, ...,
        5.39170399e-02, 4.45814703e+03, 1.11008130e-05],
       [3.70200000e+03, 1.95000000e+02, 3.70000000e+03, ...,
        2.79430115e-02, 4.45814703e+03, 9.36774002e-07],
       [3.70200000e+03, 1.95000000e+02, 3.70000000e+03, ...,
        6.44325790e-02, 1.56035146e+04, 1.98487023e-05]])

In [121]:
# Re-check the feature matrix dimensions right before splitting.
samples.shape

(360000, 12)

In [122]:
# split train & val (70% / 30%)
# A fixed random_state makes the split — and therefore every score reported
# below — repeatable across kernel restarts.
train_x,val_x,train_y,val_y = train_test_split(samples,labels,test_size=0.3,random_state=42)

In [123]:
# Report how many rows and columns ended up in each split.
print(f'train shape: {train_x.shape}')
print(f'val shape: {val_x.shape}')

train shape: (252000, 12)
val shape: (108000, 12)


In [124]:
# standardize: the scaling statistics are learned from the training split only
# and then applied unchanged to the validation split.
# NOTE: train_x / val_x are rebound to their scaled versions here, so running
# this cell again without re-running the split cell would fit the scaler on
# data that has already been scaled.
scaler = StandardScaler().fit(train_x)
train_x, val_x = scaler.transform(train_x), scaler.transform(val_x)
print(train_x[0])
print(val_x[0])

[-0.03051135 -0.13801379 -0.39404723 -0.36375751 -0.05176578 -0.020069
  0.00937153 -0.30506242  0.23281982  0.9845342  -0.02635507 -0.01471842]
[-0.02859016 -0.23680862 -0.42811346 -0.37857345 -0.03736607 -0.01986113
 -0.03368085 -0.29252826 -0.52334517 -0.24608051 -0.02878386 -0.02696611]


In [125]:
# grid search cv
# A random-forest regressor is tuned with 3-fold cross-validation and scored by
# ROC AUC on its continuous predictions.
# `criterion` is left at its default (mean squared error): the explicit 'mse'
# spelling was deprecated in scikit-learn 1.0 and removed in 1.2, whereas the
# default selects the same criterion on old and new releases alike.
pipeline = Pipeline([
    ('clf', RandomForestRegressor(random_state=42))  # fixed seed -> repeatable forest
])
parameters = {
       'clf__n_estimators': [100],
       #'clf__max_depth': [50],
       #'clf__min_samples_split': [1000],
       #'clf__min_samples_leaf': [1000],
}
grid_search = GridSearchCV(pipeline,parameters,n_jobs=-1,verbose=1,cv=3,scoring='roc_auc')
grid_search.fit(train_x,train_y)
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))
# NOTE(review): get_params() on a Pipeline also returns the step objects, so
# this pickle likely contains the fitted forest and not just the tuned values;
# grid_search.best_params_ would store only the latter — confirm what the
# consumers of this file expect before changing it.
save_as_pkl(best_parameters,'./rf_best_parameters_12.pkl')

# Hold-out evaluation on the validation split.
val_pred = grid_search.predict(val_x)
fpr, tpr, thresholds = roc_curve(val_y, val_pred, pos_label=1)
print('mean_squared_error:', mean_squared_error(val_y, val_pred))
# roc_auc_score yields the same area as auc(fpr, tpr) without relying on the
# imported `auc` function name staying unshadowed in the notebook namespace.
print('Auc:',roc_auc_score(val_y, val_pred))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  1.8min finished
Best score: 1.000
Best parameters set:
	clf__n_estimators: 100
./rf_best_parameters_12.pkl saved..
mean_squared_error: 0.0009806898148148149
Auc: 0.9997740643007227


In [136]:
# Fit a random-forest classifier on the standardized training split.
# RandomForestClassifier is imported in the notebook's import cell; a fixed
# random_state keeps the fitted forest (and the scores below) repeatable.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_x,train_y)


RandomForestClassifier()

In [140]:
# Validation ROC AUC of the classifier, scored on the probabilities in column 1
# of predict_proba.
val_pred = rf.predict_proba(val_x)[:,1]
# Stored as `val_auc` rather than `auc` so that the `auc` function imported from
# sklearn.metrics stays callable for any cell that uses it.
val_auc = roc_auc_score(val_y,val_pred)
print('auc:',val_auc)

auc: 0.9999309623962607


In [100]:
# prediction
# Read both pickled test feature tables and transpose them so that each test
# sample becomes one row.
test_data1 = pd.DataFrame(read_from_pkl('./data/test_19_features.pkl')).T
test_data2 = pd.DataFrame(read_from_pkl('./data/test_13_features.pkl')).T


In [77]:
# Peek at the first five rows of the 19-feature test table.
test_data1.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
1,,3563811,3600160,23,3,21,29,29,0,52,32,21,0.0,47.7849,0.0,0.0,667,27,0.0
2,,2052043,1401960,74,13,71,9,9,0,83,22,71,0.0,49.5286,0.0,0.0,666,31,0.0
3,,4517994,1690636,255,80,205,17,17,0,272,97,205,0.462508,356.919,0.0111524,0.0508183,4335,356,0.00462379
4,,1660006,4349447,511,32,506,36,36,0,547,68,506,0.178805,1223.15,0.00366972,0.0148185,18396,1847,2.80927e-05
5,,581111,1882617,21,5,18,46,46,0,67,51,18,0.0,75.4785,0.0,0.0,966,54,0.0


In [24]:
# Peek at the first five rows of the 13-feature test table.
test_data2.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
1,3563811,3600160,0.0,0.166192,3.06738e-05,0.00059617,0.0113831,1013070.0,0.0234397,0.139785,0.00110861,0.00475634,0,7611.81,0.000973567
2,2052043,1401960,0.0,0.471572,0.000228642,0.00788548,0.0599194,191422.0,0.00362802,0.0680789,5.10988e-05,0.000454739,0,9233.75,0.00214473
3,4517994,1690636,0.000692042,3.45737,0.000186637,0.0242947,0.146893,1569870.0,0.117964,0.206271,0.000554185,0.0069107,0,12038.8,0.00319814
4,1660006,4349447,0.000108719,8.61822,7.29712e-06,0.00315172,0.082303,18462300.0,0.321158,0.336062,0.00063456,0.0142708,0,141540.0,0.000248849
5,581111,1882617,0.0,0.183293,9.72129e-07,2.04037e-05,0.0078426,1158810.0,0.00601201,0.235388,0.000329829,0.0063242,0,21287.8,4.65595e-05


In [141]:
# Test feature matrix: same column slice (position 3 onward of the 13-feature
# table) that was used to build the training samples.
test_x = test_data2.iloc[:, 3:]
# Slices for the combined-feature variant; only the disabled concat uses them:
#   test_x = pd.concat([test1,test2],axis=1,ignore_index=True)
test1 = test_data1.iloc[:, 3:]
test2 = test_data2.iloc[:, 2:]
test_x.head(5)

Unnamed: 0,3,4,5,6,7,8,9,10,11,12,13,14
1,0.166192,3.06738e-05,0.00059617,0.0113831,1013070.0,0.0234397,0.139785,0.00110861,0.00475634,0,7611.81,0.000973567
2,0.471572,0.000228642,0.00788548,0.0599194,191422.0,0.00362802,0.0680789,5.10988e-05,0.000454739,0,9233.75,0.00214473
3,3.45737,0.000186637,0.0242947,0.146893,1569870.0,0.117964,0.206271,0.000554185,0.0069107,0,12038.8,0.00319814
4,8.61822,7.29712e-06,0.00315172,0.082303,18462300.0,0.321158,0.336062,0.00063456,0.0142708,0,141540.0,0.000248849
5,0.183293,9.72129e-07,2.04037e-05,0.0078426,1158810.0,0.00601201,0.235388,0.000329829,0.0063242,0,21287.8,4.65595e-05


In [102]:
#test_x = test_x.drop([5,9,10,11,12,13,14,15,17,18,19,21,22,23,26],axis=1)
#test_x = test_x.drop([5,17,18,19,20,21,22,26],axis=1)
#test_x.iloc[:5]

Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,12,13,14,15,16,23,24,25,27,28
1,23,3,21,29,29,52,32,21,0.0,47.7849,...,0.0,667,27,0.0,0.0,0.139785,0.00110861,0.00475634,7611.81,0.000973567
2,74,13,71,9,9,83,22,71,0.0,49.5286,...,0.0,666,31,0.0,0.0,0.0680789,5.10988e-05,0.000454739,9233.75,0.00214473
3,255,80,205,17,17,272,97,205,0.462508,356.919,...,0.0508183,4335,356,0.00462379,0.000692042,0.206271,0.000554185,0.0069107,12038.8,0.00319814
4,511,32,506,36,36,547,68,506,0.178805,1223.15,...,0.0148185,18396,1847,2.80927e-05,0.000108719,0.336062,0.00063456,0.0142708,141540.0,0.000248849
5,21,5,18,46,46,67,51,18,0.0,75.4785,...,0.0,966,54,0.0,0.0,0.235388,0.000329829,0.0063242,21287.8,4.65595e-05


In [142]:
# Apply the scaler fitted on the training split to the test features.
# NOTE: test_x is rebound to its scaled version, so this cell must run exactly
# once after test_x is built — a second run would scale the data again.
test_x = scaler.transform(test_x)
#test_x = pca.transform(test_x)
print(f'test shape: {test_x.shape}')
print(test_x[0])

test shape: (2000, 12)
[-0.03053658 -0.23081383 -0.42711944 -0.4066574  -0.05150469 -0.02006503
 -0.03340191 -0.29370497 -0.50401396 -0.24608051 -0.02877678 -0.0269635 ]


In [129]:
# Test-set predictions from the grid-searched regressor pipeline.
# NOTE(review): the classifier cell further down assigns to the same name, so
# whichever of the two cells ran last determines what gets written to the CSV.
test_pred = grid_search.predict(test_x)

In [144]:
# Test-set scores from the classifier: column 1 of predict_proba.
# Presumably that column is the positive class (label 1) — confirm via rf.classes_.
test_pred = rf.predict_proba(test_x)[:,1]

In [145]:
# Display the predictions that will be written to the results file.
test_pred

array([0.  , 0.  , 0.17, ..., 0.  , 0.  , 0.  ])

In [146]:
# Pair every prediction with a 1-based running id, in prediction order.
# NOTE(review): this presumes the test rows are ordered by id starting at 1 — confirm.
results = list(enumerate(test_pred, start=1))

# save results
output_path = './results/rf_12_features_prediction_clf.csv'
# Create the output directory if it is missing so the write cannot fail on a
# fresh checkout after the expensive training cells have already run.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
save_as_csv(results, output_path)

./results/rf_12_features_prediction_clf.csv saved...
