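SMOTE oversampling is applied to the CPC dataset, and a random-forest classifier is then trained on the resampled data and evaluated on a held-out test split.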

In [73]:
import numpy as np
import pandas as pd
import json
import glob, os
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
import random
from sklearn.model_selection import StratifiedShuffleSplit
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

In [38]:
# Load the categorical vocabulary (a JSON object). Its keys are used further
# down to decide which dataframe columns are treated as categorical.
categories_index_path = '../data/processed/mm-cpc-generator/train/categorical-vocab.json'
# A context manager guarantees the file handle is closed even if parsing fails.
with open(categories_index_path) as cat_idx:
    cat_index = json.load(cat_idx)

In [39]:
# Collect every CSV path in the processed training folder.
data_path = '../data/processed/mm-cpc-generator/train/'
# glob returns paths in arbitrary, filesystem-dependent order. The negative-class
# loader below stops after ~100k rows, so the order decides which files are
# actually read; sorting makes that selection reproducible across machines.
data_paths = sorted(glob.glob(os.path.join(data_path, '*.csv')))
print ('There are total {} csv files in the folder'.format(len(data_paths)))

There are total 202 csv files in the folder


In [40]:
# Keep only the CSV paths whose path string contains 'positive'
# (i.e. the positive-class files).
pos_paths = [csv_path for csv_path in data_paths if 'positive' in csv_path]
print('out of which, {} are positve class csv files'.format(len(pos_paths)))

out of which, 101 are positve class csv files


In [41]:
# Read every positive-class CSV and stack them into one dataframe.
# The frames are read into a list and concatenated ONCE: calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
pos_frames = [pd.read_csv(path, index_col=None, header=0) for path in pos_paths]

# Per-file frames keyed by their position in pos_paths (kept for inspection).
pos_files_dict = dict(enumerate(pos_frames))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# positive files were found. ignore_index=True yields a fresh 0..n-1 index.
pos_df = pd.concat(pos_frames, ignore_index=True) if pos_frames else pd.DataFrame()

In [42]:
# Preview the first rows of the concatenated positive-class frame.
pos_df.head()

Unnamed: 0,exchange_id,user_frequency,site_id,deal_id,channel_type,size,week_part,day_of_week,dma_id,isp_id,...,am_859849_bpr,mm_1213050_bpr,mm_1213051_bpr,mm_1213052_bpr,mm_1213053_bpr,mm_1213054_bpr,mm_1213056_bpr,mm_1213057_bpr,mm_1213058_bpr,column_weights
0,44,99,4383729,0,2,55968224,0,2,80012,30001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.561625
1,30,99,2057062530,0,2,27918576,0,1,80097,30003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.561625
2,1005,99,669618330,187073,2,41943400,0,1,80017,30001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.561625
3,1005,99,967440096,187073,2,41943400,0,2,80003,30025,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.561625
4,30,99,1278868291,0,2,27918576,0,1,80201,30014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.561625


In [43]:
# Report how many positive-class rows were loaded in total.
print('number of positive class labeled datarows: {}'.format(pos_df.shape[0]))

number of positive class labeled datarows: 400979


In [44]:
# Draw a fixed-size random subset of the positive rows (sampling without
# replacement; raises ValueError if pos_df has fewer rows than requested).
N_POS_SAMPLES = 12500
POS_SAMPLE_SEED = 42
# A dedicated, seeded generator makes the subset identical on every
# Restart & Run All without touching the global `random` state.
a = random.Random(POS_SAMPLE_SEED).sample(range(pos_df.shape[0]), N_POS_SAMPLES)
pos_df_sample = pos_df.iloc[a,:].reset_index(drop=True)

In [45]:
# Shape of the sampled positive rows (before the label column is added).
pos_df_sample.shape

(12500, 97)

In [46]:
# Label every sampled positive row with the target value 1.
pos_df_sample['y'] = 1

In [47]:
# Shape again, now including the added `y` column.
pos_df_sample.shape

(12500, 98)

In [48]:
# Load negative-class CSVs until MORE than NEG_ROW_LIMIT rows have been read.
# Whole files are loaded, so the final row count can exceed the limit; it is
# a stop condition, not a hard cap.
NEG_ROW_LIMIT = 100000

neg_frames = []
neg_rows_loaded = 0
for path in data_paths:
    if 'negative' not in path:
        continue
    neg_frame = pd.read_csv(path, index_col=None, header=0)
    neg_frames.append(neg_frame)
    # Track the running total instead of re-concatenating on every iteration.
    neg_rows_loaded += neg_frame.shape[0]
    if neg_rows_loaded > NEG_ROW_LIMIT:
        break

# Concatenate once; pd.concat raises on an empty list, so guard that case.
neg_df = pd.concat(neg_frames) if neg_frames else pd.DataFrame()

neg_df_sample = neg_df.reset_index(drop=True)

In [49]:
# Shape of the loaded negative rows (before the label column is added).
neg_df_sample.shape

(102990, 97)

In [50]:
# Label every negative row with the target value 0.
neg_df_sample['y'] = 0

In [51]:
# Shape again, now including the added `y` column.
neg_df_sample.shape

(102990, 98)

In [52]:
# Stack negatives on top of positives into one frame with a fresh 0..n-1 index.
df = pd.concat([neg_df_sample, pos_df_sample], ignore_index=True)

In [53]:
# Size of the combined (negative + positive) frame, label column included.
df.shape

(115490, 98)

In [54]:
# Mark the columns named in the categorical vocabulary as pandas categoricals.
# Every other column keeps the dtype it was read with. A blanket cast of the
# remaining columns to int64 silently truncates float-valued features (for
# example `column_weights`, shown as 8.561625 in the preview, would become 8).
categorical_dtypes = {col: 'category' for col in df.columns if col in cat_index}
df = df.astype(categorical_dtypes)

In [55]:
# Replace every categorical column with its integer category codes.
# The codes are derived from the combined frame (before the train/test split),
# so the same category maps to the same code in both splits; pandas encodes
# missing values as -1.
# `cat_columns` is reused after SMOTE to round the interpolated codes.
cat_columns = df.select_dtypes(['category']).columns #filter categorical columns
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)

In [56]:
# Extract the target column into its own single-column dataframe.
# Double brackets keep `y` two-dimensional (n_rows x 1); later cells index it
# as y['y'] / y.iloc[rows, :].
y = df[['y']]

In [57]:
# Build the input-feature frame X by removing columns that must not be model inputs:
#   - 'y'          : the target itself.
#   - 'Unnamed: 0' : the name pandas gives to a header-less first CSV column;
#                    here it holds the per-file row number (0, 1, 2, ... in the
#                    preview), which carries no signal and can leak file origin.
# NOTE(review): `column_weights` looks like a per-class sample weight (constant
# 8.561625 across the previewed positive rows). If it differs between classes it
# leaks the label and would explain the perfect scores below. Confirm and drop
# it here as well if so.
NON_FEATURE_COLUMNS = ['y', 'Unnamed: 0']
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
df = df.drop(NON_FEATURE_COLUMNS, axis=1, errors='ignore')
df.shape

(115490, 97)

In [58]:
# Stratified 80/20 shuffle split: class proportions of `y` are preserved in
# both the train and the test index arrays.
# Only one split is consumed downstream, so only one is generated.
sss = StratifiedShuffleSplit(n_splits=1, train_size=0.8, test_size=0.2, random_state=42)
# idx1 is a (train_indices, test_indices) tuple of positional row indices.
idx1 = next(sss.split(df, y))

In [59]:
# Number of rows in the train and test index arrays of the split.
len(idx1[0]),len(idx1[1])

(92392, 23098)

In [60]:
# Materialise the train/test frames from the positional indices of the split.
# Each frame gets a fresh 0..n-1 index so features and labels stay aligned.
train_idx, test_idx = idx1[0], idx1[1]

x_tr = df.iloc[train_idx, :].reset_index(drop=True)
x_te = df.iloc[test_idx, :].reset_index(drop=True)

y_tr = y.iloc[train_idx, :].reset_index(drop=True)
y_te = y.iloc[test_idx, :].reset_index(drop=True)

In [61]:
# Oversample the minority class on the TRAINING split only; the test split is
# left untouched so it keeps the original class balance.
# NOTE(review): plain SMOTE interpolates between neighbours, which is not
# meaningful for label-encoded categorical columns (the codes are rounded
# afterwards). imblearn's SMOTENC handles mixed data; consider switching.
sm = SMOTE(random_state=42,k_neighbors=20)
# `fit_sample` is the old name of `fit_resample` and is absent from current
# imbalanced-learn releases; prefer the new name, fall back on old installs.
resample = sm.fit_resample if hasattr(sm, 'fit_resample') else sm.fit_sample
# Pass the labels as a 1-D array: a single-column DataFrame triggers the
# column_or_1d DataConversionWarning.
x_res, y_res = resample(x_tr, y_tr['y'].values)

  y = column_or_1d(y, warn=True)


In [62]:
# Class counts before resampling, after resampling, and in the untouched test split.
train_class_counts = Counter(y_tr['y'])
resampled_class_counts = Counter(y_res)
test_class_counts = Counter(y_te['y'])

print('Original dataset shape {}'.format(train_class_counts))
print('Resampled dataset shape {}'.format(resampled_class_counts))
print ('Test dataset shape {}'.format(test_class_counts))

Original dataset shape Counter({0: 82392, 1: 10000})
Resampled dataset shape Counter({0: 82392, 1: 82392})
Test dataset shape Counter({0: 20598, 1: 2500})


In [63]:
# Wrap the SMOTE output back into labelled dataframes.
feature_names = x_tr.columns
label_names = y_tr.columns

x_r = pd.DataFrame(x_res, columns=feature_names)
# Synthetic rows carry interpolated (fractional) category codes; snap them
# back to whole numbers so they look like valid codes again.
x_r[cat_columns] = x_r[cat_columns].round()

y_r = pd.DataFrame(y_res, columns=label_names)

In [64]:
# Report the size of the resampled training feature frame.
print ('shape of train data after SMOTE: {}'.format(x_r.shape))

shape of train data after SMOTE: (164784, 97)


In [65]:
# Random permutation of the row positions of the resampled training data.
# A dedicated seeded generator makes the shuffle (and therefore the CV fold
# assignment below) reproducible without touching the global `random` state.
SHUFFLE_SEED = 42
shuffled_idx = random.Random(SHUFFLE_SEED).sample(range(x_r.shape[0]), x_r.shape[0])

In [66]:
# Apply the same permutation to features and labels so rows stay paired.
# SMOTE output is ordered (original rows first, synthetic rows appended).
x_r, y_r = x_r.iloc[shuffled_idx, :], y_r.iloc[shuffled_idx, :]

In [67]:
# Fit a random forest through GridSearchCV (single-point grid, 5-fold CV,
# default scoring = accuracy).
# NOTE(review): the folds are cut from data that was ALREADY SMOTE-resampled,
# so validation folds contain synthetic points interpolated from rows in the
# training folds. The CV score is therefore optimistic. To get an honest
# estimate, resample inside each fold (e.g. via an imblearn Pipeline).
param_grid = {'n_estimators':  [20],'random_state': [42]}
grid = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5)
# `.values` gives the 1-D label array directly; Series.ravel() is deprecated
# in recent pandas releases.
grid.fit(x_r, y_r['y'].values)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [20], 'random_state': [42]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [68]:
# Report the cross-validated score of the best (here: only) parameter set.
# NOTE(review): a score of exactly 1.000 usually signals label leakage or
# contaminated folds rather than a perfect model. Verify before trusting it.
print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))
# Test-set evaluation (precision/recall, ROC AUC, log loss) follows in the cells below.


best mean cross-validation score: 1.000
best parameters: {'n_estimators': 20, 'random_state': 42}


In [69]:
# Predicted class labels for the held-out test features (refit best estimator).
grid.predict(x_te)

array([0, 0, 0, ..., 0, 0, 0])

In [70]:
# Distribution of predicted labels on the test split.
Counter(grid.predict(x_te))

Counter({0: 20598, 1: 2500})

In [71]:

# Precision / recall / F-score of the positive class on the test split.
y_pr = grid.predict(x_te)
# average='binary' scores only the positive label; support is not reported.
precision, recall, fscore, _support = precision_recall_fscore_support(
    y_te, y_pr, average='binary'
)
print('precision: {}, recall: {}, fscore: {}'.format(precision, recall, fscore))

precision: 1.0, recall: 1.0, fscore: 1.0


In [74]:
# ROC AUC on the test split from the predicted class probabilities.
# predict_proba returns one column per class; column 1 is taken as the
# positive class (assumes the fitted classes are ordered [0, 1]).
y_pr_proba = grid.predict_proba(x_te)
y_pr_proba_1 = y_pr_proba[:,1] #probas for class being '1'
area = roc_auc_score(y_te, y_pr_proba_1)
print('roc_auc:{}'.format(area))

roc_auc:1.0


In [80]:
# Cross-entropy (log loss) on the test split, computed from the full
# per-class probability matrix produced by predict_proba.
cross_entropy_loss = log_loss(y_te,y_pr_proba)
print('cross_entropy_loss :{:.5f}'.format(cross_entropy_loss))

cross_entropy_loss :0.00190
