In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
import sys

In [2]:
# Make the project's helper scripts importable from this notebook.
# The directory is inserted at index 1 of sys.path, so whatever entry sits at
# index 0 keeps precedence over it.
scripts_dir = "../scripts"
sys.path.insert(1, scripts_dir)

# This import must stay below the sys.path tweak: ml_processors is resolved
# through the directory added above.
from ml_processors import ML_Processor as MLP

# Single shared processor instance; later cells call its helper methods.
mlp = MLP()

In [3]:
# Load the experiment records from disk and preview the first rows.
chrome_csv_path = '../data/chrome_data.csv'
ml_data = pd.read_csv(chrome_csv_path)
ml_data.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,browser,yes,no,SAID_YES
0,008aafdf-deef-4482-8fec-d98e3da054da,exposed,2020-07-04,16,Generic Smartphone,Chrome Mobile,1,0,1
1,00a1384a-5118-4d1b-925b-6cdada50318d,exposed,2020-07-06,8,Generic Smartphone,Chrome Mobile,0,1,0
2,00ebf4a8-060f-4b99-93ac-c62724399483,control,2020-07-03,15,Generic Smartphone,Chrome Mobile,0,1,0
3,013e45cf-e388-46a4-9c5b-a34303613940,exposed,2020-07-10,2,Generic Smartphone,Chrome Mobile,0,1,0
4,018af862-486e-4da1-a85b-71872120e57c,control,2020-07-03,15,Generic Smartphone,Chrome Mobile,1,0,1


In [4]:
# Remove the 'no' and 'SAID_YES' columns so that 'yes' is the only response
# column left in the frame. The original frame (ml_data) is left untouched.
# NOTE(review): in the previewed rows 'SAID_YES' mirrors 'yes' and 'no' is its
# complement -- confirm that holds for the whole file.
columns_to_be_dropped = ['no', 'SAID_YES']
ml_data_clean = ml_data.drop(columns=columns_to_be_dropped)
ml_data_clean.head(2)

Unnamed: 0,auction_id,experiment,date,hour,device_make,browser,yes
0,008aafdf-deef-4482-8fec-d98e3da054da,exposed,2020-07-04,16,Generic Smartphone,Chrome Mobile,1
1,00a1384a-5118-4d1b-925b-6cdada50318d,exposed,2020-07-06,8,Generic Smartphone,Chrome Mobile,0


In [5]:

# Split the column names of the cleaned frame into two lists.
# Per the recorded output, the first list (cat) holds the string-valued columns
# (auction_id, experiment, date, device_make, browser) and the second (num)
# holds the numeric ones (hour, yes).
cat, num = mlp.sep_cat_num(ml_data_clean)

# The two labels were previously swapped, so each list was reported under the
# other's heading; print each list under the heading that matches its content.
print("Categorical Columns:", cat)
print("****************")
print("Numerical Columns:", num)

Numerical Columns: ['auction_id', 'experiment', 'date', 'device_make', 'browser']
****************
Categorical Columns: ['hour', 'yes']


In [6]:
# Summarise the columns listed in `cat` (count / unique / top / freq).
categorical_frame = ml_data_clean[cat]
categorical_frame.describe()

Unnamed: 0,auction_id,experiment,date,device_make,browser
count,695,695,695,695,695
unique,695,2,8,14,1
top,008aafdf-deef-4482-8fec-d98e3da054da,exposed,2020-07-03,Generic Smartphone,Chrome Mobile
freq,1,371,176,665,695


In [7]:
# Preprocessing pipeline: every stage wraps one ML_Processor helper in a
# FunctionTransformer so the helpers can be chained on the cleaned frame.
#
# NOTE(review): the scaling stage runs before the split stage, so the scaler
# appears to see every row (train, validation and test) -- confirm whether that
# leakage is acceptable.
# NOTE(review): the feature range [0, 6] seems to include the 'auction_id'
# column, which is unique per row -- confirm it is meant to be a feature.
preprocessing_steps = [
    (
        "label categories",
        FunctionTransformer(mlp.cat_labeler, kw_args={"cat_cols": cat}),
    ),
    (
        "scale data",
        FunctionTransformer(mlp.scaler),
    ),
    (
        "separate target and features",
        FunctionTransformer(mlp.target_feature, kw_args={"f_r": [0, 6], "t": -1}),
    ),
    (
        "divide dataset",
        FunctionTransformer(
            mlp.set_splitter,
            kw_args={"test": 0.1, "val": 0.2, "rand_state": 8},
        ),
    ),
]
pipe_1 = Pipeline(steps=preprocessing_steps)

# `sets` is whatever the final stage (mlp.set_splitter) returns; later cells
# index into it by position.
sets = pipe_1.fit_transform(ml_data_clean)

cat_labeler output...

   auction_id  experiment  date  hour  device_make  browser  yes
0           0           1     1    16            1        0    1
1           1           1     3     8            1        0    0


scaler output... 

   auction_id  experiment      date      hour  device_make  browser  yes
0    0.000000         1.0  0.142857  0.695652     0.076923      0.0  1.0
1    0.001441         1.0  0.428571  0.347826     0.076923      0.0  0.0


target_features output... 

features size: (695, 6)


set_splitter output... 

X_train shape: (512, 6)
y_train shape: (512,)
x_test shape: (70, 6)
y_test shape: (70,)
X_val shape: (113, 6)
y_val shape: (113,)


In [8]:
# Model pipeline: standardise the features, then fit a logistic-regression
# classifier with a fixed random_state.
pipe_2 = Pipeline(
    steps=[
        ('scalar1', StandardScaler()),
        ('lr_classifier', LogisticRegression(random_state=2)),
    ]
)

In [9]:
# Fit on the first two entries of `sets`.
# NOTE(review): judging by the order set_splitter prints its shapes, positions
# 0/1 look like the training split and 4/5 like the validation split -- confirm
# against ML_Processor.set_splitter.
train_features, train_target = sets[0], sets[1]
pipe_2.fit(train_features, train_target)

# Score on entries 4 and 5; the value is left as the cell's last expression so
# the notebook displays it.
val_features, val_target = sets[4], sets[5]
pipe_2.score(val_features, val_target)

0.5663716814159292