In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.feature_extraction import DictVectorizer


In [2]:
# Read the cleaned sample data (comma-separated, header on the first row);
# the first CSV column is used as the DataFrame index. Preview the first rows.
df = pd.read_csv(
    '../data/sample_data_cleaned.csv',
    index_col=0,
    header=0,
    sep=',',
)
df.head()

Unnamed: 0,site_id,strategy_id,list_type,line_id,adv_id,adv_vertical,name,goal,price,limit,...,win_rate_site,win_rate_strat,cvr_strat,cvr,line_cvr,hist_zscore,overlap,target,win_rate_site_table,win_rate_strat_table
0,82932,313729,testing,20049,206.0,Travel,Nicole,0.0,3.95,10000.0,...,0.423778,0.111431,0.0,0.001197,0.0,2.708366,0.001066,0,0.450094,0.249479
1,90474,313729,testing,20049,206.0,Travel,Nicole,0.0,3.95,10000.0,...,0.16301,0.111431,0.0,0.001239,0.0,1.188635,0.000703,0,0.15805,0.249479
2,92345,313729,testing,20049,206.0,Travel,Nicole,0.0,3.95,10000.0,...,0.318358,0.111431,0.0,0.000729,0.0,1.503285,0.000873,0,0.360591,0.249479
3,92415,313729,testing,20049,206.0,Travel,Nicole,0.0,3.95,10000.0,...,0.133199,0.111431,0.0,0.005894,0.0,35.153628,0.004614,0,0.113717,0.249479
4,92425,313729,testing,20049,206.0,Travel,Nicole,0.0,3.95,10000.0,...,0.37931,0.111431,0.0,0.0,0.0,-0.091378,0.000344,0,0.019308,0.249479


In [3]:

# Columns used for modelling. 'target' is the label; it is carried through the
# encoding step and split off into y_train / y_test at the end of this cell.
features = ['site_id', 'strategy_id', 'name', 'adv_vertical', 'goal', 'price', 'avg_bid', 'max_bid', 'win_rate_site', 'win_rate_strat', 'hist_zscore', 'overlap', 'target']
df_subset = df[features]

# Ratio of site-level to strategy-level win rate. It is stored on df only: it
# is not listed in `features`, so it does not reach the model below.
df['win_rate_ratio'] = df['win_rate_site']/df['win_rate_strat']

# Define training and testing set: each row lands in the training set with
# probability 0.75. A dedicated seeded generator makes the split reproducible
# on Restart & Run All without touching numpy's global random state.
split_rng = np.random.RandomState(42)
df['is_train'] = split_rng.uniform(0, 1, len(df)) > 0.25
train = df.loc[df['is_train'], features]
test = df.loc[~df['is_train'], features]

# Vectorize the feature set; DictVectorizer one-hot encodes string-valued
# columns (e.g. adv_vertical) and passes numeric columns through.
# to_dict('records') yields one dict per row in row order; the previous
# .T.to_dict() keyed rows by index label, which drops rows with duplicate labels.
vec = DictVectorizer(sparse=False)
feats_train = train.to_dict('records')
features_train = vec.fit_transform(feats_train)
feats_test = test.to_dict('records')
# The vocabulary is learned from the training rows ONLY. The test rows are
# projected onto it with transform(), so both matrices share the same columns
# in the same order; categories that appear only in the test rows are ignored.
features_test = vec.transform(feats_test)

# Convert back to dataframes labelled with the (train-fitted) encoded names.
encoded_cols = vec.get_feature_names()
train = pd.DataFrame(features_train, columns=encoded_cols)
test = pd.DataFrame(features_test, columns=encoded_cols)

# Separate predictors from the label.
x_train = train.drop(['target'], axis=1)
y_train = train['target']
x_test = test.drop(['target'], axis=1)
y_test = test['target']

In [7]:
# 5-fold cross-validation of a logistic regression on the encoded feature set.
# random_state pins the shuffle so the folds are the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Vectorize the feature set; DictVectorizer one-hot encodes string-valued
# columns (e.g. adv_vertical) and passes numeric columns through.
# to_dict('records') yields one dict per row, in row order.
vec = DictVectorizer(sparse=False)
feats = df_subset.to_dict('records')
features_coded = vec.fit_transform(feats)

# Convert back to a dataframe with one column per encoded feature.
features_df = pd.DataFrame(features_coded, columns=vec.get_feature_names())

# Predictors are every encoded column EXCEPT the label. The list is derived
# from the vectorizer fitted in this cell (not from state left by an earlier
# cell), and 'target' is excluded so the label never leaks into the inputs.
feature_cols = [col for col in features_df.columns if col != 'target']

# NOTE(review): with a rare positive class, accuracy is dominated by the
# majority class; read it together with the row-normalised confusion matrix.
# LogisticRegression(class_weight='balanced') may be worth trying.
for train_index, test_index in kf.split(features_df):
    # kf.split yields positional row indices, hence iloc.
    x_train = features_df.iloc[train_index][feature_cols]
    x_test = features_df.iloc[test_index][feature_cols]
    y_train = features_df.iloc[train_index]['target']
    y_test = features_df.iloc[test_index]['target']

    lr = LogisticRegression().fit(x_train, y_train)
    y_pred = lr.predict(x_test)

    # Rows are true classes, columns are predicted classes; cm_norm divides
    # each row by its total so entries are per-true-class rates.
    cm = confusion_matrix(y_test, y_pred)
    cm_norm = cm.astype('float')/cm.sum(axis=1)[:, np.newaxis]
    print(cm)
    print(cm_norm)
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print("test accuracy: %0.3f \n" % lr.score(x_test, y_test))
    print("train accuracy: %0.3f \n" % lr.score(x_train, y_train))

[[388   0]
 [ 12   0]]
[[ 1.  0.]
 [ 1.  0.]]
test accuracy: 0.970 

train accuracy: 0.974 

[[389   0]
 [ 11   0]]
[[ 1.  0.]
 [ 1.  0.]]
test accuracy: 0.973 

train accuracy: 0.974 

[[395   0]
 [  5   0]]
[[ 1.  0.]
 [ 1.  0.]]
test accuracy: 0.988 

train accuracy: 0.970 

[[387   0]
 [ 12   0]]
[[ 1.  0.]
 [ 1.  0.]]
test accuracy: 0.970 

train accuracy: 0.974 

[[386   0]
 [ 13   0]]
[[ 1.  0.]
 [ 1.  0.]]
test accuracy: 0.967 

train accuracy: 0.975 

