In [83]:
import pandas as pd
import numpy as np
from MLsandbox import model_dict 

In [84]:
# Load the cleaned sample data; the first CSV column is used as the row index
# and the first line of the file supplies the column names.
df = pd.read_csv(
    '../data/sample_data_cleaned.csv',
    sep=',',
    header=0,
    index_col=0,
)
# Preview the first few rows as the cell output.
df.head()

Unnamed: 0,site_id,strategy_id,list_type,line_id,adv_id,adv_vertical,name,goal,price,limit,...,win_rate_site,win_rate_strat,cvr_strat,cvr,line_cvr,hist_zscore,overlap,target,win_rate_site_table,win_rate_strat_table
0,82932,313729,testing,20049,206,Travel,Nicole,0,3.95,10000,...,0.423778,0.111431,0,0.001197,0,2.708366,0.001066,0,0.450094,0.249479
1,90474,313729,testing,20049,206,Travel,Nicole,0,3.95,10000,...,0.16301,0.111431,0,0.001239,0,1.188635,0.000703,0,0.15805,0.249479
2,92345,313729,testing,20049,206,Travel,Nicole,0,3.95,10000,...,0.318358,0.111431,0,0.000729,0,1.503285,0.000873,0,0.360591,0.249479
3,92415,313729,testing,20049,206,Travel,Nicole,0,3.95,10000,...,0.133199,0.111431,0,0.005894,0,35.153628,0.004614,0,0.113717,0.249479
4,92425,313729,testing,20049,206,Travel,Nicole,0,3.95,10000,...,0.37931,0.111431,0,0.0,0,-0.091378,0.000344,0,0.019308,0.249479


## First, a simple split

#### Define the desired test-train split

Here, we want about 75% of records to be in the training set, with the remaining 25% serving as the test set

In [85]:
pct_train = 0.75
split_seed = 12  # fixed seed so the random assignment is reproducible on re-run

# For each row in df, draw a random number uniformly from [0, 1).
# Rows whose draw falls below pct_train get 'is_train' = True, so roughly
# pct_train of the rows end up flagged for training.
# A dedicated RandomState is used so the global numpy RNG is left untouched.
rng = np.random.RandomState(split_seed)
df['is_train'] = rng.uniform(0, 1, len(df)) < pct_train

# NOTE: 'is_train' is a helper column added to df in place; it records the
# split assignment only and should be excluded from any feature list.
df[['is_train']].head()

Unnamed: 0,is_train
0,True
1,True
2,True
3,False
4,True


#### Define train and test data

These are the subsets of df that have 'is_train' = True and False, respectively

In [86]:
# 'is_train' is a boolean column, so it can be used directly as a row mask;
# its negation selects the complementary (test) rows.
train, test = df[df['is_train']], df[~df['is_train']]

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("Training set has %s records" % str(len(train)))
print("Test set has %s records" % str(len(test)))

Training set has 1484 records
Test set has 514 records


#### Split train and test data into features and target variables

and then print a list of features by their data types

In [87]:
# Columns that must not be used as model inputs: the label itself and the
# 'is_train' helper flag, which only records the split assignment (it is
# constant within each subset and carries no predictive information).
non_feature_cols = ['target', 'is_train']

x_train = train.drop(non_feature_cols, axis=1)
y_train = train['target']
x_test = test.drop(non_feature_cols, axis=1)
y_test = test['target']

# Show each remaining feature alongside its data type.
x_train.dtypes

## Use sklearn's built-in train_test_split function

The first step is to define the columns in our data frame that are features.  Then we can separate the dataframe into features versus target, like so:  

    features_data, target_data = df[features], df[target]

This becomes most useful when you want to remove certain features from the model that were included in your original dataset



In [88]:
from sklearn.model_selection import train_test_split

target = 'target'

# Every column except the label is a candidate feature. 'is_train' is excluded
# as well: it is a helper flag added to df by the manual split earlier in the
# notebook, not a predictor.
excluded_cols = {target, 'is_train'}
features = [col for col in df.columns if col not in excluded_cols]

#### Now, implement sklearn's train_test_split

We are simultaneously splitting the dataframe by columns and by rows:
- by columns: by indicating that df[features] are the features columns
- by rows: by setting `test_size=0.2`, so that 20% of the rows are randomly held out as the test set



In [89]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    df[features], df[target], test_size=0.2, random_state=12)

# Class balance of each subset. Assumes the target column is coded 0/1, so its
# sum is the number of successes -- confirm against the data.
n_train_success = Y_train.sum()
n_test_success = Y_test.sum()

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print('X_train and Y_train have {0} records: {1} successes and {2} failures'.format(
    len(X_train), n_train_success, len(X_train) - n_train_success))
print('X_test and Y_test have {0} records: {1} successes and {2} failures'.format(
    len(X_test), n_test_success, len(X_test) - n_test_success))

X_train and Y_train have 1598 records: 46 successes and 1552 failures
X_test and Y_test have 400 records: 7 successes and 393 failures


## Stratified split using sklearn

When dealing with rare events, you may find that a typical train-test split leaves one subset with very few examples of a target class.  To get an accurate picture of recall and precision, you should have train and test sets with similar class balance.  StratifiedShuffleSplit is one way to do this.

In [90]:
from sklearn.model_selection import StratifiedShuffleSplit as strat_split

# One stratified shuffle: 20% of rows go to the test set while the class
# proportions of the target are preserved in both subsets.
sss = strat_split(n_splits=1, test_size=0.2, random_state=12)

for train_index, test_index in sss.split(df[features], df[target]):
    print("TRAIN:", len(train_index), "TEST:", len(test_index))
    # split() yields *positional* row indices, so rows are selected with .iloc.
    # (.ix was label-based for integer indexes and has been removed from pandas.)
    Xtrain, Xtest = df[features].iloc[train_index], df[features].iloc[test_index]
    ytrain, ytest = df[target].iloc[train_index], df[target].iloc[test_index]

# Class balance of each subset. Assumes the target column is coded 0/1, so its
# sum is the number of successes -- confirm against the data.
n_train_success = ytrain.sum()
n_test_success = ytest.sum()

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print('X_train and Y_train have {0} records: {1} successes and {2} failures'.format(
    len(Xtrain), n_train_success, len(Xtrain) - n_train_success))
print('X_test and Y_test have {0} records: {1} successes and {2} failures'.format(
    len(Xtest), n_test_success, len(Xtest) - n_test_success))

('TRAIN:', 1598, 'TEST:', 400)
X_train and Y_train have 1598 records: 42 successes and 1556 failures
X_test and Y_test have 400 records: 11 successes and 389 failures


## K-fold cross-validation

Separates the data into K "folds" of (approximately) equal size.  Then, each fold is held out in turn as a test set while the model is trained on the remaining data.  This produces K cross-validation scores, which can be averaged to estimate how the model will perform on a new dataset.

In [91]:
from sklearn.model_selection import KFold

# Two folds: each half of the rows serves once as the test set while the
# other half is used for training.
kf = KFold(n_splits=2)

# The loop variables are named *_idx so they do not overwrite the `train` and
# `test` DataFrames created by the manual split earlier in the notebook.
for train_idx, test_idx in kf.split(df):
    # split() yields *positional* row indices, so rows are selected with .iloc.
    # (.ix was label-based for integer indexes and has been removed from pandas.)
    xtrain, xtest = df[features].iloc[train_idx], df[features].iloc[test_idx]
    ytrain, ytest = df[target].iloc[train_idx], df[target].iloc[test_idx]
    # fit model on training data here
    # predict on test data here
    # print evaluation score here

#### read more here: https://pdfs.semanticscholar.org/0be0/d781305750b37acb35fa187febd8db67bfcc.pdf