# Data Preparation for the Allstate Kaggle Challenge

In [1]:
'''
Imports
'''
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder

In [2]:
'''
Routines for two preprocessing options:
preProcess1: binarize categorical variables and apply a minimum variance threshold.
preProcess2: convert categorical variable labels to integers.
'''

def preProcess1(df, prob):
    '''
    Binarize the categorical features and discard uninformative dummies.

    Processing steps:
      1. One-hot encode the frame with pd.get_dummies (all levels kept).
      2. Apply a variance threshold of prob*(1-prob) to the "cat*" dummy
         columns, which removes binary features whose minority label
         appears in a fraction less than prob of the rows.
      3. For every original categorical feature that still has all of its
         dummies after step 2, drop one of them so that the remaining
         dummies of that feature are not redundant.

    Parameters:
      df   -- data frame whose categorical columns have names starting
              with "cat", whose continuous columns have names starting
              with "cont", and which contains a "loss" column.
      prob -- minority-label fraction used to build the variance threshold.

    Returns:
      A new data frame with the retained dummy columns first, followed by
      the "cont*" columns and finally "loss". The input frame is not
      modified.

    Raises:
      KeyError if df has no "loss" column.
      NOTE(review): VarianceThreshold.fit is expected to raise ValueError
      when no dummy passes the threshold (e.g. prob=0.5) -- confirm
      against the installed scikit-learn version.
    '''

    # Record the number of levels of each categorical feature *before*
    # binarization; step 3 compares against these counts.
    catdict = {key: len(df[key].unique())
               for key in df.columns.values if key[:3] == "cat"}
    print("Total number of categorical feature levels: {0}".format(sum(catdict.values())))

    # Binarize categorical features
    df = pd.get_dummies(df, drop_first=False)
    print("Shape of data frame after binarization: {0}".format(df.shape))

    # Eliminate binary features for which the minority label appears
    # in a fraction less than prob of instances
    cats   = [feature for feature in df.columns.values if feature[:3] == "cat"]
    conts  = [feature for feature in df.columns.values if feature[:4] == "cont"]
    binvar = prob * (1.0 - prob)
    if cats:
        sel = VarianceThreshold(threshold=binvar)
        sel.fit(df[cats])
        kept_cats = [cats[ind] for ind in sel.get_support(indices=True)]
    else:
        # Nothing to filter; fitting a selector on zero columns would fail.
        kept_cats = []
    features = kept_cats + conts + ["loss"]
    df       = df[features]
    print("Shape of data frame after variance filter: {0}".format(df.shape))

    # Eliminate one dummy binary per category not affected by the low-variance filter.
    # Dummies are named "<feature>_<level>", so match on that exact prefix
    # rather than on a substring anywhere in the column name.
    remove = set()
    for key, nlevels in catdict.items():
        prefix  = key + "_"
        binlist = [feature for feature in kept_cats if feature.startswith(prefix)]
        if binlist and len(binlist) == nlevels:
            remove.add(binlist[0])
    keep = [feature for feature in features if feature not in remove]
    df   = df[keep]
    print("Shape of data frame after dummy elimination: {0}\n".format(df.shape))

    # Done
    return df

def preProcess2(df, prefix="cat"):
    '''
    Convert categorical variable labels to integers.

    Every column whose name starts with prefix is replaced by the integer
    codes produced by a LabelEncoder fitted on that column alone. Columns
    are selected by name (the same "cat" convention preProcess1 uses)
    instead of by a fixed position, so frames with a different number or
    order of categorical columns are handled correctly.

    Parameters:
      df     -- data frame containing the categorical columns.
      prefix -- name prefix identifying the categorical columns
                (default "cat").

    Returns:
      A new data frame with the selected columns integer-encoded and all
      other columns unchanged. The input frame is not modified.
    '''

    cat_cols = [col for col in df.columns if str(col).startswith(prefix)]

    # Work on a copy so the caller's frame keeps its original labels.
    out = df.copy()

    # Assign column by column: each encoded column then takes the integer
    # dtype returned by the encoder rather than inheriting the old dtype.
    for col in cat_cols:
        out[col] = LabelEncoder().fit_transform(out[col])

    # Done
    return out

In [None]:
'''
Load and preprocess the Allstate data. Each row consists of one index, 
116 categorical predictors, 14 continuous predictors, and (in the case 
of the training data) one continuous response variable called loss.

Note that we combine train and test data before doing the preprocessing.
This is to avoid problems with categorical variables that have labels
appearing in one set but not in the other. Binarizing or integerizing 
such variables would lead to label confusion across sets.
'''

# Choose preprocessing options:
#   preProc -- 1 selects preProcess1 (binarize + variance filter),
#              2 selects preProcess2 (integer labels).
#   prob    -- minority-label fraction for preProcess1; ignored when preProc==2.
preProc = 1
prob    = 0.05
# Explicit checks instead of assert statements, which are skipped under "python -O".
if preProc not in (1, 2):
    raise ValueError("Invalid preprocessing option")
if preProc == 1 and not (0.0 <= prob <= 1.0):
    raise ValueError("Invalid probability value")

# Read entire training and testing data sets, and combine them for preprocessing purposes.
# But first reshuffle training set to avoid bias when we split it into training/validation/testing subsets.
dfa = pd.read_csv("data/train.csv", delimiter=",", header=0, index_col=0, nrows=None)
dfaLength = len(dfa)
# Sampling all rows without replacement is a reproducible shuffle (fixed seed).
df0 = dfa.sample(n=dfaLength, replace=False, weights=None, random_state=123, axis=0)
df0Length = len(df0)
print('Shape of training data frame: {0}'.format(df0.shape))
df1 = pd.read_csv("data/test.csv", delimiter=",", header=0, index_col=0, nrows=None)
df1Length = len(df1)
print('Shape of testing data frame:  {0}'.format(df1.shape))
# The test set has no response; add a placeholder so both frames share the same columns.
df1['loss'] = -1.0
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Training rows come first, so the frames can be separated again by position.
df2 = pd.concat([df0, df1])
print('Shape of combined data frame: {0}'.format(df2.shape))

# Do the preprocessing
if preProc==1:
    df2 = preProcess1(df2, prob)
elif preProc==2:
    df2 = preProcess2(df2)

# Split data frame back into training and test sets
# (the test frame keeps its placeholder loss column).
df0, df1 = df2.iloc[:df0Length,:], df2.iloc[df0Length:,:]

Shape of training data frame: (188318, 131)
Shape of testing data frame:  (125546, 130)
Shape of combined data frame: (313864, 131)
Total number of categorical feature levels: 1176
Shape of data frame after binarization: (313864, 1191)


In [4]:
'''
Save preprocessed data frames
'''
# Both file names share a suffix recording the preprocessing option used.
if preProc==1:
    # Round before converting: 1000*prob is a floating-point product and may
    # evaluate to slightly less than the intended integer, which int() alone
    # would truncate to the wrong label.
    suffix = 'prep1_prob{0:03d}'.format(int(round(1000*prob)))
else:
    suffix = 'prep2'
train_fname = 'data/train_{0}.pkl'.format(suffix)
test_fname  = 'data/test_{0}.pkl'.format(suffix)

# Saving stays best-effort (a failure is reported, not raised), but only
# ordinary errors are caught and the reason is shown instead of discarded.
for label, frame, fname in (('Training', df0, train_fname),
                            ('Testing',  df1, test_fname)):
    try:
        frame.to_pickle(fname)
    except Exception as err:
        print('Error saving {0} data frame to {1}: {2}'.format(label.lower(), fname, err))
    else:
        print('{0} data frame saved to {1}'.format(label, fname))

Training data frame saved to data/train_prep1_prob010.pkl
Testing data frame saved to data/test_prep1_prob010.pkl
