In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.stats.stats import pearsonr

from sklearn.preprocessing import LabelEncoder

from time import time
from collections import Counter
import cPickle as pickle

In [17]:
# Input CSV to preprocess; activate a different line to process another split.
PATH_CSV = '../data/train.csv'
# PATH_CSV = '../data/train_med.csv'
# PATH_CSV = '../data/test.csv'

# The file stem (e.g. 'train') selects the split-specific branches further down.
csv_filename = os.path.basename(PATH_CSV)
mode = os.path.splitext(csv_filename)[0]
print(mode)

train


In [18]:
%%time
dateparser = lambda x: pd.to_datetime(x, format='%d%b%y:%H:%M:%S', coerce=True)

date_cols = ['VAR_0073', 'VAR_0075', 'VAR_0204', 'VAR_0217',
             'VAR_0156', 'VAR_0157', 'VAR_0158', 'VAR_0159', 
             'VAR_0166', 'VAR_0167', 'VAR_0168', 'VAR_0169', 
             'VAR_0176', 'VAR_0177', 'VAR_0178', 'VAR_0179']

df = pd.read_csv(PATH_CSV, parse_dates=date_cols, date_parser=dateparser,
                      index_col='ID', engine='c')

print df.shape

(145231, 1933)
CPU times: user 48.4 s, sys: 10.8 s, total: 59.2 s
Wall time: 5min 28s


In [19]:
# Drop useless rows: in the training split, rows with no VAR_0008 value are
# discarded; every other split keeps all of its rows.
if mode == 'train':
    df.dropna(axis=0, subset=['VAR_0008'], inplace=True)
print(df.shape)

(145175, 1933)


In [20]:
# Drop useless columns
# A column is "useless" when Series.unique() reports exactly one distinct value.
# The set of such columns is cached on disk: if the cache file exists it is
# reused as-is, otherwise it is computed from the current frame and saved.
useless_col_path = '../saved/useless_cols.p'
if os.path.exists(useless_col_path):
    # NOTE: unpickling can execute arbitrary code -- only load cache files
    # that this project wrote itself.
    with open(useless_col_path, 'rb') as cache_file:
        useless_cols = pickle.load(cache_file)
else:
    useless_cols = set()
    for col in df.columns:
        if len(df[col].unique()) == 1:
            useless_cols.add(col)

    # Write through a context manager so the handle is flushed and closed,
    # and reuse the same path variable that the existence check reads.
    with open(useless_col_path, 'wb') as cache_file:
        pickle.dump(useless_cols, cache_file)

df.drop(useless_cols, axis=1, inplace=True)
print(df.shape)

(145175, 1889)


In [21]:
# Columns set aside for now; they may get dedicated handling later.
handle_later_cols = [
    'VAR_0200',  # city
    'VAR_0404',  # another occupation?
    'VAR_0493',  # occupation
    'VAR_0205',  # continuous, but most rows are NaN
    'VAR_0214',  # social sec digits or phone #? most rows are NaN
]
df.drop(labels=handle_later_cols, axis=1, inplace=True)
print(df.shape)


(145175, 1884)


# Transform date columns

In [22]:
%%time
for col in date_cols:
    dti = pd.DatetimeIndex(df[col])
    df[col + '-year'] = dti.year
    df[col + '-month'] = dti.month
    df[col + '-dom'] = dti.day
    df[col + '-dow'] = dti.dayofweek
    df[col + '-doy'] = dti.dayofyear
    
    if any(df[col].isnull()):
        df[col + '-null'] = df[col].isnull()
        df[col + '-year'].fillna(0, inplace=True)
        df[col + '-month'].fillna(0, inplace=True)
        df[col + '-dom'].fillna(0, inplace=True)
        df[col + '-dow'].fillna(0, inplace=True)
        df[col + '-doy'].fillna(0, inplace=True)
df.drop(date_cols, axis=1, inplace=True)

print df.shape

(145175, 1961)
CPU times: user 1.08 s, sys: 907 ms, total: 1.99 s
Wall time: 2.45 s


# One hot encoding the text columns

In [23]:
# Classify each column by the Python type of its first non-null value:
# str -> str_cols (one-hot encoded below), anything else -> num_cols.
str_cols = []
num_cols = []
for col in df.columns:
    non_null = df[col].dropna()
    # An all-null column has no sample value to inspect; indexing it would
    # raise IndexError, so it is classified as numeric instead.
    if len(non_null) > 0 and isinstance(non_null.iloc[0], str):
        str_cols.append(col)
    else:
        num_cols.append(col)

In [24]:
%%time
# one hot encode

for col in str_cols:
#     enc = pd.get_dummies(pd.concat([train_pd[col], test_pd[col]]))
    enc = pd.get_dummies(df[col])
    enc.columns = ['-'.join([col, str(enc_col)]) for enc_col in enc.columns]
    
    df = pd.concat([df, enc[:len(df)].astype(bool)], axis=1)
    df.drop(col, axis=1, inplace=True)
#     test_pd = pd.concat([test_pd, enc[-len(test_pd):].astype(bool)], axis=1)
#     test_pd.drop(col, axis=1, inplace=True)

print df.shape

(145175, 2153)
CPU times: user 24 s, sys: 59.4 s, total: 1min 23s
Wall time: 3min 16s


## Fill NaN

In [25]:
# Replace every remaining missing value with 0, modifying df in place.
df.fillna(value=0, inplace=True)

## Convert boolean objects to int

In [26]:
# Cast every column whose dtype is the generic object dtype to int.
object_dtype = np.dtype('O')
obj_cols = [col for col, dtype in zip(df.columns, df.dtypes)
            if dtype == object_dtype]
for col in obj_cols:
    df[col] = df[col].astype(int)
print(df.shape)

(145175, 2153)


# Remove test columns not in training

In [27]:
%%time

if mode == 'test':
    train_cols = pickle.load(open('../saved/train_cols.p', 'rb'))
    for col in df.columns:
        if col not in train_cols:
            df.drop(col, axis=1, inplace=True)
    print train_cols.shape  
elif mode == 'train':
    pickle.dump(df.columns, open('../saved/train_cols.p', 'wb'))
    
print df.shape
          

(145175, 2153)
CPU times: user 3.33 ms, sys: 60 ms, total: 63.3 ms
Wall time: 1.37 s


# Insert empty column for any remaining columns present in train but not test

In [28]:
%%time

if mode == 'test':
    for col in set(train_cols) - set(df.columns) - {'target'}:
        df[col] = np.zeros(len(df))
print df.shape

(145175, 2153)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 62 µs


# Re-order columns to be the same order as train_cols

In [29]:
# For a test run, select the columns in the training order, minus 'target'.
if mode == 'test':
    feature_order = train_cols.drop('target')
    df = df[feature_order]
print(df.shape)

(145175, 2153)


# Save

In [None]:
# %%time
# Pickle
# df.to_pickle('../saved/train_preprocd_pd.p')

In [30]:
%%time
# HDF5
with pd.get_store('../saved/storage.h5') as store:
    store.put(mode, df, format='table')

CPU times: user 6.65 s, sys: 2min 34s, total: 2min 41s
Wall time: 7min 23s


# Load and save into other formats

In [1]:
%%time
import pandas as pd
import numpy as np
PATH_STORE = '../saved/storage.h5'
mode = 'test'
df = pd.read_hdf(PATH_STORE, mode)


CPU times: user 300 ms, sys: 2.05 s, total: 2.35 s
Wall time: 44.8 s


In [2]:
%%time
# train_pd.fillna(train_pd.mean(), inplace=True)
df.fillna(0, inplace=True)

CPU times: user 1.81 s, sys: 547 ms, total: 2.36 s
Wall time: 2.32 s


In [5]:
# Separate the labels from the features. Only the training split has a
# 'target' column; any other split gets an all-zero placeholder vector.
if mode != 'train':
    labels = np.zeros(len(df))
else:
    # pop() returns the column and removes it from df in one step.
    labels = df.pop('target')

In [7]:
%%time
PATH_SAVE_LIBSVM = '/tmp/%s.libsvm' % mode

from sklearn.datasets import dump_svmlight_file, load_svmlight_file
# from svmlight_loader import dump_svmlight_file, load_svmlight_file

dump_svmlight_file(X=df, y=labels,
                  f=PATH_SAVE_LIBSVM)


CPU times: user 23min 27s, sys: 53 s, total: 24min 20s
Wall time: 26min 40s


# Scratchspace


## Processing Numerical Columns

Some numerical columns contain values that obviously represent some sort of categorical flag (e.g. 99998). We need to detect which columns contain these flag values, determine what the values are, and finally move those values out into separate one-hot encoded columns.

### Detection

Use a `Counter` to find values that occur many times (say, at least n/100 times, where n is the number of rows). Then check that the most significant digit is 9, since that appears to be how SpringLeaf has encoded these flags in the data.

