In [1]:
# for text patterns
# import re

# for math stuff
import numpy as np
# for handling the dataset
import pandas as pd
# for data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# for imputing missing values
from sklearn.preprocessing import Imputer

# model used for classification
from sklearn.ensemble import RandomForestClassifier
# metric used to measure the performance of the classifier
from kaggler.metrics.regression import gini

# for reproducibility: seed NumPy's global RNG, which the random imputation
# cells further down draw from via np.random.randn
np.random.seed(0)

# seaborn theme: "white" style with the "talk" plotting context
sns.set(style="white", context="talk")
# IPython magic: render matplotlib figures inline in the cell output
%matplotlib inline

In [2]:
# Load both CSV files; the value -1 is parsed as missing (NaN) in each of them.
train, test = (
    pd.read_csv(csv_path, na_values=-1)
    for csv_path in ('train.csv', 'test.csv')
)

In [3]:
# Preview the first rows of the training frame (id, target and the ps_* columns).
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2.0,5,1.0,0.0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1.0,7,0.0,0.0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4.0,9,1.0,0.0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1.0,2,0.0,0.0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2.0,0,1.0,0.0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [4]:
# Preview the last rows of the test frame; it has an id column but no target column.
test.tail()

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
892811,1488022,0,1.0,6,0.0,0.0,0,1,0,0,...,4,2,3,4,0,1,0,0,1,0
892812,1488023,5,3.0,5,1.0,0.0,0,0,1,0,...,6,2,2,11,0,0,1,1,0,0
892813,1488024,0,1.0,5,0.0,0.0,1,0,0,0,...,5,2,2,11,0,1,1,0,0,0
892814,1488025,6,1.0,5,1.0,0.0,0,0,0,1,...,1,1,2,7,1,1,0,0,0,0
892815,1488026,7,1.0,4,1.0,0.0,0,0,0,1,...,5,2,2,7,0,1,1,1,0,0


In [5]:
# List each column's dtype and non-null count, which shows which columns have missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595212 entries, 0 to 595211
Data columns (total 59 columns):
id                595212 non-null int64
target            595212 non-null int64
ps_ind_01         595212 non-null int64
ps_ind_02_cat     594996 non-null float64
ps_ind_03         595212 non-null int64
ps_ind_04_cat     595129 non-null float64
ps_ind_05_cat     589403 non-null float64
ps_ind_06_bin     595212 non-null int64
ps_ind_07_bin     595212 non-null int64
ps_ind_08_bin     595212 non-null int64
ps_ind_09_bin     595212 non-null int64
ps_ind_10_bin     595212 non-null int64
ps_ind_11_bin     595212 non-null int64
ps_ind_12_bin     595212 non-null int64
ps_ind_13_bin     595212 non-null int64
ps_ind_14         595212 non-null int64
ps_ind_15         595212 non-null int64
ps_ind_16_bin     595212 non-null int64
ps_ind_17_bin     595212 non-null int64
ps_ind_18_bin     595212 non-null int64
ps_reg_01         595212 non-null float64
ps_reg_02         595212 non-null float64

In [7]:
# Shared column list: every column of test (including id), i.e. train's columns without target.
features = test.columns

In [8]:
# Stack the train features on top of the test rows so the cleaning below is applied to both.
# NOTE(review): concat keeps each frame's own row labels (no ignore_index), so index
# labels repeat between the two halves; only row position tells train from test.
X = pd.concat([train[features], test])

In [9]:
# Training labels, kept separately from the combined feature frame X.
y = train.target

In [10]:
# Drop the references to the raw frames (presumably to free memory now that X and y exist).
# NOTE(review): any later cell that still uses `train` or `test` will fail on None.
train, test = None, None

In [11]:
# First rows of the combined frame: these come from the training data.
X.head()

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,2,2.0,5,1.0,0.0,0,1,0,0,...,9,1,5,8,0,1,1,0,0,1
1,9,1,1.0,7,0.0,0.0,0,0,1,0,...,3,1,1,9,0,1,1,0,1,0
2,13,5,4.0,9,1.0,0.0,0,0,1,0,...,4,2,7,7,0,1,1,0,1,0
3,16,0,1.0,2,0.0,0.0,1,0,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,2.0,0,1.0,0.0,1,0,0,0,...,3,1,1,3,0,0,0,1,1,0


In [12]:
# Last rows of the combined frame: these come from the test data and keep its row labels.
X.tail()

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
892811,1488022,0,1.0,6,0.0,0.0,0,1,0,0,...,4,2,3,4,0,1,0,0,1,0
892812,1488023,5,3.0,5,1.0,0.0,0,0,1,0,...,6,2,2,11,0,0,1,1,0,0
892813,1488024,0,1.0,5,0.0,0.0,1,0,0,0,...,5,2,2,11,0,1,1,0,0,0
892814,1488025,6,1.0,5,1.0,0.0,0,0,0,1,...,1,1,2,7,1,1,0,0,0,0
892815,1488026,7,1.0,4,1.0,0.0,0,0,0,1,...,5,2,2,7,0,1,1,1,0,0


In [13]:
# Remove the id column from the feature frame.
# NOTE: not idempotent; running this cell twice raises KeyError because the column is gone.
del X['id']

Reference: https://www.kaggle.com/bertcarremans/data-preparation-exploration

In [20]:
# Percentage of missing values per column of X; only columns with at least one are listed.
# NOTE(review): this cell's execution count (20) is higher than its neighbours', so the
# empty output below was presumably produced after the later imputation/fillna cells.
null_counts = X.isnull().sum()
s = null_counts / len(X) * 100
s[s > 0]

Series([], dtype: float64)

In [15]:
# Remove two of the car categorical columns from X, modifying the frame in place.
cols_to_drop = ['ps_car_03_cat', 'ps_car_05_cat']
X.drop(cols_to_drop, axis=1, inplace=True)

In [16]:
# Imputing with the median: fill the missing ps_car_12 values with the column median.
# NOTE(review): `Imputer` and its `axis` argument belong to older scikit-learn
# releases; newer releases provide sklearn.impute.SimpleImputer instead.
median_imp = Imputer(strategy='median', axis=0)
X['ps_car_12'] = median_imp.fit_transform(X[['ps_car_12']]).ravel()

# Imputing with the mode: fill the missing ps_car_11 values with the most frequent value.
# Fix: this line used to call median_imp, so the mode imputer was created but never
# applied and ps_car_11 was filled with the median instead of the mode.
mode_imp = Imputer(strategy='most_frequent', axis=0)
X['ps_car_11'] = mode_imp.fit_transform(X[['ps_car_11']]).ravel()

In [17]:
# Imputing with random data: missing ps_car_14 values are replaced by draws from a
# normal distribution that has the mean and standard deviation of the observed values.
feat = 'ps_car_14'
observed = X[~X[feat].isnull()][feat]
quantity = len(X[X[feat].isnull()])
mu = observed.mean()
std = observed.std()

# shift and scale standard-normal noise to the observed mean and std
values = mu + std * np.random.randn(quantity)

X.loc[X[feat].isnull(), feat] = values

In [18]:
# Imputing with random data: missing ps_reg_03 values are replaced by draws from a
# normal distribution that has the mean and standard deviation of the observed values.
feat = 'ps_reg_03'
observed = X[~X[feat].isnull()][feat]
quantity = len(X[X[feat].isnull()])
mu = observed.mean()
std = observed.std()

# shift and scale standard-normal noise to the observed mean and std
values = mu + std * np.random.randn(quantity)

X.loc[X[feat].isnull(), feat] = values

In [19]:
# Every NaN still left in X is replaced by the sentinel value -1.
# Disabled draft of the categorical feature list (not used by the code below):
# categorical_features = ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
#                        'ps_car_02_cat', 'ps_car_07_cat', 'ps_car_09_cat']
X = X.fillna(value=-1)

In [None]:
# Tally how many rows fall into each ps_car_11_cat category: {category: row count}.
# value_counts() replaces the hand-written Python loop over every row, and no longer
# uses `_` as a loop variable (in IPython `_` normally holds the last cell output).
# dropna=False so rows with a missing category are not left out of the tally.
rec = X.ps_car_11_cat.value_counts(dropna=False).to_dict()

In [36]:
# Convert the per-category counts stored in `rec` into shares of the total row count.
# Fix: the loop used enumerate() only to throw the position away into `_` (overwriting
# IPython's last-output variable) while `idx` was really the dict key; iterate the
# keys directly instead.
# NOTE: running this cell again divides the already-normalised values a second time.
total = len(X.ps_car_11_cat.values)
for category in rec:
    rec[category] = float(rec[category]) / total

In [41]:
# Frequency encoding: each row receives the value stored in `rec` for its category.
category_share = X.ps_car_11_cat.map(lambda category: rec[category])
X['ps_car_11_cat_super'] = category_share

In [43]:
# Show the original category codes next to their new frequency-encoded column.
X[['ps_car_11_cat', 'ps_car_11_cat_super']].head()

Unnamed: 0,ps_car_11_cat,ps_car_11_cat_super
0,12,0.012316
1,19,0.008424
2,60,0.013402
3,104,0.143135
4,82,0.017581


In [46]:
# Remove the raw category code column; ps_car_11_cat_super stays in X.
# NOTE: not idempotent; a second run raises KeyError.
del X['ps_car_11_cat']

In [None]:
# NOTE(review): this raises ZeroDivisionError; presumably a deliberate stop so that
# "Run All" does not continue into the cells below. Confirm, and prefer an explicit marker.
1/0

In [None]:
# Build a metadata table with one row per column of `train`: its role, its
# measurement level, whether it is kept, and its dtype.
# NOTE(review): an earlier cell rebinds `train` to None, so the training frame must
# be loaded again before this cell can run.
data = []
for f in train.columns:
    # Defining the role
    if f == 'target':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'

    # Defining the data type
    dtype = train[f].dtype

    # Defining the level
    # Fix: the dtype is classified through dtype.kind, so every float/integer width
    # is recognised, and the final else always assigns `level`. Before, a dtype that
    # matched neither `float` nor `int` left `level` unset (NameError on the first
    # column) or silently carried over from the previous column.
    if 'bin' in f or f == 'target':
        level = 'binary'
    elif 'cat' in f or f == 'id':
        level = 'nominal'
    elif dtype.kind == 'f':
        level = 'interval'
    elif dtype.kind in ('i', 'u'):
        level = 'ordinal'
    else:
        level = 'unknown'

    # Keep every variable except the id
    keep = f != 'id'

    # Creating a Dict that contains all the metadata for the variable
    f_dict = {
        'varname': f,
        'role': role,
        'level': level,
        'keep': keep,
        'dtype': dtype
    }
    data.append(f_dict)

# One row per variable, indexed by the variable name
meta = pd.DataFrame(data, columns=['varname', 'role', 'level', 'keep', 'dtype'])
meta.set_index('varname', inplace=True)

In [None]:
# Display the full metadata table.
meta

In [None]:
# Names of the variables whose level is 'ordinal' and that are flagged to be kept.
meta[(meta.level == 'ordinal') & (meta.keep)].index

In [None]:
# Number of variables for each (role, level) combination, as a flat table.
pd.DataFrame({'count' : meta.groupby(['role', 'level'])['role'].size()}).reset_index()

In [None]:
# Percentage of missing values per column of train; only columns with at least one are listed.
# NOTE(review): an earlier cell rebinds `train` to None; reload it before running this.
null_counts = train.isnull().sum()
s = null_counts / len(train) * 100
s[s > 0]

In [None]:
# Remove the two columns named in vars_to_drop from train, modifying the frame in place.
vars_to_drop = ['ps_car_03_cat', 'ps_car_05_cat']
train.drop(vars_to_drop, inplace=True, axis=1)

In [None]:
# Recompute the percentage of missing values per column of train and list the non-zero ones.
null_counts = train.isnull().sum()
s = null_counts / len(train) * 100
s[s > 0]

In [None]:
# NOTE(review): `trainset` is not defined in any cell shown above; presumably it is
# the training frame under another name. Confirm where it is created.
trainset.info()

In [None]:
# Bar chart of the target classes; each bar is labelled with its share of all rows.
total = len(trainset)

ax = sns.countplot(data=trainset, x='target')
for bar in ax.patches:
    bar_height = bar.get_height()
    # centre the label horizontally on the bar and place it just above the top
    label_x = bar.get_x() + bar.get_width() / 2.
    label_text = '{:1.3f}'.format(bar_height / total)
    ax.text(label_x, bar_height + 3, label_text, ha="center")

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Replace every NaN in trainset with -1, modifying the frame in place.
trainset.fillna(-1, inplace=True)

In [None]:
# Random forest settings, collected in one place and passed as keyword arguments.
rf_params = dict(
    n_estimators=150,
    max_depth=8,
    min_samples_leaf=4,
    max_features=0.2,
    n_jobs=-1,
    random_state=0,
)
clf = RandomForestClassifier(**rf_params)

In [None]:
# Fit the forest on every column except the label and the row id.
clf.fit(trainset.drop(['target', 'id'],axis=1), trainset.target)

In [None]:
# Column names of the frame the model was fitted on, in the same order (target and id removed).
features = trainset.drop(['target', 'id'],axis=1).columns.values

In [None]:
# Map each feature name to the importance the fitted forest assigned to it.
# dict(zip(...)) replaces the manual loop, whose `_` loop variable overwrote
# IPython's last-output variable.
feat_imp = dict(zip(features, clf.feature_importances_))

In [None]:
# Print the features from most to least important (ties ordered by name, descending).
# Fix: dict.iteritems(), the tuple-unpacking lambda and the print statement only
# exist on Python 2; this form produces the same output on Python 2 and Python 3.
for key, value in sorted(feat_imp.items(), key=lambda item: (item[1], item[0]), reverse=True):
    print("%s: %s" % (key, value))

In [None]:
# Select the non-missing values of ps_car_11_cat from trainset.
feat = 'ps_car_11_cat'
has_value = trainset[feat].notnull()
x = trainset.loc[has_value, feat]

In [None]:
# Number of rows per category of the selected feature.
# Fix: the values are passed as the keyword `x`; newer seaborn releases no longer
# accept the data vector as the first positional argument.
sns.countplot(x=x)

In [None]:
# Box plot of ps_car_13 for each ps_ind_05_cat category.
sns.boxplot(data=trainset, x='ps_ind_05_cat', y='ps_car_13')

In [None]:
# Number of rows in each ps_ind_05_cat category.
sns.countplot(data=trainset, x='ps_ind_05_cat')

In [None]:
# Box plot of ps_car_13 per ps_ind_05_cat category, with a separate box for each target class.
sns.boxplot(data=trainset, x='ps_ind_05_cat', y='ps_car_13', hue='target')

In [None]:
# Box plot of ps_reg_03 per ps_ind_05_cat category, with a separate box for each target class.
sns.boxplot(data=trainset, x='ps_ind_05_cat', y='ps_reg_03', hue='target')

In [None]:
# Joint plot of ps_car_13 against ps_reg_03.
sns.jointplot(x="ps_car_13", y="ps_reg_03", data=trainset)

In [None]:
# Pairwise plots of ps_car_13 and ps_reg_03, coloured by target class.
sns.pairplot(data=trainset[['ps_car_13', 'ps_reg_03', 'target']], hue='target', vars=['ps_car_13', 'ps_reg_03'])

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Split the rows by class: l holds the rows with target == 1, l2 those with target == 0.
l  = trainset[trainset.target == 1]
l2 = trainset[trainset.target == 0]

In [None]:
# Build a balanced plotting sample: 1000 random rows per class, target == 0 rows first.
# NOTE(review): this rebinds `X`, which an earlier cell used for the full feature matrix.
plot_cols = ['ps_car_13', 'ps_ind_14', 'ps_reg_02', 'target']
positives = l[plot_cols].sample(1000)   # drawn first, as before, so the RNG sequence is unchanged
negatives = l2[plot_cols].sample(1000)
X = pd.concat([negatives, positives])

In [None]:
# 3D scatter of the sampled rows: ps_car_13, ps_ind_14 and ps_reg_02 on the three
# axes, with each point coloured by its target value.
fig = plt.figure()
# a single subplot (1x1 grid, first slot) using the 3d projection
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X['ps_car_13'], X['ps_ind_14'], X['ps_reg_02'], c=X['target'])
plt.show()

In [None]:
# Predict a class label for every row of the test frame.
# Fix: the forest was fitted without the `id` column, but `testset` still contains it
# at this point (it is only deleted in a later cell), so it is dropped for the call;
# errors='ignore' keeps this working when `id` has already been removed.
# NOTE(review): predict() returns hard class labels; confirm the submission does not
# need predict_proba() scores instead.
pred = clf.predict(testset.drop(['id'], axis=1, errors='ignore'))

In [None]:
# making the submission
# Keep the ids in their own variable; the id column is deleted from testset in the next cell.
_id = testset.id

In [None]:
# Remove the id column from the test frame.
# NOTE: not idempotent; a second run raises KeyError.
del testset['id']

In [None]:
# pred = clf.predict(X_test)

# Pair every saved id with its prediction and write the table to submission.csv
# without the row index.
submission_columns = {'id': _id, 'target': pred}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)

In [None]:
# NOTE(review): `in` on a pandas Series tests the index labels, not the values, so this
# asks whether 892816 is a row label of _id. Confirm that is the intended check.
892816 in _id

In [None]:
# Print the number of test rows and the number of saved ids for comparison.
# Fix: the print statement only exists on Python 2; print(...) with a single argument
# prints the same text on Python 2 and Python 3.
print(len(testset))
print(len(_id))