In [1]:
# coding: utf-8

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pdb

# Import Seaborn for plotting
# and ignore all warnings
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns

sns.set(style="white", color_codes=True)

# Load data as Pandas dataframes
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

train.head()
# Press shift+enter to execute this cell

# Targets: customer satisfaction
# 0 = happy, 1 = unhappy
# Over 96% are happy 
df = pd.DataFrame(train.TARGET.value_counts())
df['Percentage'] = 100*df['TARGET']/train.shape[0]
df

Unnamed: 0,TARGET,Percentage
0,73012,96.043147
1,3008,3.956853


In [2]:
########################################
# Explore var3
########################################

# var3: nationality of customer

# Ten most common values
train.var3.value_counts()[:10]

# -999999 = unknown nationality
train.loc[train.var3==-999999].shape

# Replace -999999 with most common value (nationality = 2) 
train = train.replace(-999999, 2)
train.loc[train.var3==-999999].shape

# Create new feature: number of zeros in a row
X = train.iloc[:, :-1]
y = train.TARGET
X['n0'] = (X==0).sum(axis=1)
train['n0'] = X['n0']

In [6]:
########################################
# Explore num_var4
########################################

# num_var4: the number of products that a customer has purchased
train.num_var4.hist(bins=100)
plt.xlabel('Number of bank products')
plt.ylabel('Number of customers in train')
plt.title('Most customers have 1 product with the bank')
plt.show()

# Customer satisfaction versus number of products purchased
sns.FacetGrid(train, hue="TARGET", size=6).map(plt.hist, "num_var4").add_legend()
plt.title('Unhappy customers purchased fewer products')
plt.show()

In [15]:
########################################
# Explore num_var38
########################################

print train.var38.describe()

# var38 for unhappy customers
print train.loc[train['TARGET']==1, 'var38'].describe()

# Distribution of var38 is not Gaussian
train.var38.hist(bins=1000);
plt.show()

# Show distribution in log-scale to clarify
train.var38.map(np.log).hist(bins=1000);
plt.show()

# Identify the anomaly: 
# a spike between values 11 and 12 of the distribution
train.var38.map(np.log).mode()

# Most common values for var38
print train.var38.value_counts()

# Most common value is close to the mean of the other values
print train.var38[train['var38'] != 117310.979016494].mean()

# Excluding the most common value causes the
# distribution to become normal (in log-scale)
print train.loc[~np.isclose(train.var38, 117310.979016), 'var38'].value_counts()
train.loc[~np.isclose(train.var38, 117310.979016), 'var38'].map(np.log).hist(bins=100);
plt.show()

# Split var38
# var38mc == 1 when var38 has the most common value and 0 otherwise
# logvar38 = {log(var38) if var38mc == 0; 0 otherwise}
train['var38mc'] = np.isclose(train.var38, 117310.979016)
train['logvar38'] = train.loc[~train['var38mc'], 'var38'].map(np.log)
train.loc[train['var38mc'], 'logvar38'] = 0

# Check for NaN
print('Number of nan in var38mc', train['var38mc'].isnull().sum())
print('Number of nan in logvar38',train['logvar38'].isnull().sum())


count    7.602000e+04
mean     1.172358e+05
std      1.826646e+05
min      5.163750e+03
25%      6.787061e+04
50%      1.064092e+05
75%      1.187563e+05
max      2.203474e+07
Name: var38, dtype: float64
count    3.008000e+03
mean     9.967828e+04
std      1.063098e+05
min      1.113663e+04
25%      5.716094e+04
50%      8.621997e+04
75%      1.173110e+05
max      3.988595e+06
Name: var38, dtype: float64
117310.979016    14868
451931.220000       16
463625.160000       12
288997.440000       11
104563.800000       11
236690.340000        8
104644.410000        7
125722.440000        7
329603.970000        7
128318.520000        7
67088.310000         7
100466.730000        6
105260.880000        6
163432.470000        6
168733.620000        6
70813.800000         6
185385.690000        6
97639.560000         6
227397.720000        5
229351.650000        5
185784.720000        5
93037.680000         5
171932.700000        5
121603.020000        5
71302.530000         5
208961.790000    

In [17]:
########################################
# Explore num_var15
########################################

# var15 = customer age
# XGBoost gave high importance to var15
print train['var15'].describe()
train['var15'].hist(bins=100);

sns.FacetGrid(train, hue="TARGET", size=6).map(sns.kdeplot, "var15").add_legend()
plt.title('Unhappy customers are slightly older');

# var15 versus var38
sns.FacetGrid(train, hue="TARGET", size=10).map(plt.scatter, "var38", "var15").add_legend();
sns.FacetGrid(train, hue="TARGET", size=10).map(plt.scatter, "logvar38", "var15").add_legend()
plt.ylim([0,120]);

# Exclude most common value of var38 
sns.FacetGrid(train[~train.var38mc], hue="TARGET", size=10).map(plt.scatter, "logvar38", "var15").add_legend()
plt.ylim([0,120]);

# Distribution of the age when var38 has its most common value
sns.FacetGrid(train[train.var38mc], hue="TARGET", size=6).map(sns.kdeplot, "var15").add_legend();

sns.FacetGrid(train, hue="TARGET", size=6).map(sns.kdeplot, "n0").add_legend()
plt.title('Unhappy customers have a lot of features that are zero');
plt.show()

count    76020.000000
mean        33.212865
std         12.956486
min          5.000000
25%         23.000000
50%         28.000000
75%         40.000000
max        105.000000
Name: var15, dtype: float64


In [22]:
########################################
# Feature selection
########################################

from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif,chi2
from sklearn.preprocessing import Binarizer, scale

p = 3

X_bin = Binarizer().fit_transform(scale(X))
selectChi2 = SelectPercentile(chi2, percentile=p).fit(X_bin, y)
selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, y)

chi2_selected = selectChi2.get_support()
chi2_selected_features = [ f for i,f in enumerate(X.columns) if chi2_selected[i]]
print('Chi2 selected {} features {}.\n'.format(chi2_selected.sum(),
   chi2_selected_features))
f_classif_selected = selectF_classif.get_support()
f_classif_selected_features = [ f for i,f in enumerate(X.columns) if f_classif_selected[i]]
print('F_classif selected {} features {}.\n'.format(f_classif_selected.sum(),
   f_classif_selected_features))
selected = chi2_selected & f_classif_selected
print('Chi2 & F_classif selected {} features.\n'.format(selected.sum()))
features = [ f for f,s in zip(X.columns, selected) if s]
print 'Randomly selected features:\n' 
print features
print '\n' 

# Make a dataframe with the selected features and their targets
X_sel = train[features + ['TARGET']]

Chi2 selected 12 features ['var15', 'ind_var5', 'ind_var8_0', 'ind_var30', 'num_var5', 'num_var8_0', 'num_var30_0', 'num_var30', 'num_var42', 'saldo_var30', 'var36', 'num_meses_var5_ult3'].

F_classif selected 12 features ['var15', 'ind_var5', 'ind_var8_0', 'ind_var30', 'num_var4', 'num_var5', 'num_var30', 'num_var35', 'num_var42', 'var36', 'num_meses_var5_ult3', 'n0'].

Chi2 & F_classif selected 9 features.

Randomly selected features:

['var15', 'ind_var5', 'ind_var8_0', 'ind_var30', 'num_var5', 'num_var30', 'num_var42', 'var36', 'num_meses_var5_ult3']




In [23]:
########################################
# Explore var36
########################################

# var36
X_sel['var36'].value_counts()

# var36 concetrates around 99 and {0,1,2,3}
sns.FacetGrid(train, hue="TARGET", size=6)    .map(sns.kdeplot, "var36").add_legend()
plt.title('If var36 is 0,1,2 or 3 => less unhappy customers');

# Density of unhappy custormers is lower when var36 is not 99
# var36 versus logvar38
sns.FacetGrid(train[~train.var38mc], hue="TARGET", size=10).map(plt.scatter, "var36", "logvar38").add_legend();

# Plot the above separately
sns.FacetGrid(train[(~train.var38mc) & (train.var36 < 4)], hue="TARGET", size=10).map(plt.scatter, "var36", "logvar38").add_legend()
plt.title('If var36==0, only happy customers');

# var36 == 99
sns.FacetGrid(train[(~train.var38mc) & (train.var36 ==99)], hue="TARGET", size=6)    .map(sns.kdeplot, "logvar38").add_legend();

# num_var5
train.num_var5.value_counts()
train[train.TARGET==1].num_var5.value_counts()
train[train.TARGET==0].num_var5.value_counts()

sns.FacetGrid(train, hue="TARGET", size=6).map(plt.hist, "num_var5").add_legend();
sns.FacetGrid(train, hue="TARGET", size=6).map(sns.kdeplot, "num_var5").add_legend();

plt.show()