### Introduction

- Visual inspection of your data
- Defining the metadata
- Descriptive statistics
- Handling imbalanced classes
- Data quality checks
- Exploratory data visualization
- Feature engineering
- Feature selection
- Feature scaling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

import warnings
# NOTE(review): this suppresses every warning raised from here on, including
# deprecation warnings from the libraries imported above.
warnings.filterwarnings('ignore')

# Let pandas display up to 100 columns and 100 rows before truncating.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [None]:
# Read the training and test sets from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
# (rows, columns) of the test set.
test.shape

In [None]:
# Every column whose name contains the substring 'cat'.
cat_cols = list(filter(lambda name: 'cat' in name, train.columns))

In [None]:
# Display the list of 'cat' column names.
cat_cols

In [None]:
# Frequency of each distinct value in the first 'cat' column.
train[cat_cols[0]].value_counts()

In [None]:
# Print the number of distinct values of each 'cat' column.
for col, n_unique in train[cat_cols].nunique().items():
    print(col, n_unique)

In [None]:
# (rows, columns) of the training set.
train.shape

In [None]:
# (rows, columns) of the test set; same call as the earlier test.shape cell.
test.shape

In [None]:
# drop_duplicates() returns a new frame rather than modifying `train`, so the
# result has to be assigned back for the duplicates to actually be removed
# before the shape is reported.
train = train.drop_duplicates()
train.shape

In [None]:
# Column dtypes and non-null counts of the training set.
train.info()

### Metadata

In [None]:
# Build one metadata record per column of `train`: its role, its measurement
# level, whether it is kept for modelling, and its dtype. The records are
# collected in `meta`, indexed by variable name.
data = []
for f in train.columns:
    # Role: the label, the row identifier, or a model input.
    if f == 'target':
        role = 'target'
    elif f == 'id':
        role = 'id'
    else:
        role = 'input'

    dtype = train[f].dtype

    # Level: name-based rules take precedence, then the dtype decides.
    if 'bin' in f or f == 'target':
        level = 'binary'
    elif 'cat' in f or f == 'id':
        level = 'nominal'
    elif pd.api.types.is_float_dtype(dtype):
        level = 'interval'
    elif pd.api.types.is_integer_dtype(dtype):
        level = 'ordinal'
    else:
        # Without this branch `level` kept the value assigned for the previous
        # column (or was undefined for the first column).
        level = None

    # Everything except the identifier is kept.
    keep = f != 'id'

    f_dict = {
        'varname': f,
        'role': role,
        'level': level,
        'keep': keep,
        'dtype': dtype
    }
    data.append(f_dict)

meta = pd.DataFrame(data, columns=['varname', 'role', 'level', 'keep', 'dtype'])
meta.set_index('varname', inplace=True)

In [None]:
# Display the metadata table (one row per variable).
meta

In [None]:
# Names of the variables marked as nominal that are still flagged keep=True.
meta[(meta.level == 'nominal') & (meta.keep)].index

In [None]:
# Number of variables for each (role, level) combination.
meta.groupby(['role', 'level']).size().reset_index(name='count')

### Interval variables

In [None]:
# Every column whose name contains the substring 'cat'.
categorical_feats = list(filter(lambda name: 'cat' in name, train.columns))

In [None]:
# Display the list of 'cat' column names.
categorical_feats

In [None]:
# Summary statistics of the kept interval-level variables.
is_kept_interval = (meta['level'] == 'interval') & meta['keep']
v = meta.loc[is_kept_interval].index
train[v].describe()

- Among the `reg` variables, only ps_reg_03 has missing values (encoded as -1)
- Among the `car` variables, ps_car_12 and ps_car_14 have missing values

In [None]:
# Summary statistics of the kept binary-level variables.
is_kept_binary = (meta['level'] == 'binary') & meta['keep']
v = meta.loc[is_kept_binary].index
train[v].describe()

## Handling imbalanced classes

- oversampling records with target = 1
- undersampling records with target = 0

In [None]:
# Undersample the target == 0 records so that target == 1 records make up
# `desired_apriori` of the resulting training set.
desired_apriori = 0.1

# Row labels of each class.
idx_0 = train[train.target == 0].index
idx_1 = train[train.target == 1].index

# Class sizes: the length of the index is enough, there is no need to
# materialise the selected rows just to count them.
nb_0 = len(idx_0)
nb_1 = len(idx_1)

# Fraction of target == 0 rows to keep so that
# nb_1 / (nb_1 + kept_0) == desired_apriori.
undersampling_rate = ((1 - desired_apriori) * nb_1)/(nb_0 * desired_apriori)
# Never ask for more rows than exist (rate > 1 means target == 1 already
# exceeds the desired share, so all target == 0 rows are kept).
undersampled_nb_0 = min(int(undersampling_rate * nb_0), nb_0)
print('Rate to undersample records with target = 0: {}'.format(undersampling_rate))
print('Number of records with target = 0 after undersampling: {}'.format(undersampled_nb_0))

# Random, seeded selection of the target == 0 rows to keep.
undersampled_idx = shuffle(idx_0, random_state=37, n_samples=undersampled_nb_0)

# Kept target == 0 rows first, followed by every target == 1 row.
idx_list = list(undersampled_idx) + list(idx_1)

train = train.loc[idx_list].reset_index(drop=True)

### Data Quality Checks

In [None]:
# Report, for every column, how many records carry the missing-value code -1.
vars_with_missing = []

for f in train.columns:
    missings = (train[f] == -1).sum()
    if missings == 0:
        continue

    vars_with_missing.append(f)
    missings_perc = missings / train.shape[0]
    print('Variable {} has {} records ({:.2%}) with missing values'.format(f, missings, missings_perc))

print('In total, there are {} variables with missing values'.format(len(vars_with_missing)))

In [None]:
# Remove two categorical variables from the training set and mark them as not
# kept in the metadata.
# NOTE(review): presumably dropped because of their share of missing values —
# confirm against the missing-value report.
vars_to_drop = ['ps_car_03_cat', 'ps_car_05_cat']
train.drop(columns=vars_to_drop, inplace=True)
meta.loc[vars_to_drop, 'keep'] = False

In [None]:
# Replace the missing-value code -1: by the column mean for three continuous
# variables and by the most frequent value for ps_car_11.
mean_imp = SimpleImputer(missing_values=-1, strategy='mean')
mode_imp = SimpleImputer(missing_values=-1, strategy='most_frequent')

# Each imputer is re-fitted on the single column it is applied to.
for _mean_col in ('ps_reg_03', 'ps_car_12', 'ps_car_14'):
    train[_mean_col] = mean_imp.fit_transform(train[[_mean_col]]).ravel()
train['ps_car_11'] = mode_imp.fit_transform(train[['ps_car_11']]).ravel()

In [None]:
# Display the mean-imputed ps_reg_03 values as a flat array.
# Note: this re-fits `mean_imp` on ps_reg_03.
mean_imp.fit_transform(train[['ps_reg_03']]).ravel()

In [None]:
# Display the mode-imputed ps_car_11 values as a flat array.
# Note: this re-fits `mode_imp` on ps_car_11.
mode_imp.fit_transform(train[['ps_car_11']]).ravel()

#### Checking the cardinality of the categorical variables

In [None]:
# Count the distinct values of every kept nominal variable and accumulate the
# total across variables.
is_kept_nominal = (meta['level'] == 'nominal') & meta['keep']
v = meta.loc[is_kept_nominal].index
# NOTE(review): `sum` shadows the built-in of the same name for the rest of
# the session; the name is kept because a later cell displays it.
sum = 0

for f in v:
    dist_values = train[f].nunique()
    sum = sum + dist_values
    print('Variable {} has {} distinct values'.format(f, dist_values))

In [None]:
# Display the notebook variable `sum` (this name also belongs to a Python
# built-in, so the output depends on whether it has been reassigned).
sum

In [None]:
def add_noise(series, noise_level, random_state=None):
    """Multiply every value by ``1 + noise_level * z`` with ``z ~ N(0, 1)``.

    Parameters
    ----------
    series : pd.Series
        Values to perturb.
    noise_level : float
        Scale of the multiplicative Gaussian noise; 0 leaves values unchanged.
    random_state : None, int or np.random.RandomState, optional
        ``None`` (default) draws from NumPy's global random state, as the
        function always did. An int seeds a fresh generator; an existing
        ``RandomState`` is used as is.

    Returns
    -------
    pd.Series
        The perturbed series, same index as ``series``.
    """
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    return series * (1 + noise_level * rng.randn(len(series)))


def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0,
                  random_state=None):
    """Smoothed target (mean) encoding of a categorical variable.

    For every category the encoded value is a blend of the category's target
    mean and the overall target mean (the prior). The blend weight is a
    sigmoid of the category count, so categories with more observations rely
    more on their own mean.

    Parameters
    ----------
    trn_series : pd.Series
        Categorical feature of the training set.
    tst_series : pd.Series
        Same feature in the test set; must have the same name.
    target : pd.Series
        Target values, same length (and index) as ``trn_series``.
    min_samples_leaf : int
        Category count at which the blend weight equals 0.5.
    smoothing : float
        Controls how quickly the weight moves from the prior to the category
        mean as the count grows.
    noise_level : float
        Scale of the multiplicative Gaussian noise added to both outputs.
    random_state : None, int or np.random.RandomState, optional
        Seed or generator for the noise; ``None`` keeps the previous
        behaviour of using NumPy's global random state.

    Returns
    -------
    (pd.Series, pd.Series)
        Encoded training and test series, named ``<feature>_mean`` and
        carrying the index of their respective input. Categories without an
        encoding (e.g. unseen in training) receive the prior.

    Raises
    ------
    AssertionError
        If the lengths or the series names do not match.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)

    # Per-category target mean and number of observations.
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])

    # Sigmoid weight in (0, 1); kept under its own name so the `smoothing`
    # parameter is not overwritten.
    weight = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))

    # Overall target mean, used for shrinkage and for unseen categories.
    prior = target.mean()

    # Two-column lookup table: category value -> smoothed average.
    encoding = (prior * (1 - weight) + averages["mean"] * weight).rename('average').reset_index()

    def _encode(series):
        # Left-merge keeps the row order of `series`; the merge discards the
        # original index, so it is restored afterwards.
        encoded = pd.merge(
            series.to_frame(series.name),
            encoding,
            on=series.name,
            how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
        encoded.index = series.index
        return encoded

    ft_trn_series = _encode(trn_series)
    ft_tst_series = _encode(tst_series)

    # Use one generator for both outputs so a seed does not give the train
    # and test series identical noise draws.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)
    return add_noise(ft_trn_series, noise_level, rng), add_noise(ft_tst_series, noise_level, rng)

In [None]:
# Target-encode ps_car_11_cat, add the result as ps_car_11_cat_te and drop
# the original column from both the training and the test set.
train_encoded, test_encoded = target_encode(
    trn_series=train["ps_car_11_cat"],
    tst_series=test["ps_car_11_cat"],
    target=train.target,
    min_samples_leaf=100,
    smoothing=10,
    noise_level=0.01,
)

train['ps_car_11_cat_te'] = train_encoded
train.drop(columns='ps_car_11_cat', inplace=True)
# The original column no longer exists, so flag it as not kept.
meta.loc['ps_car_11_cat', 'keep'] = False
test['ps_car_11_cat_te'] = test_encoded
test.drop(columns='ps_car_11_cat', inplace=True)

In [None]:
# def add_noise(series, noise_level):
#     return series * (1 + noise_level * np.random.randn(len(series)))

# def target_encode(trn_series = None,
#                   tst_series = None,
#                   target = None,
#                   min_samples_leaf = 1,
#                   smoothing = 1,
#                   noise_level = 0):
    
    
    
#     assert len(trn_series) == len(target)
#     assert trn_series.name == tst_series.name
#     temp = pd.concat([trn_series, target], axis=1)
#     # Compute target mean
#     averages = temp.groupby(by=trn_series.name)[target.name].agg(['mean', 'count'])
    
#     # Compute smoothing
#     smoothing = 1 / (1 + np.exp(-(averages['count'] - min_samples_leaf) / smoothing))
    
#     # Apply average function to all target data
#     prior = target.mean()
    
#     #The bigger the count the less full_avg is taken into account
#     averages[target.name] = prior * (1 - smoothing) + averages['mean'] * smoothing
#     averages.drop(['mean', 'count'], axis=1, inplace=True)
    
#     # Apply averages to trn and tst series
#     ft_trn_series = pd.merge(
#         trn_series.to_frame(trn_series.name),
#         averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
#         on = trn_series.name,
#         how = 'left')['average'].rename(trn_series.name + '_mean').fillna(prior)
#     # pd.merge does not keep the index so restore it
#     ft_trn_series.index = trn_series.index
#     ft_tst_series = pd.merge(
#         tst_series.to_frame(tst_series.name),
#         averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
#         on = tst_series.name,
#         how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
#     # pd.merge does not keep the index so restore it
#     ft_tst_series.index = tst_series.index
#     return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [None]:
# train_encoded, test_encoded = target_encode(train['ps_car_11_cat'],
#                               test['ps_car_11_cat'],
#                               target = train.target,
#                               min_samples_leaf=100,
#                               smoothing=10,
#                               noise_level=0.01)

# train['ps_car_11_cat_te'] = train_encoded
# train.drop('ps_car_11_cat', axis=1, inplace=True)

# meta.loc['ps_car_11_cat','keep'] = False
# test['ps_car_11_cat_te'] = test_encoded
# test.drop('ps_car_11_cat', axis=1, inplace=True)

## EDA

### Exploratory Data Visualization

Categorical variables

In [None]:
# Names of the nominal variables that are still flagged keep=True.
is_kept_nominal = (meta['level'] == 'nominal') & meta['keep']
v = meta.loc[is_kept_nominal].index

In [None]:
# Display the selected variable names.
v

In [None]:
# NOTE(review): `f` is not assigned in this section; this shows whatever value
# an earlier `for f in ...` loop left behind, so the output depends on which
# cells ran last.
f

In [None]:
# Double seaborn's font scale for the plots that follow.
sns.set(font_scale = 2)

In [None]:
# For every variable in `v`, draw a bar chart of the mean of `target` per
# category, ordered from the highest to the lowest mean.
for f in v:
    # One figure per variable. The former extra plt.figure() call created an
    # additional empty figure on every iteration.
    fig, ax = plt.subplots(figsize=(20, 10))

    # Mean of `target` within each category of `f`, highest first.
    cat_perc = train[[f, 'target']].groupby([f], as_index=False).mean()
    cat_perc.sort_values(by='target', ascending=False, inplace=True)

    # `order` makes the bars follow the sorted categories.
    sns.barplot(data=cat_perc, x=f, y='target', order=cat_perc[f], ax=ax)
    ax.set_ylabel('% target')
    ax.set_xlabel(f)
    ax.tick_params(axis='both', which='major')
    plt.show()
    # Release the figure so figures do not accumulate across iterations.
    plt.close(fig)

In [None]:
# Display `cat_perc`; when the cells run in order this is the table left by
# the last iteration of the plotting loop.
cat_perc

In [None]:
# Pick a single categorical variable to inspect in the next cells.
f = 'ps_car_02_cat'

In [None]:
# Aggregate mean and count within each category of `f`.
cat_perc = train[[f, 'target']].groupby([f], as_index=False).agg(['mean', 'count'])
# cat_perc.sort_values(by='target', ascending=False, inplace=True)

In [None]:
# Display the per-category aggregation.
cat_perc

## Interval variables

In [None]:
# Names of the interval variables that are still flagged keep=True.
is_kept_interval = (meta['level'] == 'interval') & meta['keep']
v = meta.loc[is_kept_interval].index

In [None]:
# Display the selected variable names.
v

In [None]:
# Pairwise correlation matrix of the columns in `v` (DataFrame.corr default method).
correlations = train[v].corr()

In [None]:
# Diverging colour map between hues 220 and 10, returned as a matplotlib colormap.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

In [None]:
# Shrink seaborn's font scale so the heatmap annotations fit in the cells.
sns.set(font_scale=0.7)

In [None]:
# Annotated heatmap of the correlation matrix on a 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
heatmap_options = dict(
    cmap=cmap,
    vmax=1.0,
    center=0,
    fmt='.2f',
    square=True,
    linewidths=.5,
    annot=True,
    cbar_kws={'shrink': .75},
)
sns.heatmap(correlations, **heatmap_options)
plt.show()

There are strong correlations between the following variables:

- ps_reg_02 and ps_reg_03 (0.7)
- ps_car_12 and ps_car_13 (0.67)
- ps_car_12 and ps_car_14 (0.58)
- ps_car_13 and ps_car_15 (0.67)

In [None]:
# 10% random sample of the training set for the scatter plots below; the seed
# makes the sample (and therefore the plots) reproducible across runs.
s = train.sample(frac=0.1, random_state=37)

In [None]:
# ps_reg_02 against ps_reg_03 on the sample, coloured by target.
sns.lmplot(
    x='ps_reg_02',
    y='ps_reg_03',
    data=s,
    hue='target',
    palette='Set1',
    scatter_kws=dict(alpha=0.2),
)
plt.show()

In [None]:
# ps_car_12 against ps_car_13 on the sample, coloured by target.
sns.lmplot(
    x='ps_car_12',
    y='ps_car_13',
    data=s,
    hue='target',
    palette='Set1',
    scatter_kws=dict(alpha=0.2),
)
plt.show()

In [None]:
# ps_car_12 against ps_car_14 on the sample, coloured by target.
sns.lmplot(
    x='ps_car_12',
    y='ps_car_14',
    data=s,
    hue='target',
    palette='Set1',
    scatter_kws=dict(alpha=0.2),
)
plt.show()

In [None]:
# ps_car_15 against ps_car_13 on the sample, coloured by target.
sns.lmplot(
    x='ps_car_15',
    y='ps_car_13',
    data=s,
    hue='target',
    palette='Set1',
    scatter_kws=dict(alpha=0.3),
)
plt.show()

## Feature engineering

- Creating dummy variables

In [None]:
# Names of the nominal variables that are still flagged keep=True.
is_kept_nominal = (meta['level'] == 'nominal') & meta['keep']
v = meta.loc[is_kept_nominal].index

In [None]:
# Number of columns before the dummy variables are created.
train.shape[1]

In [None]:
# One-hot encode the columns in `v`; drop_first=True omits the first level of
# each variable. Re-running this cell fails because the columns in `v` have
# been replaced.
train = pd.get_dummies(train, columns=v, drop_first=True)

In [None]:
# (rows, columns) after dummy encoding.
train.shape

- Creating interaction variables

In [None]:
# Names of the interval variables that are still flagged keep=True.
is_kept_interval = (meta['level'] == 'interval') & meta['keep']
v = meta.loc[is_kept_interval].index

In [None]:
# Degree-2 polynomial expansion: squares and pairwise products, no bias column.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

In [None]:
# Expand the columns in `v` and label the result with the generated feature names.
interaction_values = poly.fit_transform(train[v])
interaction_names = poly.get_feature_names_out(v)
interactions = pd.DataFrame(data=interaction_values, columns=interaction_names)

In [None]:
# Remove the original columns from the expansion; they already exist in `train`.
interactions.drop(columns=v, inplace=True)

In [None]:
# (rows, columns) before the interaction columns are appended.
train.shape

In [None]:
# Append the interaction columns. Rows are matched on the index, so both
# frames must share the same index; running this cell twice appends the
# columns twice.
train = pd.concat([train, interactions], axis=1)

## Feature selection

In [None]:
# Feature selector that removes features whose variance is at most 0.01.
selector = VarianceThreshold(threshold=0.01)

In [None]:
# Fit on every column except the identifier and the target.
selector.fit(train.drop(['id', 'target'], axis=1))

In [None]:
# Names of the features the fitted selector would remove. get_support() marks
# the features that are kept, so the mask is inverted. (The original line had
# `.` instead of `,` before `axis=1`, which is a syntax error.)
v = train.drop(['id', 'target'], axis=1).columns[~selector.get_support()]

In [None]:
def _negate(flag):
    """Logical NOT of a single value."""
    return not flag


# Element-wise logical NOT for arrays; for boolean arrays `~arr` is equivalent.
f = np.vectorize(_negate)

In [None]:
# Report how many entries `v` holds and list them.
# NOTE(review): the messages are only accurate if `v` contains the names of
# the low-variance variables — confirm how `v` was assigned.
print('{} variables have too low variance.'.format(len(v)))
print('These variables are {}'.format(list(v)))

In [None]:
# Feature matrix (everything except identifier and target) and label vector.
y_train = train['target']
X_train = train.drop(columns=['id', 'target'])

In [None]:
# Feature names, in column order, for labelling the importances.
feat_labels = X_train.columns

In [None]:
# Random forest whose feature importances are read in the cells below;
# n_jobs=-1 uses all available cores.
# NOTE(review): n_estimators=200000 is very large — confirm it is intended.
rf = RandomForestClassifier(n_estimators=200000, random_state=0, n_jobs=-1)

In [None]:
# The undersampling step stores all target == 0 rows before the target == 1
# rows, so the first 1000 rows hold a single class (assuming at least 1000
# target == 0 rows were kept). Fit on a seeded random sample of the same size
# instead, so that both classes are represented.
sample_idx = X_train.sample(n=min(1000, len(X_train)), random_state=0).index
rf.fit(X_train.loc[sample_idx], y_train.loc[sample_idx])

In [None]:
# Importance score per feature, in the column order of the fitted data.
importances = rf.feature_importances_

In [None]:
# Show the hyper-parameters the forest was configured with.
rf.get_params()

In [None]:
# Feature positions ordered from most to least important.
ascending_order = np.argsort(rf.feature_importances_)
indices = np.flip(ascending_order)

In [None]:
# Display the importance-ordered feature positions.
indices

In [None]:
# Print rank, feature name (left-aligned, width 30) and importance, from the
# most to the least important feature.
# Note: the loop variable reuses the name `f`.
n_features = X_train.shape[1]
for f in range(n_features):
    feature_pos = indices[f]
    print('%2d %-*s %f' % (f + 1, 30, feat_labels[feature_pos], importances[feature_pos]))