In [None]:
%pylab inline --no-import-all

from __future__ import division, print_function
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Let pandas display up to 50 columns before truncating wide DataFrames.
pd.options.display.max_columns=50

## Load data

In [None]:
# Data is assumed to be in directory data/
# The column-name mapping is stored as a Python expression in colnames.py.
# NOTE(review): eval() executes whatever that file contains. If it only ever
# holds a plain dict literal, ast.literal_eval would be a safer way to read
# it -- confirm the file format before switching.
with open('data/colnames.py', 'r') as colnames_file:
    columns_names_dict = eval(colnames_file.read())

# Only the first 1,000,000 rows are read; columns 0, 6 and 10 are parsed as
# dates and the two listed columns are forced to strings. The raw column
# names are then mapped through columns_names_dict.
data_train = pd.read_csv(
    "data/train_ver2.csv",
    parse_dates=[0, 6, 10],
    dtype={"cod_prov": str, "conyuemp": str},
    skipinitialspace=True,
    nrows=1000000,
).rename(columns=columns_names_dict)

data_train.head()

## Cleaning

In [None]:
# Looks like we've got outliers in SENIORITY_MNTH: the value -999999 is
# treated as missing and replaced by NaN.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the attribute-accessed Series, which is a
# chained in-place update that is not guaranteed to reach data_train (it
# does not under pandas Copy-on-Write). np.nan is used because the np.NaN
# alias was removed in NumPy 2.0.
data_train["SENIORITY_MNTH"] = data_train["SENIORITY_MNTH"].replace(-999999, np.nan)

In [None]:
# Transform datetime into integer
# Columns are picked by kind (datetime) rather than by comparing against
# FETCH_DATE's current dtype, so re-running this cell after the conversion is
# a no-op, and a failed date parse cannot cause every object column to be cast.
date_cols = data_train.select_dtypes(include="datetime").columns

# "int64" is spelled out because plain `int` can map to a 32-bit integer on
# some platforms, which pandas rejects for datetime columns.
# NOTE(review): missing dates (NaT) presumably end up as a large negative
# integer rather than NaN after this cast, so the NaN-filling step below
# would not touch them -- confirm this is intended.
data_train = data_train.astype({col: "int64" for col in date_cols})

In [None]:
# Replace float NaNs by cust mean or global mean if not possible
# Columns to fill: every non-object column that contains at least one NaN.
needs_fill = (data_train.dtypes != object) & data_train.isnull().any()
col_to_fill = data_train.columns[needs_fill]

# Per-customer mean of each column. Customers whose values are all missing
# get the mean of the per-customer means instead.
fill_values = data_train.groupby("CUST_ID")[col_to_fill].mean()
fill_values_mean = fill_values.mean()
fill_values = fill_values.fillna(fill_values_mean)

# Replace in data
# Index by CUST_ID so that fillna() aligns each row with its customer's fill
# value. Results are assigned back to the frame explicitly: the previous
# `data_train[col].loc[mask] = ...` and `fill_values[col].fillna(...,
# inplace=True)` were chained assignments, which may act on a temporary copy
# and leave the frame unchanged (always the case under Copy-on-Write).
data_train = data_train.set_index("CUST_ID")

for col in col_to_fill:
    data_train[col] = data_train[col].fillna(fill_values[col])

# Back to a default integer index; CUST_ID becomes the first column again.
data_train = data_train.reset_index()

# Data Visualization

In [None]:
# Summary statistics of data_train after the cleaning cells above.
data_train.describe()

## Age histogram

In [None]:
# Histogram of CUST_AGE using as many bins as there are distinct values.
age_values = data_train["CUST_AGE"]
age_values.plot.hist(bins=len(age_values.unique()))

## Income boxplots

In [None]:
# CUST_HOUSINCOM distribution for each CUSTIND_SEGMNT value, drawn on a fresh
# figure with a logarithmic y-axis.
plt.figure()
# sns.boxplot returns the matplotlib Axes it drew on; the name `fig` is kept
# as-is even though the object is an Axes.
fig = sns.boxplot(
    data=data_train,
    x="CUSTIND_SEGMNT",
    y="CUST_HOUSINCOM",
)
fig.set_yscale('log')

In [None]:
# CUST_HOUSINCOM distribution for each EMPLYMT_STATUS value, drawn on a fresh
# figure with a logarithmic y-axis.
plt.figure()
# sns.boxplot returns the matplotlib Axes it drew on; the name `fig` is kept
# as-is even though the object is an Axes.
fig = sns.boxplot(
    data=data_train,
    x="EMPLYMT_STATUS",
    y="CUST_HOUSINCOM",
)
fig.set_yscale('log')

## Seniority

In [None]:
# Single horizontal box plot of the SENIORITY_MNTH column.
sns.boxplot(x=data_train["SENIORITY_MNTH"])

## Profiles PCA

In [None]:
# Columns describing the customer profile: everything except the columns
# whose name starts with "IND", the customer id and the fetch date.
# The exclusion uses a membership (equality) test. The previous
# `col is "CUST_ID"` compared object identity, which is not guaranteed to be
# true for equal strings, so CUST_ID / FETCH_DATE could slip through.
excluded_cols = {"CUST_ID", "FETCH_DATE"}
profile_cols = [col for col in data_train.columns
                if not col.startswith("IND") and col not in excluded_cols]

In [None]:
# One-hot encode the object-typed profile columns, dropping the first level
# of each so the dummy columns are not redundant.
# The dtypes are read from data_train: data_profiles does not exist yet when
# this expression is evaluated, so referring to it here raised a NameError on
# a fresh kernel.
categorical_cols = [col for col in profile_cols if data_train[col].dtype == object]
data_profiles = pd.get_dummies(data_train[profile_cols],
                               columns=categorical_cols,
                               drop_first=True)
data_profiles.describe()

In [None]:
# Project the encoded profiles onto their principal components.
# PCA cannot extract more components than min(n_samples, n_features), so the
# requested 100 is capped to what the data can support instead of raising.
n_components = min(100, *data_profiles.shape)

# random_state is fixed so the projection is reproducible when scikit-learn
# picks its randomized SVD solver.
# NOTE(review): Normalizer rescales each *row* to unit norm; it does not
# standardise columns. If per-feature scaling was intended before PCA,
# a column-wise scaler is presumably what is wanted -- confirm.
pipeline = Pipeline([
    ('scaling', Normalizer()),
    ('pca', PCA(n_components=n_components, random_state=0)),
])
profiles_pca = pipeline.fit_transform(data_profiles)