In [None]:
%pylab inline --no-import-all

from __future__ import division, print_function
from datetime import datetime
import pandas as pd
import seaborn as sns
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from map import mapk  ## to compute our loss function. Usage : mapk(actual:[[],[]],predicted:[[],[]],k:7)

# Show up to 50 columns when a DataFrame is displayed
pd.set_option("display.max_columns", 50)

## Load data

In [None]:
# Data is assumed to be in directory data/
# The column-name mapping is stored as a Python expression in colnames.py.
# NOTE(review): eval() executes whatever that file contains -- only run this
# against a trusted copy (ast.literal_eval would do if the file is a plain dict literal).
with open('data/colnames.py', 'r') as myfile:
    columns_names_dict = eval(myfile.read())

# Columns 0, 6 and 10 are parsed as dates; the two listed columns are forced to str.
# (Pass nrows=... to read_csv for a quicker partial load.)
data_train = pd.read_csv(
    "data/sample_train.csv",
    parse_dates=[0, 6, 10],
    dtype={"cod_prov": str, "conyuemp": str},
    skipinitialspace=True,
)
# Switch from the raw CSV headers to the names defined in the mapping
data_train = data_train.rename(columns=columns_names_dict)

data_train.head()

## Cleaning

In [None]:
# Looks like we've got outliers in SENIORITY_MNTH: treat the -999999 placeholder as missing.
# Assign the result back instead of calling replace(..., inplace=True) on the
# attribute-accessed column: that is chained assignment and is not guaranteed
# to write through to data_train.
data_train["SENIORITY_MNTH"] = data_train["SENIORITY_MNTH"].replace(-999999, np.nan)

In [None]:
# Transform datetime into integer.
# Every column sharing FETCH_DATE's dtype is converted.
date_cols = data_train.columns[data_train.dtypes == data_train["FETCH_DATE"].dtype]
# Ask for 64-bit integers explicitly: plain `int` can resolve to a 32-bit type
# on some platforms, which cannot hold datetime64 values.
# NOTE(review): missing dates (NaT) have no integer representation; depending on
# the pandas version this either raises or yields a sentinel integer that the
# NaN imputation in the next cell will not see -- confirm the date columns are complete.
data_train[date_cols] = data_train[date_cols].astype("int64")

In [None]:
# Replace float NaNs by cust mean or global mean if not possible.
# Candidates: non-object columns that contain at least one missing value.
has_missing = data_train.isnull().any()
is_not_object = data_train.dtypes != object
col_to_fill = data_train.columns[is_not_object & has_missing]

# Per-customer mean of each candidate column (NaN when the customer has no
# observed value at all for that column).
fill_values = data_train.groupby("CUST_ID")[list(col_to_fill)].mean()
# Global fallback: the mean of the per-customer means (every customer weighs the same).
fill_values_mean = fill_values.mean()
# Assign instead of fillna(..., inplace=True) on a column selection, which is
# chained assignment and may leave fill_values untouched.
fill_values = fill_values.fillna(fill_values_mean)

# Replace in data: look up each row's customer-level value through CUST_ID and
# use it only where the original value is missing. Row order, index and column
# order of data_train are left as they are.
for col in col_to_fill:
    customer_value = data_train["CUST_ID"].map(fill_values[col])
    data_train[col] = data_train[col].fillna(customer_value)

# Data Visualization

In [None]:
# Summary statistics of data_train at this point of the notebook (after the cleaning cells)
data_train.describe()

## Age histogram

In [None]:
# Histogram of CUST_AGE with (max - min + 1) bins, i.e. roughly one bin per unit of age
ages = data_train["CUST_AGE"]
n_bins = int(ages.max() - ages.min() + 1)
ages.plot.hist(bins=n_bins)

## Income boxplots

In [None]:
# Distribution of CUST_HOUSINCOM for each CUSTIND_SEGMNT value, income axis on a log scale.
# seaborn.boxplot returns an Axes, not a Figure, so create both explicitly and
# draw on the Axes we own instead of relying on pyplot's "current figure".
fig, ax = plt.subplots()
sns.boxplot(x="CUSTIND_SEGMNT", y="CUST_HOUSINCOM", data=data_train, ax=ax)
ax.set_yscale("log")

In [None]:
# Distribution of CUST_HOUSINCOM for each EMPLYMT_STATUS value, income axis on a log scale.
# seaborn.boxplot returns an Axes, not a Figure, so create both explicitly and
# draw on the Axes we own instead of relying on pyplot's "current figure".
fig, ax = plt.subplots()
sns.boxplot(x="EMPLYMT_STATUS", y="CUST_HOUSINCOM", data=data_train, ax=ax)
ax.set_yscale("log")

## Seniority

In [None]:
# Horizontal boxplot of SENIORITY_MNTH (the -999999 placeholder was turned into NaN during cleaning)
sns.boxplot(x=data_train["SENIORITY_MNTH"])

## Profiles

In [None]:
# Split the columns into two groups; the identifier columns belong to neither.
# Names are compared by value (`in` / `not in`): `is` tests object identity,
# which holds for equal strings only by accident of interning.
id_cols = ("CUST_ID", "FETCH_DATE")

# Profile columns: non-identifier columns whose name does not start with "IND"
profile_cols = [col for col in data_train.columns
                if col not in id_cols and not col.startswith("IND")]
print(profile_cols)

# Product columns: the remaining non-identifier columns (those starting with "IND")
product_cols = [col for col in data_train.columns
                if col not in id_cols and col not in profile_cols]
print(product_cols)

In [None]:
# Distinct snapshot dates in chronological order. unique() returns values in
# order of first appearance, so sort explicitly: the positional lookups on
# all_dates must not depend on the row order of the input file.
all_dates = np.sort(data_train["FETCH_DATE"].unique())
print(all_dates)
# Position 16 is the 17th snapshot date.
# NOTE(review): presumably the most recent date of this dataset -- this raises
# IndexError when fewer than 17 distinct dates are present; confirm against the data.
last_date = all_dates[16]

# Rows of the selected snapshot
last_month_data = data_train[data_train["FETCH_DATE"] == last_date]
last_month_data.shape

In [None]:
# Previous snapshot: the latest date strictly before `last_date`. Deriving it
# from the values (rather than a fixed position in all_dates) keeps it tied to
# last_date whatever the ordering of all_dates is.
# Raises ValueError if no earlier date exists.
date_month_before = all_dates[all_dates < last_date].max()
month_before_data = data_train[data_train["FETCH_DATE"] == date_month_before]
month_before_data.shape

In [None]:
# One row per customer present in both snapshots. Columns shared by the two
# frames get the suffix "_x" (month before) or "_y" (last month).
joined = month_before_data.merge(last_month_data, on="CUST_ID", how="inner")
joined.shape

In [None]:
# Features are the month-before columns ("_x" suffix from the merge). CUST_ID is
# the join key: it carries no suffix and is left out. The name is compared with
# != because `is not` tests object identity, which equal strings need not share.
data_cols = [col + "_x" for col in data_train.columns if col != "CUST_ID"]
label_cols = ["got_" + col for col in product_cols]

# For 0/1 indicators, got_<product> is 1 when the product was absent the month
# before (_x == 0) and present in the last month (_y == 1), and 0 otherwise.
for col in product_cols:
    newly_acquired = (1 - joined[col + "_x"]) * joined[col + "_y"]
    joined["got_" + col] = newly_acquired.astype(int)

print(data_cols)
print(label_cols)

# Deep copies, so the training data does not share memory with `joined`
full_training_set = joined[data_cols].copy(deep=True)
full_label_set = joined[label_cols].copy(deep=True)

In [None]:
# Random train / hold-out split of features and labels (default split proportions).
# A fixed random_state makes the split -- and everything fitted on it --
# reproducible across kernel restarts.
X_train, X_test, Y_train, y_split = train_test_split(
    full_training_set, full_label_set, random_state=0)
# Alias matching the X_train / X_test / Y_train naming; `y_split` is kept for
# any code that already refers to it.
Y_test = y_split


In [None]:
# One independent logistic regression per label column, keyed by label name
classifiers = {}
for label_col in label_cols:
    clf = LogisticRegression()
    # Register the estimator first, then fit it on this label's column
    classifiers[label_col] = clf
    clf.fit(X_train, Y_train[label_col])
    print("Fitted " + label_col)