In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
import seaborn as sns

In [38]:
# Reproducibility seed and notebook-wide matplotlib defaults.
np.random.seed(0)
plt.rcParams.update({'font.size': 8, 'legend.fontsize': 'small'})

# Merge the base and CSS4 named colours into one lookup; a CSS4 entry wins
# when both tables define the same name.
colors = {**mcolors.BASE_COLORS, **mcolors.CSS4_COLORS}

# Order the colour names by (hue, saturation, value), ties broken by name.
by_hsv = [
    (tuple(mcolors.rgb_to_hsv(mcolors.to_rgba(rgb)[:3])), label)
    for label, rgb in colors.items()
]
by_hsv.sort()
colors_names = [label for _, label in by_hsv]

In [21]:
def reduce_mem_usage(props, fillna=False):
    """Downcast the numeric columns of ``props`` to smaller dtypes, in place.

    Every numeric (or boolean) column is inspected. Columns whose values are
    all whole numbers are stored in the narrowest integer dtype the ladder
    below selects; every other numeric column is stored as ``float32``.
    Non-numeric columns (strings, datetimes, categoricals, ...) are left
    untouched. Progress and before/after memory usage are printed.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    fillna : bool, default False
        If False, columns containing NaN/inf are skipped entirely.
        If True, NaNs are replaced by ``column_min - 1`` before downcasting
        and the column name is recorded in the returned list.

    Returns
    -------
    (pandas.DataFrame, list)
        The same ``props`` object and the list of columns whose missing
        values were filled.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Columns whose missing values were filled in.
    for col in props.columns:
        # Only numeric data can be downcast; skipping every non-numeric dtype
        # (not just `object`) avoids crashes on datetime/category/string columns.
        if not pd.api.types.is_numeric_dtype(props[col].dtype):
            continue

        print("[*]Col: ",col)
        dtype_before = props[col].dtype

        mx = props[col].max()
        mn = props[col].min()

        # Integer dtypes cannot hold NA, so missing values must be filled first.
        if not np.isfinite(props[col]).all():
            if not fillna:
                print('>>Skip(NA exist)')
                continue
            NAlist.append(col)
            sentinel = mn - 1
            # Assign back rather than calling fillna(inplace=True) on the
            # column selection, which does not update the frame under
            # pandas copy-on-write.
            props[col] = props[col].fillna(sentinel)
            # The sentinel is now the smallest value; refresh the cached
            # minimum so a column with min 0 is not cast to an unsigned
            # dtype that cannot represent the -1 sentinel.
            mn = sentinel

        # Decide whether the column holds only whole numbers. Each value is
        # compared with its own truncation; summing signed differences would
        # let +0.5 and -0.5 cancel out and silently truncate real fractions.
        values = props[col]
        if pd.api.types.is_integer_dtype(values.dtype):
            is_int = True
        elif not np.isfinite(values).all():
            # Still contains NaN/inf (e.g. an all-NaN column): keep as float.
            is_int = False
        else:
            is_int = bool((values == values.astype(np.int64)).all())

        if is_int:
            if mn >= 0:
                # Non-negative: use the narrowest unsigned type.
                if mx < 255:
                    props[col] = props[col].astype(np.uint8)
                elif mx < 65535:
                    props[col] = props[col].astype(np.uint16)
                elif mx < 4294967295:
                    props[col] = props[col].astype(np.uint32)
                else:
                    props[col] = props[col].astype(np.uint64)
            else:
                # Has negatives: use the narrowest signed type; if none of
                # the ranges match the column is left as it is.
                for candidate in (np.int8, np.int16, np.int32, np.int64):
                    limits = np.iinfo(candidate)
                    if mn > limits.min and mx < limits.max:
                        props[col] = props[col].astype(candidate)
                        break
        else:
            # Non-integral data is stored as 32-bit floats.
            props[col] = props[col].astype(np.float32)

        dtype_after = props[col].dtype
        print(">>{} -> {}".format(dtype_before,
                                        dtype_after))

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [5]:
# Load the raw training and test CSV files.
trn = pd.read_csv('data/train_ver2.csv', low_memory=False)
tst = pd.read_csv('data/test_ver2.csv', low_memory=False)
# Keep the names of every training column from position 24 onward; in the
# info() listing further down these are the `ind_*_ult1` columns.
prods = trn.columns[24:].tolist()

In [22]:
# Downcast the numeric columns of both frames. With the default fillna=False,
# columns that contain missing values are skipped; the returned list of
# filled columns is discarded.
trn, _ = reduce_mem_usage(trn)
tst, _ = reduce_mem_usage(tst)

Memory usage of properties dataframe is : 2941.4099826812744  MB
[*]Col:  ncodpers
>>uint32 -> uint32
[*]Col:  age
>>Skip(NA exist)
[*]Col:  ind_nuevo
>>Skip(NA exist)
[*]Col:  indrel
>>Skip(NA exist)
[*]Col:  tipodom
>>Skip(NA exist)
[*]Col:  cod_prov
>>Skip(NA exist)
[*]Col:  ind_actividad_cliente
>>Skip(NA exist)
[*]Col:  renta
>>Skip(NA exist)
[*]Col:  ind_ahor_fin_ult1
>>uint8 -> uint8
[*]Col:  ind_aval_fin_ult1
>>uint8 -> uint8
[*]Col:  ind_cco_fin_ult1
>>uint8 -> uint8
[*]Col:  ind_cder_fin_ult1
>>uint8 -> uint8
[*]Col:  ind_cno_fin_ult1
>>uint8 -> uint8
[*]Col:  ind_ctju_fin_ult1
>>uint8 -> uint8
[*]Col:  ind_ctma_fin_ult1
>>uint8 -> uint8
[*]Col:  ind_ctop_fin_ult1
>>uint8 -> uint8
[*]Col:  ind_ctpp_fin_ult1
>>uint8 -> uint8
[*]Col:  ind_deco_fin_ult1
>>uint8 -> uint8
[*]Col:  ind_deme_fin_ult1
>>uint8 -> uint8
[*]Col:  ind_dela_fin_ult1
>>uint8 -> uint8
[*]Col:  ind_ecue_fin_ult1
>>uint8 -> uint8
[*]Col:  ind_fond_fin_ult1
>>uint8 -> uint8
[*]Col:  ind_hip_fin_ult1
>>uint8 ->

In [16]:
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
trn.info()
# Number of missing values per column.
print(trn.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13647309 entries, 0 to 13647308
Data columns (total 48 columns):
fecha_dato               object
ncodpers                 uint32
ind_empleado             object
pais_residencia          object
sexo                     object
age                      float64
fecha_alta               object
ind_nuevo                float64
antiguedad               object
indrel                   float64
ult_fec_cli_1t           object
indrel_1mes              object
tiprel_1mes              object
indresi                  object
indext                   object
conyuemp                 object
canal_entrada            object
indfall                  object
tipodom                  float64
cod_prov                 float64
nomprov                  object
ind_actividad_cliente    float64
renta                    float64
segmento                 object
ind_ahor_fin_ult1        uint8
ind_aval_fin_ult1        uint8
ind_cco_fin_ult1         uint8
ind_cder_fin_ult1

In [23]:
# Columns flagged as not to be used.
# NOTE(review): no cell visible in this notebook reads this list yet.
nouse_cols = ['ult_fec_cli_1t', 'conyuemp']

In [24]:
# Convert `age` to a numeric column in both frames; entries that cannot be
# parsed become NaN.
for frame in (trn, tst):
    frame['age'] = pd.to_numeric(frame['age'], errors="coerce")

In [53]:
# Boolean mask of training rows with a missing age, then the first ten of them.
age_na = trn['age'].isna()
trn.loc[age_na].head(10)

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,ult_fec_cli_1t,indrel_1mes,tiprel_1mes,indresi,indext,conyuemp,canal_entrada,indfall,tipodom,cod_prov,nomprov,ind_actividad_cliente,renta,segmento,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
261,2015-01-28,1050741,,,,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0
1029,2015-01-28,1051017,,,,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0
1063,2015-01-28,1051064,,,,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0
1154,2015-01-28,1051387,,,,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0
1779,2015-01-28,1048660,,,,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0
1850,2015-01-28,1049775,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,1
1867,2015-01-28,1049774,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0
1886,2015-01-28,1049838,,,,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0
1922,2015-01-28,1049700,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,0
2142,2015-01-28,1049406,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0


In [None]:
# Turn `antiguedad` into an integer column, using -99 for missing values.
# * The result is assigned back instead of using inplace=True on the column
#   selection, which does not update `df` under pandas copy-on-write.
# * Genuine NaNs are filled as well as the padded 'NA' marker, because an
#   integer cast fails on NaN (the rows displayed above show empty
#   `antiguedad` cells in the training data).
# * int32 is used instead of int8, which only holds -128..127.
#   NOTE(review): presumably seniority values exceed that range — confirm.
df['antiguedad'] = (
    df['antiguedad']
    .replace('     NA', -99)
    .fillna(-99)
    .astype(np.int32)
)

In [None]:
# Turn `renta` into an integer column, using -99 for missing values.
# * Assigned back as one chain: inplace=True on a column selection does not
#   update `df` under pandas copy-on-write.
# * Stored as int64 rather than int8: int8 only holds -128..127, so any
#   larger value would wrap around to a meaningless number.
#   NOTE(review): `renta` looks like an income amount — confirm the range.
df['renta'] = (
    df['renta']
    .replace('         NA', -99)
    .fillna(-99)
    .astype(float)
    .astype(np.int64)
)

In [None]:
# Encode `indrel_1mes` as small integers: the letter code 'P' becomes 5 and
# missing values become -99. The chain is assigned back because inplace=True
# on a column selection does not update `df` under pandas copy-on-write.
df['indrel_1mes'] = (
    df['indrel_1mes']
    .replace('P', 5)
    .fillna(-99)
    .astype(float)   # values may be numeric strings such as '1.0'
    .astype(np.int8)
)

In [None]:
# Register the numeric columns as model features.
# The last entry was misspelled 'int_actividad_cliente'; the actual column is
# 'ind_actividad_cliente', and the wrong name would fail when the feature
# columns are selected for training.
# NOTE(review): `features` must already be defined when this cell runs.
features += ['age', 'antiguedad', 'renta', 'ind_nuevo', 'indrel', 'indrel_1mes',
            'ind_actividad_cliente']

In [None]:
# Split `fecha_alta` ('YYYY-MM-DD' strings) into numeric month and year
# features; missing dates (float NaN) become 0.
# isinstance() is used instead of `x.__class__ is float` so that float
# subclasses such as numpy.float64 NaN are also treated as missing instead of
# crashing on `.split`.
df['fecha_alta_month'] = df['fecha_alta'].map(
    lambda x: 0.0 if isinstance(x, float) else float(x.split('-')[1])
).astype(np.int8)
df['fecha_alta_year'] = df['fecha_alta'].map(
    lambda x: 0.0 if isinstance(x, float) else float(x.split('-')[0])
).astype(np.int16)
features += ['fecha_alta_month', 'fecha_alta_year']


In [None]:
# Split `ult_fec_cli_1t` ('YYYY-MM-DD' strings) into numeric month and year
# features; missing dates (float NaN) become 0.
# isinstance() is used instead of `x.__class__ is float` so that float
# subclasses such as numpy.float64 NaN are also treated as missing instead of
# crashing on `.split`.
df['ult_fec_cli_1t_month'] = df['ult_fec_cli_1t'].map(
    lambda x: 0.0 if isinstance(x, float) else float(x.split('-')[1])
).astype(np.int8)
df['ult_fec_cli_1t_year'] = df['ult_fec_cli_1t'].map(
    lambda x: 0.0 if isinstance(x, float) else float(x.split('-')[0])
).astype(np.int16)

features += ['ult_fec_cli_1t_month', 'ult_fec_cli_1t_year']

# Any value still missing anywhere in the frame gets the -99 sentinel.
df.fillna(-99, inplace=True)

In [None]:
# NOTE(review): this resets `features`; any cell that appended to it earlier
# has to be re-run after this one.
features = []
categorical_cols = ['ind_empleado', 'pais_residencia',
                    'sexo', 'tiprel_1mes', 'indresi',
                   'indext', 'conyuemp', 'canal_entrada',
                    'indfall', 'tipodom', 'nomprov', 'segmento']

# Integer-encode each categorical column.
# * Train and test values are factorized together so that a given category
#   gets the same code in both frames; factorizing them separately numbers
#   categories by order of first appearance in each frame, so the codes
#   would not line up between train and test.
# * `factorize` marks missing values with -1; that is remapped to -99 here
#   instead of relying on the `na_sentinel` argument, which was removed in
#   pandas 2.0.
n_trn = len(trn)
for col in categorical_cols:
    combined = pd.concat([trn[col], tst[col]], ignore_index=True)
    codes, _ = pd.factorize(combined)
    codes = np.where(codes == -1, -99, codes)
    trn[col] = codes[:n_trn]
    tst[col] = codes[n_trn:]
features += categorical_cols
print('[*]done factorize')

# Report which training columns still contain missing values.
cols_nan = trn.columns[trn.isna().any()].tolist()
trn[cols_nan].info()

In [None]:
# Summarise the columns of `df` that still contain at least one missing value.
nan_mask = df.isna().any()
cols_nan = list(df.columns[nan_mask])
df[cols_nan].info()

In [None]:
def date_to_int(str_date):
    """Convert a 'YYYY-MM-DD' string into a running month index.

    January 2015 maps to 1, February 2015 to 2, ..., January 2016 to 13.
    Surrounding whitespace is ignored and the day component is parsed but
    not used. Raises ValueError if the string does not consist of exactly
    three '-'-separated integers.
    """
    year, month, _day = map(int, str_date.strip().split("-"))
    return (year - 2015) * 12 + month

In [None]:
# Encode each row's `fecha_dato` as a month index with date_to_int and store
# the result as int8.
df['int_date'] = df['fecha_dato'].map(date_to_int).astype(np.int8)

In [None]:
# Key columns used to join frames: month index and customer code.
standard_cols = ['int_date', 'ncodpers']
# Start the training frame from the key columns plus the `prods` columns
# (copied so later changes do not touch `df`).
df_trn = df[standard_cols+prods].copy()

In [None]:
# Sanity check: compare the column sets of the full frame and the training frame.
print(df.columns)
print(df_trn.columns)

In [None]:
# For every product, attach its value from the previous month as
# `<prod>_prev`: the product column is copied, its month index is shifted
# forward by one, and the result is left-joined back on (int_date, ncodpers).
for prod in prods:
    print('[*]process {}...'.format(prod))
    trn_leg = standard_cols + [prod]
    trn_bk = df[trn_leg].copy()
    trn_bk['int_date'] = trn_bk['int_date'] + 1
    trn_bk = trn_bk.rename(columns={prod: prod + '_prev'})
    df_trn = df_trn.merge(trn_bk, on=standard_cols, how='left')
    print(df_trn.columns)
    # The product column is no longer needed in `df` once its lag is built.
    df = df.drop(columns=[prod])

In [None]:
# Make sure no product column is left in `df`. The lag-feature loop already
# drops each product column as it goes, so a plain drop here would raise
# KeyError; errors='ignore' makes the cell safe to run in either state.
df = df.drop(prods, axis=1, errors='ignore')

In [None]:
# Display the training frame's columns after the lag features were merged in.
df_trn.columns

In [None]:
# A missing `<prod>_prev` means there was no row for the previous month;
# treat it as 0. The filled column is assigned back because
# fillna(inplace=True) on a column selection does not update `df_trn` under
# pandas copy-on-write.
for prod in prods:
    prev = prod + '_prev'
    df_trn[prev] = df_trn[prev].fillna(0)
# Every other remaining missing value gets the -99 sentinel.
df_trn.fillna(-99, inplace=True)

In [None]:
# Left-join the remaining columns of `df` onto the training frame by
# (int_date, ncodpers).
df_trn = df_trn.merge(df, on=standard_cols, how='left')

In [None]:
# Display the training frame's columns after merging `df` back in.
df_trn.columns

In [None]:
# Per-column memory footprint of the training frame, in bytes.
df_trn.memory_usage()

In [None]:
# Drop the reference to `df`; its columns were merged into `df_trn` above.
del df

In [None]:
# Flag newly added products: `<prod>_add` is 1 when the product is held this
# month (== 1) but was not held the month before (`<prod>_prev` == 0).
for prod in prods:
    prev = prod + '_prev'
    padd = prod + '_add'
    newly_added = df_trn[prod].eq(1) & df_trn[prev].eq(0)
    df_trn[padd] = newly_added.astype(np.int8)

In [None]:
# Persist the engineered training frame to CSV, without the index column.
df_trn.to_csv('data/df_trn.csv', index=False)

In [None]:
# Progress marker so the end of the preceding save step is visible in the output.
print('saved')

In [None]:
# Column dtypes and overall memory usage of the training frame.
df_trn.info()

In [None]:
# Per-column flag: does the column still contain any missing value?
df_trn.isnull().any()

In [None]:
# Months used for training; the single month '2016-06-28' forms the test set.
use_dates = ['2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28', '2016-05-28']
in_train_months = df_trn['fecha_dato'].isin(use_dates)
in_test_month = df_trn['fecha_dato'].eq('2016-06-28')
trn = df_trn[in_train_months]
tst = df_trn[in_test_month]
# The combined frame is no longer needed once both subsets exist.
del df_trn

In [None]:
# Build one training subset per product: the rows where that product was
# newly added (held now, not held the previous month), labelled with the
# product's position in `prods`.
X, Y = [], []
for i, prod in enumerate(prods):
    prev = prod + '_prev'
    newly_added = trn[prod].eq(1) & trn[prev].eq(0)
    prX = trn[newly_added]
    prY = np.zeros(len(prX), dtype=np.int8) + i
    X.append(prX)
    Y.append(prY)

In [None]:
# Stack the per-product subsets into one frame and one flat label vector,
# then store the labels alongside the features as column `y`.
XY = pd.concat(X)
Y = np.concatenate(Y)
XY['y'] = Y

In [None]:
# Hold out one month for validation. The previous value '2016-02-58' is not
# a valid date and matches no row, which left the validation set empty and
# put every sample in the training set.
# NOTE(review): the most recent training month is used here; if another
# month (e.g. '2016-02-28') was intended, change it accordingly.
vld_date = '2016-05-28'
XY_trn = XY[XY['fecha_dato'] != vld_date]
XY_vld = XY[XY['fecha_dato'] == vld_date]

In [None]:
# Parameter dictionary passed to xgb.train below: a tree booster trained as a
# multi-class model with one class per entry of `prods`, evaluated with
# multi-class log loss.
# NOTE(review): recent XGBoost releases replaced `silent` with `verbosity` —
# confirm against the installed version.
params = {
    'booster' : 'gbtree',
    'max_depth' : 8,
    'nthread': 4,
    'num_class' : len(prods),
    'objective' : 'multi:softprob',
    'silent': 1,
    'eval_metric' : 'mlogloss',
    'eta' : 0.1,
    'min_child_weight' : 10,
    'colsample_bytree' : 0.8,
    'colsample_bylevel' : 0.9,
    'seed' : 0
}

In [None]:
# Extract the training features and labels as NumPy arrays and wrap them in
# an XGBoost DMatrix. `DataFrame.as_matrix` was removed in pandas 1.0;
# selecting the columns and taking `.values` yields the same arrays
# (the label keeps its (n, 1) shape).
X_trn = XY_trn[features].values
Y_trn = XY_trn[['y']].values
dtrn = xgb.DMatrix(X_trn, label=Y_trn, feature_names = features)

In [None]:
# Extract the validation features and labels as NumPy arrays.
# `DataFrame.as_matrix` was removed in pandas 1.0; selecting the columns and
# taking `.values` yields the same arrays (the label keeps its (n, 1) shape).
X_vld = XY_vld[features].values
Y_vld = XY_vld[['y']].values

In [None]:
# Wrap the validation arrays in a DMatrix, labelling columns with the feature names.
dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)

In [None]:
# Evaluate on both the training and the validation matrix during training.
watch_list = [(dtrn, 'train'), (dvld, 'eval')]
# Train for at most 1000 boosting rounds with early stopping after 20 rounds.
# NOTE(review): XGBoost's early stopping presumably monitors the last entry
# of `evals` (here 'eval') — confirm for the installed version.
model = xgb.train(params, dtrn, num_boost_round=1000, evals=watch_list,
                 early_stopping_rounds=20)