In [1]:
from collections import defaultdict, Counter
import gc

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import mean_squared_error

from functools import partial
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

import eli5
from eli5.sklearn import PermutationImportance

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Read the train and test splits and stack them into one frame so that
# feature engineering is applied to both at once.
raw_frames = [
    pd.read_hdf(path)
    for path in (
        '../../module3/input/train_online_retail.h5',
        '../../module3/input/test_online_retail.h5',
    )
]
df_all = pd.concat(raw_frames, sort=False)

# Drop the per-split copies; only the combined frame is used from here on.
del raw_frames
gc.collect()

df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1067371 entries, 12 to 541893
Data columns (total 11 columns):
invoice         1067371 non-null int32
stock_code      1067371 non-null int16
description     1062989 non-null object
quantity        1067371 non-null int32
invoice_date    1067371 non-null datetime64[ns]
price_unit      1067371 non-null float16
price_total     1067371 non-null float32
customer_id     1067371 non-null int16
country         1067371 non-null object
is_canceled     520142 non-null object
is_test         1067371 non-null bool
dtypes: bool(1), datetime64[ns](1), float16(1), float32(1), int16(2), int32(2), object(3)
memory usage: 60.1+ MB


In [3]:
# Only rows with a known label can contribute cancellation counts.
train = df_all[df_all['is_canceled'].notnull()]

# Number of cancellations for every (stock_code, description) pair.
description_canc = (
    train.groupby(['stock_code', 'description'])['is_canceled']
    .agg(np.sum)
    .reset_index()
)
description_canc['is_canceled'] = description_canc['is_canceled'].astype(int)
description_canc['description'] = description_canc['description'].map(lambda text: str(text).strip())
description_canc['words'] = description_canc['description'].map(lambda text: text.split(' '))

# Every word is credited with the cancellations of each description it occurs in.
word_totals = Counter()
for words, n_canceled in zip(description_canc['words'], description_canc['is_canceled']):
    for word in words:
        word_totals[word] += n_canceled

# Words of one or two characters (including empty tokens) are dropped;
# unknown words score 0 through the defaultdict.
most_canceled_words = defaultdict(
    lambda: 0,
    {word: total for word, total in word_totals.items() if len(word) > 2},
)

# Score of a stock code = sum of the scores of the words in its description.
# When a stock code has several descriptions, the last one processed wins.
stock_code_word_score = defaultdict(lambda: 0)
for code, words in zip(description_canc['stock_code'], description_canc['words']):
    stock_code_word_score[code] = sum(most_canceled_words[word] for word in words)

In [4]:
def group_to_dict(group_key, agg_func):
    """Aggregate ``is_canceled`` per group on the labelled rows of ``df_all``.

    Parameters
    ----------
    group_key : str or list/tuple of str
        Column name(s) passed to ``groupby``.
    agg_func : callable or str
        Aggregation applied to the ``is_canceled`` values of every group.

    Returns
    -------
    collections.defaultdict
        Maps each group key to its aggregated value. Groups whose key is
        ``-1`` (or, for multi-column keys, contains ``-1``) are removed.
        Keys that are not present resolve to the mean of the remaining
        values (NaN when nothing remains).

    NOTE(review): the next cell defines ``group_to_dict`` again without the
    multi-column handling; run in order, that definition replaces this one.
    """
    labelled = df_all[df_all['is_canceled'].notnull()]
    aggregated = labelled.groupby(group_key)['is_canceled'].agg(agg_func).to_dict()

    def _is_unknown(key):
        # Multi-column groupings produce tuple keys, single columns produce scalars.
        if isinstance(key, tuple):
            return -1 in key
        return key == -1

    dict_ = {key: value for key, value in aggregated.items() if not _is_unknown(key)}

    mean = np.mean(list(dict_.values())) if dict_ else np.nan
    return defaultdict(lambda: mean, dict_)

In [5]:
def group_to_dict(group_key, agg_func):
    """Aggregate ``is_canceled`` per ``group_key`` over the labelled rows of ``df_all``.

    Returns a ``defaultdict`` mapping each group key to its aggregated value.
    The entry with key ``-1`` is dropped, and keys that are not present
    resolve to the mean of the remaining aggregated values.
    """
    labelled = df_all[df_all['is_canceled'].notnull()]
    aggregated = labelled.groupby(group_key)['is_canceled'].agg(agg_func).to_dict()
    aggregated.pop(-1, None)

    fallback = np.mean(list(aggregated.values()))
    return defaultdict(lambda: fallback, aggregated)

In [7]:
# Order / cancellation counts per customer, per product and per (customer, product),
# computed by group_to_dict from the labelled rows only.
dict_cnt_customer_orders = group_to_dict('customer_id', agg_func=np.size)
dict_cnt_customer_cancel = group_to_dict('customer_id', agg_func=np.sum)
dict_cnt_product_orders = group_to_dict('stock_code', agg_func=np.size)
dict_cnt_product_cancel = group_to_dict('stock_code', agg_func=np.sum)
dict_cnt_customer_produckt_orders = group_to_dict(['customer_id','stock_code'], np.size)
dict_cnt_customer_produckt_cancel = group_to_dict(['customer_id','stock_code'], np.sum)


def _cancel_share(cancel_cnt, order_cnt):
    """Element-wise cancel_cnt / order_cnt, with 0 wherever order_cnt is 0."""
    cancel_cnt = cancel_cnt.astype(float)
    order_cnt = order_cnt.astype(float)
    # Vectorised replacement for a row-wise apply(axis=1); `where` overwrites
    # the inf/NaN produced by a zero denominator with 0.
    return (cancel_cnt / order_cnt).where(order_cnt != 0, 0.0)


# customer feats
df_all['cnt_customer_orders'] = df_all['customer_id'].map(dict_cnt_customer_orders)
df_all['cnt_customer_cancel'] = df_all['customer_id'].map(dict_cnt_customer_cancel)
df_all['prc_customer_cancel'] = _cancel_share(df_all['cnt_customer_cancel'], df_all['cnt_customer_orders'])

# product feats
df_all['cnt_product_orders'] = df_all['stock_code'].map(dict_cnt_product_orders)
df_all['cnt_product_cancel'] = df_all['stock_code'].map(dict_cnt_product_cancel)
df_all['prc_product_cancel'] = _cancel_share(df_all['cnt_product_cancel'], df_all['cnt_product_orders'])

# product per customer feats (disabled)
# NOTE(review): as written these lines map 'stock_code' through the per-product
# dicts rather than the (customer, product) dicts built above. The aggregation
# cell further down lists 'cnt_customer_produckt_*' in params_to_mean, and those
# columns exist only if this section is enabled.
# df_all['cnt_customer_produckt_orders'] = df_all['stock_code'].map(dict_cnt_product_orders)
# df_all['cnt_customer_produckt_cancel'] = df_all['stock_code'].map(dict_cnt_product_cancel)

# description feats (disabled)
# NOTE(review): 'stock_word_score' is also listed in params_to_mean further down.
# df_all['stock_word_score'] = df_all['stock_code'].map(stock_code_word_score)

# date feats
df_all['invoice_year'] = df_all['invoice_date'].dt.year
df_all['invoice_month'] = df_all['invoice_date'].dt.month
df_all['invoice_day_of_m'] = df_all['invoice_date'].dt.day
df_all['invoice_day_of_y'] = df_all['invoice_date'].dt.dayofyear
df_all['invoice_hour'] = df_all['invoice_date'].dt.hour

# category feats
df_all['country__cat'] = pd.factorize(df_all['country'])[0]



# Train

In [8]:
def get_feats_X_y(df):
    """Select the model features and the target from ``df``.

    Features are all boolean / numeric columns except the black-listed
    target, prediction and bookkeeping columns. A column is also skipped
    when ``df`` contains a ``<column>_log`` counterpart, so only the
    log-transformed version is used.

    Returns
    -------
    X : numpy.ndarray
        Values of the selected feature columns.
    y : numpy.ndarray
        Values of the ``is_canceled`` column.
    feats : list of str
        Names of the selected feature columns, in column order.
    """
    # Columns that hold the target, predictions or evaluation results.
    black_list = {
        'is_canceled', 'is_test', 'total_return', 'is_canceled_pred',
        'total_return_pred', 'predict_proba', 'mse',
    }
    # The builtin ``bool`` is used instead of the ``np.bool`` alias, which
    # current NumPy releases no longer provide.
    candidates = df.select_dtypes([bool, np.number]).columns
    all_columns = set(df.columns)

    feats = [
        x for x in candidates
        if x not in black_list and x + '_log' not in all_columns
    ]

    X = df[feats].values
    y = df['is_canceled'].values

    return X, y, feats

In [9]:
def test_model(model, scoring='f1', check_distribution=False):
    """Cross-validate ``model`` on the notebook-level ``X`` / ``y`` and print the scores.

    Uses a shuffled 3-fold stratified split with a fixed seed. When
    ``check_distribution`` is true, the normalised value counts of the
    cross-validated predictions are printed as well.
    """
    folds = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)
    fold_scores = cross_val_score(model, X, y, cv=folds, scoring=scoring)
    print('scores: {}'.format(fold_scores))
    print('mean: {}, std: {} \n'.format(np.mean(fold_scores), np.std(fold_scores)))

    if not check_distribution:
        return

    oof_pred = pd.Series(cross_val_predict(model, X, y, cv=folds))
    print('y_pred value_counts: \n{}'.format(oof_pred.value_counts(normalize=True)))

        
        
def plot_model_char(model, check_confusion=True, check_feature_importance=True, check_learning_curve=True):
    """Draw diagnostic plots for ``model`` on the notebook-level ``X`` / ``y``.

    check_confusion: normalised confusion matrix of 3-fold cross-validated predictions.
    check_feature_importance: feature importances of the model fitted on all of ``X``
        (labelled with the notebook-level ``feats``).
    check_learning_curve: 3-fold learning curve scored with recall.
    """
    if check_confusion:
        folds = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)
        oof_pred = cross_val_predict(model, X, y, cv=folds)
        skplt.metrics.plot_confusion_matrix(y, oof_pred, normalize=True, figsize=(10, 10))

    needs_fitted_model = check_feature_importance or check_learning_curve
    if not needs_fitted_model:
        return

    # One fit on the full data is shared by the two remaining plots.
    model.fit(X, y)

    if check_feature_importance:
        skplt.estimators.plot_feature_importances(model, feature_names=feats, x_tick_rotation=90, figsize=(15, 5))

    if check_learning_curve:
        skplt.estimators.plot_learning_curve(model, X, y, figsize=(15, 5), cv=3, scoring='recall')

In [10]:
# Split the combined frame back into independent train / test copies.
train = df_all[~df_all['is_test']].copy()
test = df_all[df_all['is_test']].copy()

# The combined frame is no longer needed; free its memory.
del df_all
gc.collect()

# Cancellation counts are cast to float in both splits.
for frame in (train, test):
    for col in ('cnt_product_cancel', 'cnt_customer_cancel'):
        frame[col] = frame[col].astype(float)

# The label is only cast on the training rows.
train['is_canceled'] = train['is_canceled'].astype(bool)

In [11]:
# Column dtypes and non-null counts of the training rows after feature engineering.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 520142 entries, 12 to 541909
Data columns (total 24 columns):
invoice                520142 non-null int32
stock_code             520142 non-null int16
description            517903 non-null object
quantity               520142 non-null int32
invoice_date           520142 non-null datetime64[ns]
price_unit             520142 non-null float16
price_total            520142 non-null float32
customer_id            520142 non-null int16
country                520142 non-null object
is_canceled            520142 non-null bool
is_test                520142 non-null bool
cnt_customer_orders    520142 non-null float64
cnt_customer_cancel    520142 non-null float64
prc_customer_cancel    520142 non-null float64
cnt_product_orders     520142 non-null float64
cnt_product_cancel     520142 non-null float64
prc_product_cancel     520142 non-null float64
stock_word_score       520142 non-null int64
invoice_year           520142 non-null int64
invoice_

# Order approach

In [12]:
# Preview the first training rows.
train.head()

Unnamed: 0,invoice,stock_code,description,quantity,invoice_date,price_unit,price_total,customer_id,country,is_canceled,...,cnt_product_orders,cnt_product_cancel,prc_product_cancel,stock_word_score,invoice_year,invoice_month,invoice_day_of_m,invoice_day_of_y,invoice_hour,country__cat
12,2,12,DOOR MAT BLACK FLOCK,10,2009-12-01 09:06:00,5.949219,59.5,1,United Kingdom,False,...,242.0,1.0,0.004132,0,2009,12,1,335,9,0
13,2,13,LOVE BUILDING BLOCK WORD,18,2009-12-01 09:06:00,5.449219,98.099998,1,United Kingdom,False,...,890.0,11.0,0.01236,159,2009,12,1,335,9,0
14,2,14,HOME BUILDING BLOCK WORD,3,2009-12-01 09:06:00,5.949219,17.85,1,United Kingdom,False,...,1081.0,11.0,0.010176,183,2009,12,1,335,9,0
15,2,15,ASSORTED COLOUR BIRD ORNAMENT,16,2009-12-01 09:06:00,1.69043,27.040001,1,United Kingdom,False,...,1384.0,10.0,0.007225,0,2009,12,1,335,9,0
16,2,16,PEACE WOODEN BLOCK LETTERS,3,2009-12-01 09:06:00,6.949219,20.85,1,United Kingdom,False,...,185.0,3.0,0.016216,243,2009,12,1,335,9,0


In [None]:
def group_orders(df, params_to_group_by, params_to_agg, agg_fun, label_in_df):
    """Aggregate ``params_to_agg`` per ``params_to_group_by`` group.

    The group keys are returned as regular columns. When ``label_in_df`` is
    true, the aggregated ``is_canceled`` column is turned into a boolean that
    is True for groups whose aggregated value is greater than 0.
    """
    grouped = df.groupby(params_to_group_by)[params_to_agg]
    orders = grouped.agg(agg_fun).reset_index()

    if not label_in_df:
        return orders

    orders['is_canceled'] = orders['is_canceled'] > 0
    return orders

In [None]:
# Re-check the available columns before choosing what to aggregate per invoice.
train.info()

In [None]:
# Columns that identify one order (invoice).
params_to_group_by = ['invoice', 'customer_id', 'invoice_year', 'invoice_day_of_y', 'invoice_hour', 'country__cat']
# Columns summed over the lines of an order.
params_to_sum = ['stock_code', 'price_total', 'is_canceled']
# Columns averaged over the lines of an order. Some candidates are produced
# only by feature lines that are commented out in the feature cell, so the
# list is restricted to columns that actually exist; selecting a missing
# column after groupby would raise KeyError.
params_to_mean = [
    col
    for col in [
        'cnt_customer_orders', 'cnt_customer_cancel', 'prc_customer_cancel',
        'cnt_product_orders', 'cnt_product_cancel', 'prc_product_cancel',
        'stock_word_score',
        'cnt_customer_produckt_orders', 'cnt_customer_produckt_cancel', 'prc_customer_produckt_cancel',
    ]
    if col in train.columns
]

In [None]:
# Per-invoice sums and means for both splits; only the train sums carry a
# label that has to be turned into a boolean.
train_group_sum = group_orders(
    df=train, params_to_group_by=params_to_group_by,
    params_to_agg=params_to_sum, agg_fun=np.sum, label_in_df=True,
)
train_group_mean = group_orders(
    df=train, params_to_group_by=params_to_group_by,
    params_to_agg=params_to_mean, agg_fun=np.mean, label_in_df=False,
)
test_group_sum = group_orders(
    df=test, params_to_group_by=params_to_group_by,
    params_to_agg=params_to_sum, agg_fun=np.sum, label_in_df=False,
)
test_group_mean = group_orders(
    df=test, params_to_group_by=params_to_group_by,
    params_to_agg=params_to_mean, agg_fun=np.mean, label_in_df=False,
)

In [None]:
# Collect all aggregations in one frame per split: start from the mean
# frames and copy the summed columns over.
train_group, test_group = train_group_mean, test_group_mean

for param in params_to_sum:
    train_group[param] = train_group_sum[param]
    if param == 'is_canceled':
        # The label is not copied into the test frame.
        continue
    test_group[param] = test_group_sum[param]

del train_group_sum, train_group_mean, test_group_sum, test_group_mean


In [None]:
# Preview the per-invoice training frame.
train_group.head()

In [None]:
# Count- and amount-like columns that get a log1p-transformed copy.
feats_to_log = ['cnt_customer_orders', 'cnt_customer_cancel', 'cnt_product_orders', 'cnt_product_cancel', 'price_total', 'cnt_customer_produckt_orders', 'cnt_customer_produckt_cancel']

for feat in feats_to_log:
    # Candidates that are missing from the aggregated frame are skipped.
    if feat not in train_group.columns:
        continue
    log_name = feat + '_log'
    train_group[log_name] = np.log1p(train_group[feat])
    test_group[log_name] = np.log1p(test_group[feat])

In [None]:
# Histogram of every raw column; columns in feats_to_log also get the
# histogram of their log1p transform in the right-hand panel.
for feat in train_group.columns:
    if feat == 'is_canceled' or 'log' in feat:
        continue

    fig = plt.figure(figsize=(15, 5))
    raw_ax = fig.add_subplot(1, 2, 1)
    train_group[feat].hist(bins=50, ax=raw_ax)
    raw_ax.set_title(feat)

    if feat in feats_to_log:
        log_ax = fig.add_subplot(1, 2, 2)
        np.log1p(train_group[feat]).hist(bins=50, ax=log_ax)

    plt.show()

In [None]:
# Preview the per-invoice frame again, now including the *_log columns.
train_group.head()

In [None]:
# Build the feature matrix and target from the per-invoice frame and show
# which feature columns were selected.
X, y, feats = get_feats_X_y(train_group)
feats

In [None]:
# # 'learning_rate': 0.05578989271866905, 'max_depth': 14.0
#         'colsample_bytree': hp.uniform ('colsample_bytree', 0.8, 1.),
#         'subsample': hp.uniform ('subsample', 0.7, 1.),

In [None]:
# Baseline XGBoost configuration, evaluated with test_model's default scoring.
xgb_params = dict(
    max_depth=2,
    n_estimators=50,
    learning_rate=0.1,
    colsample_bytree=0.7,
    subsample=0.8,
    seed=0,
)
X, y, feats = get_feats_X_y(train_group)
model = XGBClassifier(**xgb_params)
test_model(model)

In [None]:
# Confusion matrix, feature importances and learning curve for the baseline
# model (all three checks are enabled by default).
plot_model_char(model)

In [None]:
# Permutation importance of a model fitted on the full training data;
# the importance is measured on the same X / y the model was fitted on.
X, y, feats = get_feats_X_y(train_group)
m = XGBClassifier(**xgb_params).fit(X, y)

imp = PermutationImportance(m, random_state=0)
imp.fit(X, y)
eli5.show_weights(imp, feature_names=feats)

In [None]:
def fit_and_predict_test(model, df):
    """Return 3-fold cross-validated ``is_canceled`` predictions of ``model`` for ``df``.

    The feature columns are taken from the notebook-level ``feats`` list.
    """
    features, target = df[feats], df['is_canceled']
    folds = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)
    return cross_val_predict(model, features, target, cv=folds)

In [None]:
def pred_proba_SKFold(df, xgb_params):
    """Out-of-fold probability of the positive class for every row of ``df``.

    A fresh ``XGBClassifier(**xgb_params)`` is fitted on each training part
    of a shuffled 3-fold stratified split (fixed seed) and used to predict
    the held-out part, so each row is predicted by a model that was not
    fitted on it.

    Returns
    -------
    pandas.Series
        Second column of ``predict_proba`` per row, indexed like ``df`` so
        that assigning it back to ``df`` aligns for any index.
    """
    X, y, _ = get_feats_X_y(df)
    cv = StratifiedKFold(n_splits=3, random_state=0, shuffle=True)

    # Positional buffer; every position is filled by exactly one fold.
    oof_proba = np.full(df.shape[0], np.nan)
    for train_idx, test_idx in cv.split(X, y):
        model = XGBClassifier(**xgb_params)
        model.fit(X[train_idx], y[train_idx])
        oof_proba[test_idx] = model.predict_proba(X[test_idx])[:, 1]

    return pd.Series(oof_proba, index=df.index)

In [None]:
# Score out-of-fold predictions on the training invoices: MSE between the
# actual and the predicted cancelled value of each invoice.
# treshold = None switches to the classifier's hard cross-validated predictions.
treshold = 0.5

model = XGBClassifier(**xgb_params)
if treshold is None:
    train_group['is_canceled_pred'] = fit_and_predict_test(model, train_group)
    # No probabilities exist in this mode, so the invoice value is weighted
    # by the hard prediction instead of reading a missing 'predict_proba' column.
    cancel_weight = train_group['is_canceled_pred']
else:
    train_group['predict_proba'] = pred_proba_SKFold(train_group, xgb_params)
    train_group['is_canceled_pred'] = train_group['predict_proba'] > treshold
    cancel_weight = train_group['predict_proba']

train_group['total_return'] = train_group['price_total'] * train_group['is_canceled']
train_group['total_return_pred'] = train_group['price_total'] * cancel_weight

score = mean_squared_error(train_group['total_return'], train_group['total_return_pred'])
print(score)

# score by invoice: the MSE of a single pair is its squared difference
train_group['mse'] = (train_group['total_return'] - train_group['total_return_pred']) ** 2

# train_group['right_pred'] = train_group[['is_canceled', 'is_canceled_pred']].apply(lambda x: x['is_canceled'] == x['is_canceled_pred'], axis=1)
# train_group.groupby('right_pred')['price_total'].agg(np.sum) / train_group.groupby('right_pred')['price_total'].agg(np.size)

In [None]:
# Preview the per-invoice frame with the prediction and error columns added.
train_group.head()

In [None]:
# Random sample of 20 invoices: probability, thresholded prediction and squared error.
train_group[['predict_proba', 'is_canceled_pred', 'mse']].sample(20)

In [None]:
# Allow up to 50 columns in the display, then list invoices from the largest
# squared error downwards.
pd.set_option('display.max_columns', 50)
train_group.sort_values(by='mse', ascending=False)

# Hyperparameter optimization (hyperopt)

In [None]:
def hyperOptf1(train):
    """Tune XGBoost parameters with hyperopt (TPE) on ``train``.

    Despite the name, the minimised loss is the mean squared error between
    ``price_total * out-of-fold prediction`` and ``train['total_return']``.

    Parameters
    ----------
    train : pandas.DataFrame
        Per-invoice frame; must contain ``price_total`` and ``total_return``
        in addition to the columns used by ``pred_proba_SKFold``.

    Returns
    -------
    dict
        Best point found by ``fmin`` (also printed).
    """

    def objective(space):

        xgb_params = {
            'max_depth': int(space['max_depth']),
            'learning_rate': space['learning_rate'],
            'colsample_bytree': space['colsample_bytree'],
            'subsample': space['subsample'],
            'min_child_weight': int(space['min_child_weight']),
            'n_estimators': 50,
            # NOTE(review): a regression objective is handed to XGBClassifier
            # inside pred_proba_SKFold -- confirm this is intended.
            'objective': 'reg:squarederror',
            'seed':0
        }

        # Evaluate on the frame passed in, not on the notebook-level train_group.
        predict_proba = pred_proba_SKFold(train, xgb_params)
        # Positional arrays keep the product independent of the Series indexes.
        return_pred = train['price_total'].values * np.asarray(predict_proba)
        final_score = mean_squared_error(return_pred, train['total_return'].values)

        print('final_score: {}'.format(final_score))
        return{'loss':final_score, 'status': STATUS_OK }

    space ={
        'max_depth': hp.quniform ('max_depth', 1, 20, 1),
        'colsample_bytree': hp.uniform ('colsample_bytree', 0.8, 1.),
        'subsample': hp.uniform ('subsample', 0.7, 1.),
        'learning_rate': hp.uniform ('learning_rate', 0.05, 0.3),
        'min_child_weight': hp.quniform ('min_child_weight', 1, 10, 1),
    }

    trials = Trials()
    best_params = fmin(fn=objective,
                space=space,
                algo=partial(tpe.suggest, n_startup_jobs=10),
                max_evals=50,
                trials=trials)

    print("The best params: ", best_params)
    return best_params

In [None]:
# Run the search on the per-invoice training frame; it reads the
# 'total_return' column created by the scoring cell above.
hyperOptf1(train_group)

In [None]:
# Show the feature list currently in use.
feats

In [None]:
# Hard-coded parameter set. NOTE(review): presumably copied from a hyperopt
# run; no cell visible in this notebook uses it -- confirm whether the final
# model should be trained with it.
xgb_params_opt = {'n_estimators': 50, 'seed':0, 'learning_rate': 0.05068313243959119, 'max_depth': 6, 'min_child_weight': 1}

In [None]:
# Final model: fit on all training invoices and store the probability of the
# positive class for every test invoice in test_group['is_canceled'].
# NOTE(review): this trains with xgb_params, not xgb_params_opt -- confirm which is intended.
X, y, feats = get_feats_X_y(train_group)

model = XGBClassifier(**xgb_params)
model.fit(train_group[feats], train_group['is_canceled'])
test_proba = model.predict_proba(test_group[feats])
test_group['is_canceled'] = [row[1] for row in test_proba]

# Print each feature next to its importance in the fitted model.
importances = model.feature_importances_
for f, i in zip(feats, importances):
    print(f, i)

In [None]:
# Prepare the submission: expected cancelled value per test invoice
# (invoice total times predicted cancellation probability).
test_group['total_return'] = test_group['price_total'].mul(test_group['is_canceled'])
submission = test_group.loc[:, ['invoice', 'total_return']]
submission.to_csv('../../../output/xgb_and_cnt_features.csv', index=False)