# XGBoost on GPU
## Predicting the time difference (weeks) to next repurchase

### Methods & Settings

In [25]:
from IPython.display import HTML
from IPython.display import display

# Taken from https://stackoverflow.com/questions/31517194/how-to-hide-one-specific-cell-input-or-output-in-ipython-notebook
tag = HTML('''<script>
code_show=false; 
function code_toggle() {
    if (code_show){
        $('div.cell.code_cell.rendered.selected div.input').hide();
    } else {
        $('div.cell.code_cell.rendered.selected div.input').show();
    }
    code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<a href="javascript:code_toggle()">HIDE/SHOW CONTENT</a>.''')
display(tag)

############### Write code below ##################

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import gc
import joblib
import math

import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:75% !important; }</style>"))

pd.set_option('display.max_rows', 250)
pd.set_option('display.min_rows', 100)
pd.set_option('display.max_columns', 50)

####
# prints memory usage
def show_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB\n'.format(start_mem))
    return

####
# seperates features from label (y must be last column)
def sep_X_y(df):
    X = df.iloc[:,0:-1] # extracts all rows [:] and columns from 0 to next-to-last [0:-1]
    y = df.iloc[:,-1] # extracts all rows [:] and only last column [-1]
    
    return [X, y]

####
# split training and test set from given dataframe with month as boundaries
def mth_train_test_split(df, mth_start, mth_end):
    print('Splitting dataframe...\n')
    
    # get indices from desired boundaries
    idx_start = df.month.searchsorted(mth_start_train, side='left') # list needs to be sorted already for searchsorted
    idx_end = df.month.searchsorted(mth_end_train + 1, side='left')
    
    df = df.iloc[idx_start:idx_end]
    
    return df

####
# trains XGB model (regressor)
def train_xgb(X, y):
    
    print('Fitting model...\n')
    model = XGBRegressor(tree_method='gpu_hist', gpu_id=0)
    fitted_model = model.fit(X, y)
    
    #print('Plotting feature importance for "gain". Do not rely on that.\n')
    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')
    xgb.plot_importance(model, importance_type='gain', max_num_features=25)
    plt.show()
    
    # GRAPHVIZ (software + pip package) needed for tree plotting
    #fig, ax = plt.subplots(figsize=(30, 30))
    #xgb.plot_tree(model, num_trees=0, ax=ax, rankdir='LR')
    #plt.show()
    
    return fitted_model

####
# trains LinearRegression model
def train_linReg(X, y):
    
    print('Fitting model...\n')
    model = LinearRegression()
    fitted_model = model.fit(X, y)
    
    #print('Plotting feature importance for "gain". Do not rely on that.\n')
#    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')
#     xgb.plot_importance(model, importance_type='gain', max_num_features=25)
#     plt.show()
    
    # GRAPHVIZ (software + pip package) needed for tree plotting
    #fig, ax = plt.subplots(figsize=(30, 30))
    #xgb.plot_tree(model, num_trees=0, ax=ax, rankdir='LR')
    #plt.show()
    
    return fitted_model

####
# trains XGB model (regressor)
def train_lgbm(X, y):
    
    print('Fitting model...\n')
    model = LGBMRegressor(boosting_type='gbdt', device="gpu")
    fitted_model = model.fit(X, y)
    
    print('Plotting feature importance for "gain".')
    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')
    lgb.plot_importance(model, importance_type='gain', max_num_features=25)
    plt.show()
    
    # GRAPHVIZ (software + pip package) needed for tree plotting
    #fig, ax = plt.subplots(figsize=(30, 30))
    #xgb.plot_tree(model, num_trees=0, ax=ax, rankdir='LR')
    #plt.show()
    
    return fitted_model


def train_xgb_bestHyper(X, y):
    X_train = X
    y_train = y
    space = best_hyperparams
    
    print('Fitting model...\n')
    model = XGBRegressor(tree_method='gpu_hist', gpu_id=0,
                    eta = space['eta'],
                    max_depth = int(space['max_depth']), 
                    gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),
                    min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))

    
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    fitted_model.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="rmse",
            early_stopping_rounds=10,verbose=False)
    
    print('Plotting feature importance for "gain". Do not rely on that.\n')
    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')
    xgb.plot_importance(model, importance_type='gain')
    plt.show()
    
    # GRAPHVIZ (software + pip package) needed for tree plotting
    #fig, ax = plt.subplots(figsize=(30, 30))
    #xgb.plot_tree(model, num_trees=0, ax=ax, rankdir='LR')
    #plt.show()
    
    return fitted_model

def train_dtc(X, y):
    X_train = X
    y_train = y
    
    print('Fitting model...\n')
    model = DecisionTreeRegressor()
    fitted_model = model.fit(X_train, y_train)
    
    #print('Plotting feature importance for "gain". Do not rely on that.\n')
    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')
    #xgb.plot_importance(model, importance_type='gain')
    #plt.show()
    
    # GRAPHVIZ (software + pip package) needed for tree plotting
    #fig, ax = plt.subplots(figsize=(30, 30))
    #xgb.plot_tree(model, num_trees=0, ax=ax, rankdir='LR')
    #plt.show()
    
    return fitted_model

####
# predicts labels of training and test with given model
def predict_values(model, X, y_true):
    print('Predicting values...\n')
    # predict y values
    y_pred = model.predict(X)
    
    # get msq
    model_error = mean_squared_error(y_true, y_pred)
    
    # print info about accuracies
    print(f'\t\t\t\t\t\033[1m XGboost Regressor MSE: '
          f'\033[4m\033[1m {model_error:.3f}\n')
    
    print(f'\t\t\t\t\t\033[1m XGboost Regressor RMSE: '
          f'\033[4m\033[1m {math.sqrt(model_error):.3f}')
    
    # return predicted values
    return y_pred

####
# concatenates prediction with actual target for evaluation
def concat_ytrue_ypred(X, y_true, y_pred):
    # create dataframe from test-prediction with index from X_test
    df_y_pred = pd.DataFrame(y_pred, columns=['nextBuyIn_pred'], index=X.index, dtype=np.int8)

    # concatenate X, y, y_pred (put columns next to each other)
    df_eval = pd.concat([X, y_true, df_y_pred], axis=1)
    
    return df_eval

####
# executes all needed functions of the above with given training and test data and provided train method
# def execute_pipeline(train_method, df, start_mth, end_mth):
#     b = list_of_four_df_boundaries
#     # split dataframe in train/test and X/y
#     X_train, y_train, X_test, y_test = dt_train_test_split(df, b[0], b[1], b[2], b[3])
    
#     #train model
#     model = train_method(X_train, y_train)    
    
#     # make predictions
#     pred_train, pred_test = predict_values(model, X_train, y_train, X_test, y_test)
    
#     print('\nExecuted pipeline.\nEvaluate with "evaluate_pred(X, y, y_pred)"\n')
#     return [pred_train, pred_test, X_train, y_train, X_test, y_test]

# <font color='purple'>Predicting Weeks w/o normalization + categories multihot (Train/Test)</color>


In [49]:
train = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220625_complete_feature-list_orderhistory_trainingOhneNull.csv'
predset = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220625_complete_feature-list_orderhistory_testNurNull.csv'

columns = [#'date',
           'userID', 
           'itemID',
           'order', 
           'brand', 
           'feature_1', 
           'feature_2', 
           'feature_3', 
           'feature_4', 
           'feature_5',
           'categories',
           'brandOrderRatio',
           'feature1OrderRatio',
           'feature2OrderRatio',
           'feature3OrderRatio',
           'feature4OrderRatio',
           'feature5OrderRatio',
           'TotalBFscore',
           'RCP',
           'MeanDiffToNxt(user)',
           'TotalItemOrders(user)',
           #'TotalItemOrders(item)',
           'date(year)',
           'date(month)',
           'date(weekOfMonth)',
           'date(dayOfMonth)',
           'date(weekOfYear)',
           'date(dayOfYear)',
           #'nextBuyInWeeks(round)', # label
           'nextBuyInWeeks(floor)', # label
           #'nextBuyInWeekOfYear' # label; schlechte idee
          ]

dtype = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         'date(dayOfYear)':np.uint8,
         'nextBuyInWeeks(floor)':np.uint8
        }

label = 'nextBuyInWeeks(floor)'

## Preparation

In [50]:
df_train = pd.read_csv(train, sep='|', usecols=columns, dtype=dtype, nrows=None, converters={
    'categories': lambda x: [int(i) for i in x[1:-1].split(',')]
})

df_test = pd.read_csv(predset, sep='|', usecols=columns, dtype=dtype, nrows=None, converters={
    'categories': lambda x: [int(i) for i in x[1:-1].split(',')]
})

# add fake column for ensuring all categories from 0 to 4299 are included
df_train.loc[len(df_train)] = [0 if column != 'categories' else [cat for cat in range(0,4300)] for column in df_train.columns]
df_train.index = df_train.index + 1  # add index

df_test.loc[len(df_test)] = [0 if column != 'categories' else [cat for cat in range(0,4300)] for column in df_test.columns]
df_test.index = df_test.index + 1  # add index

df = df_train
show_mem_usage(df)

Memory usage of dataframe is 37.41 MB



In [51]:
# multi-hot-encode categories
cats = df["categories"]
mlb = MultiLabelBinarizer(sparse_output=False) # Set to True if output binary array is desired in CSR sparse format
df_multi_hot = pd.DataFrame(mlb.fit_transform(cats), columns=mlb.classes_, index=df.index, dtype=np.int8).astype(pd.SparseDtype(np.uint8,0)) # NaN filled with 0

# drop fake rows from both dataframes (last row) & drop category '9999' standing for missing category
df_multi_hot.drop(index=df.index[-1], axis=0, inplace=True)
df_multi_hot = df_multi_hot.iloc[:,:-1]
df.drop(index=df.index[-1], axis=0, inplace=True)

# join new binarized columns with rest of dataframe
df = df.join(df_multi_hot, how='inner')

if (len(df[df.isnull().any(axis=1)]) > 0):
    raise RuntimeError('Join of multi-hot-encoded categories probably created missing values.')

# drop list of categories, since it's not needed anymore
df.drop('categories', axis=1, inplace=True)

# pop and append 'week' at end of dataframe
col = df.pop(label)
df.insert(len(df.columns), col.name, col)

del df_multi_hot
gc.collect()

df

Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(weekOfMonth),date(dayOfMonth),date(weekOfYear),date(dayOfYear),...,4276,4277,4278,4279,4280,4281,4282,4283,4284,4285,4286,4287,4288,4289,4290,4291,4292,4293,4294,4295,4296,4297,4298,4299,nextBuyInWeeks(floor)
1,76,23050,1,1411,4,0,22,0,151,0.007899,0.466804,0.826492,0.008540,0.640224,0.018705,0.940897,0.259374,2,169.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24
2,116,9408,1,322,4,0,536,0,144,0.012288,0.466804,0.826492,0.034208,0.640224,0.086085,0.988336,0.040000,3,114.500000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22
3,116,25677,1,322,4,0,536,0,144,0.012288,0.466804,0.826492,0.034208,0.640224,0.086085,0.988336,0.207143,3,114.500000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22
4,135,13660,1,157,4,0,513,0,137,0.010361,0.466804,0.826492,0.004528,0.640224,0.005142,0.933540,0.055556,2,32.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
5,135,22174,1,504,10,0,441,3,84,0.005653,0.369146,0.826492,0.005184,0.334600,0.050564,0.757337,0.454545,3,40.500000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
6,202,26940,1,1258,4,0,487,3,44,0.001213,0.466804,0.826492,0.024224,0.334600,0.059138,0.816166,0.123867,3,77.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11
7,240,7318,1,1335,6,0,421,3,6,0.000549,0.152436,0.826492,0.021874,0.334600,0.002002,0.633827,0.279570,2,71.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10
8,240,26645,1,648,10,0,358,3,24,0.001713,0.369146,0.826492,0.009509,0.334600,0.004315,0.735008,0.106439,2,71.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10
9,244,10341,1,1025,6,0,198,0,17,0.002099,0.152436,0.826492,0.000869,0.640224,0.078879,0.810581,0.228986,3,109.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15
10,276,15667,1,1201,4,0,30,0,163,0.012826,0.466804,0.826492,0.001585,0.640224,0.017563,0.939354,0.138325,8,51.750000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8


In [52]:
# save column names
column_headers = list(df.columns)

In [53]:
# split DF in X & y
X_train, y_train = sep_X_y(df)
X_train

Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(weekOfMonth),date(dayOfMonth),date(weekOfYear),date(dayOfYear),...,4275,4276,4277,4278,4279,4280,4281,4282,4283,4284,4285,4286,4287,4288,4289,4290,4291,4292,4293,4294,4295,4296,4297,4298,4299
1,76,23050,1,1411,4,0,22,0,151,0.007899,0.466804,0.826492,0.008540,0.640224,0.018705,0.940897,0.259374,2,169.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,116,9408,1,322,4,0,536,0,144,0.012288,0.466804,0.826492,0.034208,0.640224,0.086085,0.988336,0.040000,3,114.500000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,116,25677,1,322,4,0,536,0,144,0.012288,0.466804,0.826492,0.034208,0.640224,0.086085,0.988336,0.207143,3,114.500000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,135,13660,1,157,4,0,513,0,137,0.010361,0.466804,0.826492,0.004528,0.640224,0.005142,0.933540,0.055556,2,32.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,135,22174,1,504,10,0,441,3,84,0.005653,0.369146,0.826492,0.005184,0.334600,0.050564,0.757337,0.454545,3,40.500000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,202,26940,1,1258,4,0,487,3,44,0.001213,0.466804,0.826492,0.024224,0.334600,0.059138,0.816166,0.123867,3,77.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,240,7318,1,1335,6,0,421,3,6,0.000549,0.152436,0.826492,0.021874,0.334600,0.002002,0.633827,0.279570,2,71.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,240,26645,1,648,10,0,358,3,24,0.001713,0.369146,0.826492,0.009509,0.334600,0.004315,0.735008,0.106439,2,71.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,244,10341,1,1025,6,0,198,0,17,0.002099,0.152436,0.826492,0.000869,0.640224,0.078879,0.810581,0.228986,3,109.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10,276,15667,1,1201,4,0,30,0,163,0.012826,0.466804,0.826492,0.001585,0.640224,0.017563,0.939354,0.138325,8,51.750000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Training & Prediction

## XGBoost Regressor

Pipeline needs training method, dataframe and dates to split dataframe in training and test set.

In [54]:
model = train_linReg(X_train, y_train)
#model = train_dtc(X_train, y_train)

Fitting model...



In [55]:
y_pred = predict_values(model, X_train, y_train)

Predicting values...

					[1m XGboost Regressor MSE: [4m[1m 6.192

					[1m XGboost Regressor RMSE: [4m[1m 2.488


### Evaluation

In [56]:
dtype_X = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         'date(dayOfYear)':np.uint8
        }
dtype_y = {'nextBuyInWeeks(floor)':np.uint8}

y_pred = pd.DataFrame(y_pred, index=y_train.index).apply(lambda x: round(x)).astype(np.uint8)

y_pred.set_axis(['nextBuyIn_pred'], axis=1,inplace=True)

In [57]:
# concatenate X, y, y_pred (columns next to each other)
df_eval = pd.concat([X_train, y_train, y_pred], axis=1)
df_eval

Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(weekOfMonth),date(dayOfMonth),date(weekOfYear),date(dayOfYear),...,4277,4278,4279,4280,4281,4282,4283,4284,4285,4286,4287,4288,4289,4290,4291,4292,4293,4294,4295,4296,4297,4298,4299,nextBuyInWeeks(floor),nextBuyIn_pred
1,76,23050,1,1411,4,0,22,0,151,0.007899,0.466804,0.826492,0.008540,0.640224,0.018705,0.940897,0.259374,2,169.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,24
2,116,9408,1,322,4,0,536,0,144,0.012288,0.466804,0.826492,0.034208,0.640224,0.086085,0.988336,0.040000,3,114.500000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,16
3,116,25677,1,322,4,0,536,0,144,0.012288,0.466804,0.826492,0.034208,0.640224,0.086085,0.988336,0.207143,3,114.500000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,16
4,135,13660,1,157,4,0,513,0,137,0.010361,0.466804,0.826492,0.004528,0.640224,0.005142,0.933540,0.055556,2,32.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,5
5,135,22174,1,504,10,0,441,3,84,0.005653,0.369146,0.826492,0.005184,0.334600,0.050564,0.757337,0.454545,3,40.500000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,6
6,202,26940,1,1258,4,0,487,3,44,0.001213,0.466804,0.826492,0.024224,0.334600,0.059138,0.816166,0.123867,3,77.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,11
7,240,7318,1,1335,6,0,421,3,6,0.000549,0.152436,0.826492,0.021874,0.334600,0.002002,0.633827,0.279570,2,71.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,10
8,240,26645,1,648,10,0,358,3,24,0.001713,0.369146,0.826492,0.009509,0.334600,0.004315,0.735008,0.106439,2,71.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,11
9,244,10341,1,1025,6,0,198,0,17,0.002099,0.152436,0.826492,0.000869,0.640224,0.078879,0.810581,0.228986,3,109.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,16
10,276,15667,1,1201,4,0,30,0,163,0.012826,0.466804,0.826492,0.001585,0.640224,0.017563,0.939354,0.138325,8,51.750000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,8


In [58]:
rowcount = len(df_eval)
should = rowcount
is_ = len(df_eval.loc[(df_eval['nextBuyInWeeks(floor)'] == df_eval.nextBuyIn_pred)]) 

print(f'\033[1mrow count of set:\t\t\t\t {rowcount}')
print(f'\033[1mrows where label was predicted correctly:\t {is_} \t ({is_/should*100:.3f} % of rows)')

[1mrow count of set:				 175112
[1mrows where label was predicted correctly:	 70676 	 (40.360 % of rows)


## Prediction on Predictionset

In [59]:
# multi-hot-encode categories
cats = df_test["categories"]
mlb = MultiLabelBinarizer(sparse_output=False) # Set to True if output binary array is desired in CSR sparse format
df_multi_hot = pd.DataFrame(mlb.fit_transform(cats), columns=mlb.classes_, index=df_test.index, dtype=np.int8).astype(pd.SparseDtype(np.uint8,0)) # NaN filled with 0

# drop fake rows from both dataframes (last row) & drop category '9999' standing for missing category
df_multi_hot.drop(index=df_test.index[-1], axis=0, inplace=True)
df_multi_hot = df_multi_hot.iloc[:,:-1]
df_test.drop(index=df_test.index[-1], axis=0, inplace=True)

# join new binarized columns with rest of dataframe
df_test = df_test.join(df_multi_hot, how='inner')

if (len(df_test[df_test.isnull().any(axis=1)]) > 0):
    raise RuntimeError('Join of multi-hot-encoded categories probably created missing values.')

# drop list of categories, since it's not needed anymore
df_test.drop('categories', axis=1, inplace=True)

# pop and append 'week' at end of dataframe
col = df_test.pop(label)
df_test.insert(len(df_test.columns), col.name, col)

del df_multi_hot
gc.collect()

df_test

Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(weekOfMonth),date(dayOfMonth),date(weekOfYear),date(dayOfYear),...,4276,4277,4278,4279,4280,4281,4282,4283,4284,4285,4286,4287,4288,4289,4290,4291,4292,4293,4294,4295,4296,4297,4298,4299,nextBuyInWeeks(floor)
1,4,18860,1,603,10,0,536,3,147,0.001617,0.369146,0.826492,0.034208,0.334600,0.004699,0.747174,0.029126,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4,30779,1,406,10,1,503,3,17,0.010606,0.369146,0.100607,0.062924,0.334600,0.078879,0.448239,0.078261,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,20,18613,2,1111,4,3,444,3,11,0.011587,0.466804,0.056881,0.005516,0.334600,0.003090,0.410125,0.000000,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,55,9547,1,671,10,0,506,0,17,0.001884,0.369146,0.826492,0.004327,0.640224,0.078879,0.917668,0.090395,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,55,10844,1,1180,10,0,192,0,96,0.021251,0.369146,0.826492,0.002211,0.640224,0.002630,0.888944,0.047619,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,55,17912,1,342,6,0,190,0,96,0.002499,0.152436,0.826492,0.000650,0.640224,0.002630,0.773546,0.113636,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,55,24763,1,186,6,0,207,0,17,0.042279,0.152436,0.826492,0.004937,0.640224,0.078879,0.832124,0.087538,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,76,2787,1,1324,10,0,421,3,3,0.008698,0.369146,0.826492,0.021874,0.334600,0.023124,0.753586,0.182783,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,76,26645,1,648,10,0,358,3,24,0.001713,0.369146,0.826492,0.009509,0.334600,0.004315,0.735008,0.106439,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10,89,6287,1,1455,6,2,455,0,122,0.000002,0.152436,0.016020,0.002230,0.640224,0.028051,0.390886,0.000000,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [60]:
X_test, y_test = sep_X_y(df_test)

In [61]:
y_pred = model.predict(X_test)

In [62]:
dtype_X = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         'date(dayOfYear)':np.uint8
        }
dtype_y = {'nextBuyInWeeks(floor)':np.uint8}

y_pred = pd.DataFrame(y_pred, index=y_test.index).apply(lambda x: round(x)).astype(np.uint8)

y_pred.set_axis(['nextBuyIn_pred'], axis=1,inplace=True)

In [63]:
# concatenate X, y, y_pred (columns next to each other)
df_eval = pd.concat([X_test, y_test, y_pred], axis=1)
df_eval

Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(weekOfMonth),date(dayOfMonth),date(weekOfYear),date(dayOfYear),...,4277,4278,4279,4280,4281,4282,4283,4284,4285,4286,4287,4288,4289,4290,4291,4292,4293,4294,4295,4296,4297,4298,4299,nextBuyInWeeks(floor),nextBuyIn_pred
1,4,18860,1,603,10,0,536,3,147,0.001617,0.369146,0.826492,0.034208,0.334600,0.004699,0.747174,0.029126,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,4,30779,1,406,10,1,503,3,17,0.010606,0.369146,0.100607,0.062924,0.334600,0.078879,0.448239,0.078261,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,20,18613,2,1111,4,3,444,3,11,0.011587,0.466804,0.056881,0.005516,0.334600,0.003090,0.410125,0.000000,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,55,9547,1,671,10,0,506,0,17,0.001884,0.369146,0.826492,0.004327,0.640224,0.078879,0.917668,0.090395,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5,55,10844,1,1180,10,0,192,0,96,0.021251,0.369146,0.826492,0.002211,0.640224,0.002630,0.888944,0.047619,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
6,55,17912,1,342,6,0,190,0,96,0.002499,0.152436,0.826492,0.000650,0.640224,0.002630,0.773546,0.113636,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
7,55,24763,1,186,6,0,207,0,17,0.042279,0.152436,0.826492,0.004937,0.640224,0.078879,0.832124,0.087538,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
8,76,2787,1,1324,10,0,421,3,3,0.008698,0.369146,0.826492,0.021874,0.334600,0.023124,0.753586,0.182783,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9,76,26645,1,648,10,0,358,3,24,0.001713,0.369146,0.826492,0.009509,0.334600,0.004315,0.735008,0.106439,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
10,89,6287,1,1455,6,2,455,0,122,0.000002,0.152436,0.016020,0.002230,0.640224,0.028051,0.390886,0.000000,0,0.000000,2020,6,1,1,23,153,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [71]:
df_eval.loc[df_eval['date(month)'] == 12].tail()

Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(weekOfMonth),date(dayOfMonth),date(weekOfYear),date(dayOfYear),...,4277,4278,4279,4280,4281,4282,4283,4284,4285,4286,4287,4288,4289,4290,4291,4292,4293,4294,4295,4296,4297,4298,4299,nextBuyInWeeks(floor),nextBuyIn_pred
733135,46095,8385,8,18,10,0,345,0,95,0.042962,0.369146,0.826492,0.048861,0.640224,0.036723,0.938825,0.110638,0,0.0,2020,12,5,31,53,110,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255
733136,46095,20150,8,18,10,0,345,0,84,0.042962,0.369146,0.826492,0.048861,0.640224,0.050564,0.945564,0.202006,24,36.0,2020,12,5,31,53,110,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
733137,46095,29657,8,18,10,0,345,0,84,0.042962,0.369146,0.826492,0.048861,0.640224,0.050564,0.945564,0.283543,26,24.0,2020,12,5,31,53,110,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
733138,46101,10756,1,1371,4,0,487,3,44,0.012051,0.466804,0.826492,0.024224,0.3346,0.059138,0.821443,0.106383,0,0.0,2020,12,5,31,53,110,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255
733139,46115,22987,1,1366,10,0,537,0,3,0.004046,0.369146,0.826492,0.007201,0.640224,0.023124,0.892975,0.142365,0,0.0,2020,12,5,31,53,110,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255


In [72]:
rowcount = len(df_eval)
should = rowcount
is_ = len(df_eval.loc[(df_eval['nextBuyInWeeks(floor)'] == df_eval.nextBuyIn_pred)]) 

print(f'\033[1mrow count of set:\t\t\t\t {rowcount}')
print(f'\033[1mrows where label was predicted correctly:\t {is_} \t ({is_/should*100:.3f} % of rows)')

[1mrow count of set:				 896426
[1mrows where label was predicted correctly:	 389679 	 (43.470 % of rows)


---

In [73]:
df_eval['weekOfYear_pred'] = (df_eval['date(weekOfYear)'] + df_eval['nextBuyIn_pred']) % 53
df_eval.tail(100)

Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(weekOfMonth),date(dayOfMonth),date(weekOfYear),date(dayOfYear),...,4278,4279,4280,4281,4282,4283,4284,4285,4286,4287,4288,4289,4290,4291,4292,4293,4294,4295,4296,4297,4298,4299,nextBuyInWeeks(floor),nextBuyIn_pred,weekOfYear_pred
896327,45685,637,2,6,4,0,131,0,144,0.022353,0.466804,0.826492,0.010412,0.640224,0.086085,0.981651,0.20802,0,0.0,2021,1,4,31,4,31,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,253,45
896328,45685,19515,3,1180,4,0,291,0,44,0.021251,0.466804,0.826492,0.035373,0.640224,0.059138,0.980147,0.137615,7,60.5,2021,1,4,31,4,31,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,10
896329,45685,24890,2,1445,4,0,-1,-1,44,0.01468,0.466804,0.826492,0.0,0.0,0.059138,0.648024,0.180798,4,60.5,2021,1,4,31,4,31,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,10
896330,45685,31073,1,1126,4,0,291,3,129,0.01191,0.466804,0.826492,0.035373,0.3346,0.002568,0.79926,0.125326,3,60.5,2021,1,4,31,4,31,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,10
896331,45687,2446,1,539,10,0,4,3,84,0.028242,0.369146,0.826492,0.003698,0.3346,0.050564,0.767612,0.158107,0,0.0,2021,1,4,31,4,31,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,253,45
896332,45687,16972,1,325,6,0,166,0,122,0.011482,0.152436,0.826492,0.003433,0.640224,0.028051,0.791651,0.128259,0,0.0,2021,1,4,31,4,31,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,253,45
896333,45699,3371,2,618,4,0,399,3,140,0.018579,0.466804,0.826492,0.006762,0.3346,0.005138,0.789829,0.0,0,0.0,2021,1,4,31,4,31,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,254,46
896334,45699,5349,1,225,10,0,395,0,127,0.008169,0.369146,0.826492,0.01145,0.640224,0.006108,0.888767,0.156842,0,0.0,2021,1,4,31,4,31,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,254,46
896335,45699,6748,1,393,10,0,492,3,121,0.004156,0.369146,0.826492,0.001039,0.3346,0.003068,0.731466,0.220339,8,34.2,2021,1,4,31,4,31,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,6
896336,45699,22171,1,1445,3,1,-1,-1,-1,0.01468,0.0054,0.100607,0.0,0.0,0.0,0.041183,0.0,0,0.0,2021,1,4,31,4,31,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,253,45


In [90]:
df_final = pd.DataFrame()
df_final['userID'] = df_eval['userID']
df_final['itemID'] = df_eval['itemID']
df_final['year'] = df_eval['date(year)']
df_final['month'] = df_eval['date(month)']
df_final['day'] = df_eval['date(dayOfMonth)']
df_final['weekOfYear'] = df_eval['date(weekOfYear)']
df_final['nextBuyIn_pred'] = df_eval['nextBuyIn_pred']
df_final['weekOfYear_pred'] = df_eval['weekOfYear_pred']
df_final['meanDiffWeeks'] = df_eval['MeanDiffToNxt(user)'].apply(lambda x: round(x/7))

In [91]:
df_final.loc[df_final.userID == 0]

Unnamed: 0,userID,itemID,year,month,day,weekOfYear,nextBuyIn_pred,weekOfYear_pred,meanDiffWeeks
199516,0,12468,2020,8,3,32,0,32,0
199517,0,15083,2020,8,3,32,0,32,0
249706,0,12505,2020,8,18,34,0,34,0
294628,0,1505,2020,9,1,36,0,36,0
411327,0,26387,2020,10,9,41,0,41,0
551150,0,9325,2020,11,20,47,255,37,0
615848,0,31683,2020,12,4,49,255,39,0
615849,0,31923,2020,12,4,49,255,39,0
648664,0,20664,2020,12,11,50,12,9,14
802369,0,6446,2021,1,15,2,2,4,5


In [92]:
subm_path = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\submission.csv'
df_submission = pd.read_csv(subm_path, sep='|')
df_submission

Unnamed: 0,userID,itemID,prediction
0,0,20664,
1,0,28231,
2,13,2690,
3,15,1299,
4,15,20968,
5,20,8272,
6,24,11340,
7,34,21146,
8,34,31244,
9,46,31083,


In [93]:
df_submission = df_submission.merge(df_final, how='left', on=['userID', 'itemID'])
df_submission.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   userID           10000 non-null  int64  
 1   itemID           10000 non-null  int64  
 2   prediction       0 non-null      float64
 3   year             10000 non-null  int64  
 4   month            10000 non-null  int64  
 5   day              10000 non-null  int64  
 6   weekOfYear       10000 non-null  int64  
 7   nextBuyIn_pred   10000 non-null  uint8  
 8   weekOfYear_pred  10000 non-null  int64  
 9   meanDiffWeeks    10000 non-null  int64  
dtypes: float64(1), int64(8), uint8(1)
memory usage: 791.0 KB


In [94]:
df_submission

Unnamed: 0,userID,itemID,prediction,year,month,day,weekOfYear,nextBuyIn_pred,weekOfYear_pred,meanDiffWeeks
0,0,20664,,2020,12,11,50,12,9,14
1,0,28231,,2021,1,25,4,2,6,5
2,13,2690,,2020,12,24,52,8,7,10
3,15,1299,,2021,1,14,2,3,5,6
4,15,20968,,2021,1,25,4,2,6,5
5,20,8272,,2020,10,27,44,8,52,8
6,24,11340,,2020,12,27,52,10,9,11
7,34,21146,,2020,11,13,46,7,0,8
8,34,31244,,2021,1,13,2,8,10,11
9,46,31083,,2021,1,6,1,9,10,12


In [95]:
def getFebWeek(weekOfYear):
    w = weekOfYear
    if w == 5:
        return 1
    elif w == 6:
        return 2
    elif w == 7:
        return 3
    elif w == 8:
        return 4
    else:
        return 0

In [96]:
df_submission['prediction'] = df_submission['weekOfYear_pred'].apply(getFebWeek)
df_submission

Unnamed: 0,userID,itemID,prediction,year,month,day,weekOfYear,nextBuyIn_pred,weekOfYear_pred,meanDiffWeeks
0,0,20664,0,2020,12,11,50,12,9,14
1,0,28231,2,2021,1,25,4,2,6,5
2,13,2690,3,2020,12,24,52,8,7,10
3,15,1299,1,2021,1,14,2,3,5,6
4,15,20968,2,2021,1,25,4,2,6,5
5,20,8272,0,2020,10,27,44,8,52,8
6,24,11340,0,2020,12,27,52,10,9,11
7,34,21146,0,2020,11,13,46,7,0,8
8,34,31244,0,2021,1,13,2,8,10,11
9,46,31083,0,2021,1,6,1,9,10,12


In [97]:
path = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220625_submission_01.csv'
df_submission.to_csv(path, index=False, sep='|')

In [88]:
len(df_submission.loc[df_submission['prediction'] != 0])

2888

In [78]:
duplicateRows = df_submission[df_submission.duplicated(['userID', 'itemID'])]

In [79]:
duplicateRows

Unnamed: 0,userID,itemID,prediction,year,month,day,weekOfYear,nextBuyIn_pred,weekOfYear_pred
