### Methods & Settings

In [1]:
from IPython.display import HTML
from IPython.display import display

# Taken from https://stackoverflow.com/questions/31517194/how-to-hide-one-specific-cell-input-or-output-in-ipython-notebook
tag = HTML('''<script>
code_show=false; 
function code_toggle() {
    if (code_show){
        $('div.cell.code_cell.rendered.selected div.input').hide();
    } else {
        $('div.cell.code_cell.rendered.selected div.input').show();
    }
    code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<a href="javascript:code_toggle()">HIDE/SHOW CONTENT</a>.''')
display(tag)

############### Write code below ##################

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import gc
import joblib
import math

import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:75% !important; }</style>"))

pd.set_option('display.max_rows', 250)
pd.set_option('display.min_rows', 25)
pd.set_option('display.max_columns', 50)

####
# prints memory usage
def show_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB\n'.format(start_mem))
    return

####
# seperates features from label (y must be last column)
def sep_X_y(df):
    X = df.iloc[:,0:-1] # extracts all rows [:] and columns from 0 to next-to-last [0:-1]
    y = df.iloc[:,-1] # extracts all rows [:] and only last column [-1]
    
    return [X, y]

####
# split training and test set from given dataframe with month as boundaries
def mth_train_test_split(df, mth_start, mth_end):
    print('Splitting dataframe...\n')
    
    # get indices from desired boundaries
    idx_start = df.month.searchsorted(mth_start_train, side='left') # list needs to be sorted already for searchsorted
    idx_end = df.month.searchsorted(mth_end_train + 1, side='left')
    
    df = df.iloc[idx_start:idx_end]
    
    return df

####
# trains XGB model (regressor)
def train_xgb(X, y):
    
    print('Fitting model...\n')
    model = XGBRegressor(tree_method='gpu_hist', gpu_id=0)
    fitted_model = model.fit(X, y)
    
    #print('Plotting feature importance for "gain". Do not rely on that.\n')
    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')
    xgb.plot_importance(model, importance_type='gain', max_num_features=25)
    plt.show()
    
    # GRAPHVIZ (software + pip package) needed for tree plotting
    #fig, ax = plt.subplots(figsize=(30, 30))
    #xgb.plot_tree(model, num_trees=0, ax=ax, rankdir='LR')
    #plt.show()
    
    return fitted_model

####
# trains LinearRegression model
def train_linReg(X, y):
    
    print('Fitting model...')
    model = LinearRegression()
    fitted_model = model.fit(X, y)
    print('Done!')
    
    #print('Plotting feature importance for "gain". Do not rely on that.\n')
#    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')
#     xgb.plot_importance(model, importance_type='gain', max_num_features=25)
#     plt.show()
    
    # GRAPHVIZ (software + pip package) needed for tree plotting
    #fig, ax = plt.subplots(figsize=(30, 30))
    #xgb.plot_tree(model, num_trees=0, ax=ax, rankdir='LR')
    #plt.show()
    
    return fitted_model

####
# trains XGB model (regressor)
def train_lgbm(X, y):
    
    print('Fitting model...\n')
    model = LGBMRegressor(boosting_type='gbdt', device="gpu")
    fitted_model = model.fit(X, y)
    
    print('Plotting feature importance for "gain".')
    print('https://towardsdatascience.com/interpretable-machine-learning-with-xgboost-9ec80d148d27\n')
    lgb.plot_importance(model, importance_type='gain', max_num_features=25)
    plt.show()
    
    # GRAPHVIZ (software + pip package) needed for tree plotting
    #fig, ax = plt.subplots(figsize=(30, 30))
    #xgb.plot_tree(model, num_trees=0, ax=ax, rankdir='LR')
    #plt.show()
    
    return fitted_model

####
# predicts labels of training and test with given model
def predict_values(model, X, y_true):
    print('Predicting values...')
    # predict y values
    y_pred = model.predict(X)
    print('Done!\n')
    
    # get msq
    model_error = mean_squared_error(y_true, y_pred)
    
    # print info about accuracies
    print(f'\t\t\t\t\t\033[1m XGboost Regressor MSE: '
          f' {model_error:.3f}')
    
    print(f'\t\t\t\t\t\033[1m XGboost Regressor RMSE: '
          f' {math.sqrt(model_error):.3f}')
    
    # return predicted values
    return y_pred

####
# concatenates prediction with actual target for evaluation
def concat_ytrue_ypred(X, y_true, y_pred):
    # create dataframe from test-prediction with index from X_test
    df_y_pred = pd.DataFrame(y_pred, columns=['nextBuyIn_pred'], index=X.index, dtype=np.int8)

    # concatenate X, y, y_pred (put columns next to each other)
    df_eval = pd.concat([X, y_true, df_y_pred], axis=1)
    
    return df_eval

####
# executes all needed functions of the above with given training and test data and provided train method
# def execute_pipeline(train_method, df, start_mth, end_mth):
#     b = list_of_four_df_boundaries
#     # split dataframe in train/test and X/y
#     X_train, y_train, X_test, y_test = dt_train_test_split(df, b[0], b[1], b[2], b[3])
    
#     #train model
#     model = train_method(X_train, y_train)    
    
#     # make predictions
#     pred_train, pred_test = predict_values(model, X_train, y_train, X_test, y_test)
    
#     print('\nExecuted pipeline.\nEvaluate with "evaluate_pred(X, y, y_pred)"\n')
#     return [pred_train, pred_test, X_train, y_train, X_test, y_test]

ModuleNotFoundError: No module named 'lightgbm'

# <font color='purple'>Predicting Weeks w/o normalization + categories multihot (Train/Test)</color>


In [18]:
train = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\17-1_220624_4TimeRepurchaser_train.csv'
test = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\17-2_220624_4TimeRepurchaser_test.csv'

columns = [#'date',
           'userID', 
           'itemID',
           'order', 
           'brand', 
           'feature_1', 
           'feature_2', 
           'feature_3', 
           'feature_4', 
           'feature_5',
           'categories',
           'brandOrderRatio',
           'feature1OrderRatio',
           'feature2OrderRatio',
           'feature3OrderRatio',
           'feature4OrderRatio',
           'feature5OrderRatio',
           'TotalBFscore',
           'RCP',
           'MeanDiffToNxt(user)',
           'TotalItemOrders(user)',
           #'TotalItemOrders(item)',
           'date(year)',
           'date(month)',
           #'date(weekOfMonth)',
           'date(dayOfMonth)',
           'date(weekOfYear)',
           'date(dayOfYear)',
           #'nextBuyInWeeks(round)', # label
           'nextBuyInWeeks(floor)', # label
           #'nextBuyInWeekOfYear' # label; schlechte idee
          ]

dtype = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         'date(dayOfYear)':np.uint16,
         'nextBuyInWeeks(floor)':np.uint8
        }

label = 'nextBuyInWeeks(floor)'

df_train = pd.read_csv(train, sep='|', usecols=columns, dtype=dtype, nrows=None, converters={
    'categories': lambda x: [int(i) for i in x[1:-1].split(',')]
})

df_test = pd.read_csv(test, sep='|', usecols=columns, dtype=dtype, nrows=None, converters={
    'categories': lambda x: [int(i) for i in x[1:-1].split(',')]
})

# add fake column for ensuring all categories from 0 to 4299 are included
df_train.loc[len(df_train)] = [0 if column != 'categories' else [cat for cat in range(0,4300)] for column in df_train.columns]
df_train.index = df_train.index + 1  # add index

df_test.loc[len(df_test)] = [0 if column != 'categories' else [cat for cat in range(0,4300)] for column in df_test.columns]
df_test.index = df_test.index + 1  # add index

df = df_train

In [19]:
## Preparation

In [20]:
# multi-hot-encode categories
cats = df["categories"]
mlb = MultiLabelBinarizer(sparse_output=False) # Set to True if output binary array is desired in CSR sparse format
df_multi_hot = pd.DataFrame(mlb.fit_transform(cats), columns=mlb.classes_, index=df.index, dtype=np.int8).astype(pd.SparseDtype(np.uint8,0)) # NaN filled with 0

# drop fake rows from both dataframes (last row) & drop category '9999' standing for missing category
df_multi_hot.drop(index=df.index[-1], axis=0, inplace=True)
df_multi_hot = df_multi_hot.iloc[:,:-1]
df.drop(index=df.index[-1], axis=0, inplace=True)

# join new binarized columns with rest of dataframe
df = df.join(df_multi_hot, how='inner')

if (len(df[df.isnull().any(axis=1)]) > 0):
    raise RuntimeError('Join of multi-hot-encoded categories probably created missing values.')

# drop list of categories, since it's not needed anymore
df.drop('categories', axis=1, inplace=True)

# pop and append 'week' at end of dataframe
col = df.pop(label)
df.insert(len(df.columns), col.name, col)

del df_multi_hot
gc.collect()

#df

0

In [21]:
# save column names
column_headers = list(df.columns)

# split DF in X & y
X_train, y_train = sep_X_y(df)
#X_train

In [22]:
## Training & Prediction

In [23]:
## Linear Regression

Pipeline needs training method, dataframe and dates to split dataframe in training and test set.

In [24]:
model = train_linReg(X_train, y_train)
#model = train_dtc(X_train, y_train)

y_pred = predict_values(model, X_train, y_train)

Fitting model...
Done!
Predicting values...
Done!

					[1m XGboost Regressor MSE:  7.520
					[1m XGboost Regressor RMSE:  2.742


In [25]:
### Evaluation

In [26]:
dtype_X = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         #'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         'date(dayOfYear)':np.uint16
        }
dtype_y = {'nextBuyInWeeks(floor)':np.uint8}

y_pred = pd.DataFrame(y_pred, index=y_train.index).apply(lambda x: round(x)).astype(np.uint8)

y_pred.set_axis(['nextBuyIn_pred'], axis=1,inplace=True)

# concatenate X, y, y_pred (columns next to each other)
df_eval = pd.concat([X_train, y_train, y_pred], axis=1)

rowcount = len(df_eval)
should = rowcount
is_ = len(df_eval.loc[(df_eval['nextBuyInWeeks(floor)'] == df_eval.nextBuyIn_pred)]) 

print(f'\033[1mrow count of set:\t\t\t\t {rowcount}')
print(f'\033[1mrows where label was predicted correctly:\t {is_} \t ({is_/should*100:.3f} % of rows)')

df_eval

[1mrow count of set:				 39736
[1mrows where label was predicted correctly:	 7420 	 (18.673 % of rows)


Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(dayOfMonth),date(weekOfYear),date(dayOfYear),0,...,4277,4278,4279,4280,4281,4282,4283,4284,4285,4286,4287,4288,4289,4290,4291,4292,4293,4294,4295,4296,4297,4298,4299,nextBuyInWeeks(floor),nextBuyIn_pred
1,276,15667,1,1201,4,0,30,0,163,0.000659,0.023826,0.044684,0.000067,0.033429,0.000773,0.911912,1.000000,4,44.500000,2020,6,1,23,153,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,7
2,276,28708,1,504,10,0,441,3,84,0.000978,0.020648,0.044684,0.001161,0.016194,0.004704,0.775920,1.000000,3,37.000000,2020,6,1,23,153,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,6
3,532,7644,1,1276,6,0,45,3,48,0.000081,0.005880,0.044684,0.000031,0.016194,0.000575,0.587077,1.000000,3,52.500000,2020,6,1,23,153,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,8
4,752,22963,1,1201,10,0,43,0,147,0.000659,0.020648,0.044684,0.000451,0.033429,0.000283,0.882270,1.000000,7,47.333333,2020,6,1,23,153,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,7
5,1123,18498,1,1401,4,0,95,0,44,0.000074,0.023826,0.044684,0.000017,0.033429,0.003489,0.930692,1.000000,2,62.000000,2020,6,1,23,153,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,9
6,1421,20664,1,408,4,0,284,0,66,0.000455,0.023826,0.044684,0.000007,0.033429,0.001987,0.920493,1.000000,2,91.000000,2020,6,1,23,153,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,11
7,1524,20703,1,745,10,0,503,0,17,0.000519,0.020648,0.044684,0.002691,0.033429,0.002932,0.925138,1.000000,2,57.000000,2020,6,1,23,153,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,9
8,1524,22341,1,386,10,0,502,0,29,0.000194,0.020648,0.044684,0.000722,0.033429,0.000135,0.879196,1.000000,3,57.500000,2020,6,1,23,153,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,8
9,1524,23688,3,420,4,1,510,0,-1,0.000007,0.023826,0.003734,0.000318,0.033429,0.000000,0.531723,1.000000,7,28.500000,2020,6,1,23,153,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,6
10,1567,29726,1,378,10,0,421,0,3,0.000396,0.020648,0.044684,0.001179,0.033429,0.001126,0.894083,1.000000,2,80.000000,2020,6,1,23,153,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,10


In [27]:
# multi-hot-encode categories
cats = df_test["categories"]
mlb = MultiLabelBinarizer(sparse_output=False) # Set to True if output binary array is desired in CSR sparse format
df_multi_hot = pd.DataFrame(mlb.fit_transform(cats), columns=mlb.classes_, index=df_test.index, dtype=np.int8).astype(pd.SparseDtype(np.uint8,0)) # NaN filled with 0

# drop fake rows from both dataframes (last row) & drop category '9999' standing for missing category
df_multi_hot.drop(index=df_test.index[-1], axis=0, inplace=True)
df_multi_hot = df_multi_hot.iloc[:,:-1]
df_test.drop(index=df_test.index[-1], axis=0, inplace=True)

# join new binarized columns with rest of dataframe
df_test = df_test.join(df_multi_hot, how='inner')

if (len(df_test[df_test.isnull().any(axis=1)]) > 0):
    raise RuntimeError('Join of multi-hot-encoded categories probably created missing values.')

# drop list of categories, since it's not needed anymore
df_test.drop('categories', axis=1, inplace=True)

# pop and append 'week' at end of dataframe
col = df_test.pop(label)
df_test.insert(len(df_test.columns), col.name, col)

del df_multi_hot
gc.collect()

#df_test

0

In [28]:
X_test, y_test = sep_X_y(df_test)

y_pred = predict_values(model, X_test, y_test)

Predicting values...
Done!

					[1m XGboost Regressor MSE:  16.030
					[1m XGboost Regressor RMSE:  4.004


In [29]:
dtype_X = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         #'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         'date(dayOfYear)':np.uint16
        }
dtype_y = {'nextBuyInWeeks(floor)':np.uint8}

y_pred = pd.DataFrame(y_pred, index=y_test.index).apply(lambda x: round(x)).astype(np.uint8)

y_pred.set_axis(['nextBuyIn_pred'], axis=1,inplace=True)
# concatenate X, y, y_pred (columns next to each other)
df_eval = pd.concat([X_test, y_test, y_pred], axis=1)

rowcount = len(df_eval)
should = rowcount
is_ = len(df_eval.loc[(df_eval['nextBuyInWeeks(floor)'] == df_eval.nextBuyIn_pred)]) 

print(f'\033[1mrow count of set:\t\t\t\t {rowcount}')
print(f'\033[1mrows where label was predicted correctly:\t {is_} \t ({is_/should*100:.3f} % of rows)')

df_eval

[1mrow count of set:				 13591
[1mrows where label was predicted correctly:	 1982 	 (14.583 % of rows)


Unnamed: 0,userID,itemID,order,brand,feature_1,feature_2,feature_3,feature_4,feature_5,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RCP,TotalItemOrders(user),MeanDiffToNxt(user),date(year),date(month),date(dayOfMonth),date(weekOfYear),date(dayOfYear),0,...,4277,4278,4279,4280,4281,4282,4283,4284,4285,4286,4287,4288,4289,4290,4291,4292,4293,4294,4295,4296,4297,4298,4299,nextBuyInWeeks(floor),nextBuyIn_pred
1,21340,16599,1,888,10,0,224,3,132,0.000575,0.027466,0.059594,0.000211,0.021632,0.001015,0.727041,1.000000,4,7.500000,2020,6,16,25,168,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,19669,19043,1,186,10,3,27,3,39,0.002310,0.027466,0.002299,0.000486,0.021632,0.000971,0.351915,1.000000,3,10.000000,2020,6,22,26,174,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,2
3,5218,6168,3,1445,10,0,-1,-1,178,0.000853,0.027466,0.059594,0.000000,0.000000,0.000148,0.574956,1.000000,7,10.000000,2020,6,26,26,178,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3
4,6042,5423,8,449,4,0,535,3,105,0.000425,0.031841,0.059594,0.000335,0.021632,0.000401,0.752364,1.000000,22,7.500000,2020,6,29,27,181,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4
5,22362,6657,1,504,10,0,441,3,84,0.001255,0.027466,0.059594,0.001475,0.021632,0.006202,0.775391,1.000000,3,14.000000,2020,6,30,27,182,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,4
6,14428,26159,1,1065,10,0,491,3,147,0.000587,0.027466,0.059594,0.002712,0.021632,0.000371,0.739716,1.000000,4,13.500000,2020,7,1,27,183,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,23,3
7,33697,1782,3,6,4,3,321,0,144,0.001541,0.031841,0.002299,0.000606,0.044612,0.005556,0.564080,1.000000,9,13.500000,2020,7,5,27,187,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,4
8,24338,1932,3,186,4,0,319,0,144,0.002310,0.031841,0.059594,0.000246,0.044612,0.005556,0.955303,1.000000,8,15.000000,2020,7,6,28,188,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,4
9,44183,18804,2,1045,6,0,525,0,-1,0.000053,0.007963,0.059594,0.000131,0.044612,0.000000,0.739662,1.000000,7,13.500000,2020,7,7,28,189,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,4
10,26563,6524,3,961,6,0,436,3,117,0.000061,0.007963,0.059594,0.000055,0.021632,0.001072,0.590650,1.000000,15,12.333333,2020,7,8,28,190,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3


In [None]:
---

In [11]:
# Mean Addition to last purchase

In [None]:
test = r'E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\17-2_220624_4TimeRepurchaser_test.csv'

columns = ['date',
           'userID', 
           'itemID',
           'order', 
           'brand', 
           'feature_1', 
           'feature_2', 
           'feature_3', 
           'feature_4', 
           'feature_5',
           'categories',
           'brandOrderRatio',
           'feature1OrderRatio',
           'feature2OrderRatio',
           'feature3OrderRatio',
           'feature4OrderRatio',
           'feature5OrderRatio',
           'TotalBFscore',
           'RCP',
           'MeanDiffToNxt(user)',
           'TotalItemOrders(user)',
           #'TotalItemOrders(item)',
           'date(year)',
           'date(month)',
           #'date(weekOfMonth)',
           'date(dayOfMonth)',
           'date(weekOfYear)',
           'date(dayOfYear)',
           #'nextBuyInWeeks(round)', # label
           'nextBuyInWeeks(floor)', # label
           #'nextBuyInWeekOfYear' # label; schlechte idee
          ]

dtype = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         'date(dayOfYear)':np.uint16,
         'nextBuyInWeeks(floor)':np.uint8
        }

label = 'nextBuyInWeeks(floor)'

In [None]:
## Preparation

In [None]:
df_test = pd.read_csv(test, sep='|', usecols=columns, dtype=dtype, nrows=None, converters={
    'categories': lambda x: [int(i) for i in x[1:-1].split(',')]
})


#df_test

In [None]:
df_test['meanPred'] = (df_test['date(weekOfYear)'] + round(df_test['MeanDiffToNxt(user)']/7)).astype(np.uint16)
df_test['y_true'] = df_test['date(weekOfYear)'] + df_test['nextBuyInWeeks(floor)']

rowcount = len(df_test)
should = rowcount
is_ = len(df_test.loc[(df_test['meanPred'] == df_test.y_true)]) 

print(f'\033[1mrow count of set:\t\t\t\t {rowcount}')
print(f'\033[1mrows where label was predicted correctly:\t {is_} \t ({is_/should*100:.3f} % of rows)')

df_test

---

# <font color='red'>PREDICTING FOR SUBMISSION // LinearRegression</color>


In [3]:
train = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220628_train_ohneLetzte.csv'
predset = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\220628_test_nurletzte.csv'

columns = [#'date',
           'userID', 
           'itemID',
           'order', 
           'brand', 
           'feature_1', 
           'feature_2', 
           'feature_3', 
           'feature_4', 
           'feature_5',
           'categories',
           'brandOrderRatio',
           'feature1OrderRatio',
           'feature2OrderRatio',
           'feature3OrderRatio',
           'feature4OrderRatio',
           'feature5OrderRatio',
           'TotalBFscore',
           'RepeatCustomerProbability',
           'MeanDiffToNxt(user)',
           'TotalItemOrders(user)',
           #'TotalItemOrders(item)',
           'date(year)',
           'date(month)',
           #'date(weekOfMonth)',
           'date(dayOfMonth)',
           'date(weekOfYear)',
           'date(dayOfYear)',
           #'nextBuyInWeeks(round)', # label
           'nextBuyInWeeks(floor)', # label
           #'nextBuyInWeekOfYear' # label; schlechte idee
          ]

dtype = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         'date(dayOfYear)':np.uint8,
         'nextBuyInWeeks(floor)':np.uint8
        }

label = 'nextBuyInWeeks(floor)'

In [None]:
## Preparation

In [4]:
df_train = pd.read_csv(train, sep='|', usecols=columns, dtype=dtype, nrows=None, converters={
    'categories': lambda x: [int(i) for i in x[1:-1].split(',')]
})

df_test = pd.read_csv(predset, sep='|', usecols=columns, dtype=dtype, nrows=None, converters={
    'categories': lambda x: [int(i) for i in x[1:-1].split(',')]
})

# add fake column for ensuring all categories from 0 to 4299 are included
df_train.loc[len(df_train)] = [0 if column != 'categories' else [cat for cat in range(0,4300)] for column in df_train.columns]
df_train.index = df_train.index + 1  # add index

df_test.loc[len(df_test)] = [0 if column != 'categories' else [cat for cat in range(0,4300)] for column in df_test.columns]
df_test.index = df_test.index + 1  # add index

df = df_train
show_mem_usage(df)

Memory usage of dataframe is 36.07 MB



In [5]:
df

Unnamed: 0,userID,itemID,order,date(year),date(month),date(dayOfMonth),date(weekOfYear),date(dayOfYear),brand,feature_1,feature_2,feature_3,feature_4,feature_5,TotalItemOrders(user),categories,brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RepeatCustomerProbability,MeanDiffToNxt(user),nextBuyInWeeks(floor)
1,76,23050,1,2020,6,1,23,153,1411,4,0,22,0,151,1,"[545, 1763, 3912, 3300, 3586, 3914, 3915, 3962...",0.001921,0.091557,0.165390,0.002558,0.124524,0.004778,0.939746,0.379661,0.000000,24
2,116,9408,1,2020,6,1,23,153,322,4,0,536,0,144,2,"[1394, 2435]",0.001328,0.091557,0.165390,0.004916,0.124524,0.017759,0.975647,1.000000,160.000000,22
3,116,25677,1,2020,6,1,23,153,322,4,0,536,0,144,2,"[1922, 3393]",0.001328,0.091557,0.165390,0.004916,0.124524,0.017759,0.975647,0.379310,160.000000,22
4,135,13660,1,2020,6,1,23,153,157,4,0,513,0,137,1,"[74, 277, 3953]",0.001156,0.091557,0.165390,0.000381,0.124524,0.000585,0.922372,0.000000,0.000000,4
5,135,22174,1,2020,6,1,23,153,504,10,0,441,3,84,2,"[2591, 2312, 2708]",0.002035,0.075193,0.165390,0.002209,0.063110,0.014036,0.772352,0.800000,32.000000,4
6,202,26940,1,2020,6,1,23,153,1258,4,0,487,3,44,1,"[2402, 189, 170, 1066, 3914]",0.000195,0.091557,0.165390,0.005268,0.063110,0.014088,0.815283,0.121951,0.000000,11
7,240,7318,1,2020,6,1,23,153,1335,6,0,421,3,6,1,"[3224, 701]",0.000231,0.023561,0.165390,0.005136,0.063110,0.000559,0.616562,0.461538,0.000000,10
8,240,26645,1,2020,6,1,23,153,648,10,0,358,3,24,1,"[1244, 1763, 3867, 3176, 2443, 2]",0.000203,0.075193,0.165390,0.001669,0.063110,0.001131,0.735157,0.265060,0.000000,10
9,244,10341,1,2020,6,1,23,153,1025,6,0,198,0,17,1,"[2995, 3654, 1763, 3970, 3934]",0.000427,0.023561,0.165390,0.000199,0.124524,0.014102,0.787518,0.303797,0.000000,15
10,276,15667,1,2020,6,1,23,153,1201,4,0,30,0,163,6,"[1680, 813, 218, 3915, 3914, 4069]",0.002355,0.091557,0.165390,0.000229,0.124524,0.003325,0.931592,0.318182,39.333333,8


In [6]:
# multi-hot-encode categories
cats = df["categories"]
mlb = MultiLabelBinarizer(sparse_output=False) # Set to True if output binary array is desired in CSR sparse format
df_multi_hot = pd.DataFrame(mlb.fit_transform(cats), columns=mlb.classes_, index=df.index, dtype=np.int8).astype(pd.SparseDtype(np.uint8,0)) # NaN filled with 0

# drop fake rows from both dataframes (last row) & drop category '9999' standing for missing category
df_multi_hot.drop(index=df.index[-1], axis=0, inplace=True)
df_multi_hot = df_multi_hot.iloc[:,:-1]
df.drop(index=df.index[-1], axis=0, inplace=True)

# join new binarized columns with rest of dataframe
df = df.join(df_multi_hot, how='inner')

if (len(df[df.isnull().any(axis=1)]) > 0):
    raise RuntimeError('Join of multi-hot-encoded categories probably created missing values.')

# drop list of categories, since it's not needed anymore
df.drop('categories', axis=1, inplace=True)

# pop and append 'week' at end of dataframe
col = df.pop(label)
df.insert(len(df.columns), col.name, col)

del df_multi_hot
gc.collect()

#df

0

In [7]:
# save column names
column_headers = list(df.columns)

# split DF in X & y
X_train, y_train = sep_X_y(df)
#X_train

In [None]:
## Training & Prediction

In [None]:
### Linear Regression

In [8]:
model = train_linReg(X_train, y_train)
#model = train_dtc(X_train, y_train)

y_pred = predict_values(model, X_train, y_train)

Fitting model...
Done!
Predicting values...
Done!

					[1m XGboost Regressor MSE:  31.784
					[1m XGboost Regressor RMSE:  5.638


In [None]:
### Evaluation

In [9]:
dtype_X = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         #'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         'date(dayOfYear)':np.uint8
        }
dtype_y = {'nextBuyInWeeks(floor)':np.uint8}

y_pred = pd.DataFrame(y_pred, index=y_train.index).apply(lambda x: round(x)).astype(np.uint8)

y_pred.set_axis(['nextBuyIn_pred'], axis=1,inplace=True)


# concatenate X, y, y_pred (columns next to each other)
df_eval = pd.concat([X_train, y_train, y_pred], axis=1)

rowcount = len(df_eval)
should = rowcount
is_ = len(df_eval.loc[(df_eval['nextBuyInWeeks(floor)'] == df_eval.nextBuyIn_pred)]) 

print(f'\033[1mrow count of set:\t\t\t\t {rowcount}')
print(f'\033[1mrows where label was predicted correctly:\t {is_} \t ({is_/should*100:.3f} % of rows)')

df_eval

[1mrow count of set:				 175109
[1mrows where label was predicted correctly:	 13491 	 (7.704 % of rows)


Unnamed: 0,userID,itemID,order,date(year),date(month),date(dayOfMonth),date(weekOfYear),date(dayOfYear),brand,feature_1,feature_2,feature_3,feature_4,feature_5,TotalItemOrders(user),brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RepeatCustomerProbability,MeanDiffToNxt(user),0,...,4277,4278,4279,4280,4281,4282,4283,4284,4285,4286,4287,4288,4289,4290,4291,4292,4293,4294,4295,4296,4297,4298,4299,nextBuyInWeeks(floor),nextBuyIn_pred
1,76,23050,1,2020,6,1,23,153,1411,4,0,22,0,151,1,0.001921,0.091557,0.165390,0.002558,0.124524,0.004778,0.939746,0.379661,0.000000,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24,14
2,116,9408,1,2020,6,1,23,153,322,4,0,536,0,144,2,0.001328,0.091557,0.165390,0.004916,0.124524,0.017759,0.975647,1.000000,160.000000,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,13
3,116,25677,1,2020,6,1,23,153,322,4,0,536,0,144,2,0.001328,0.091557,0.165390,0.004916,0.124524,0.017759,0.975647,0.379310,160.000000,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,14
4,135,13660,1,2020,6,1,23,153,157,4,0,513,0,137,1,0.001156,0.091557,0.165390,0.000381,0.124524,0.000585,0.922372,0.000000,0.000000,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,15
5,135,22174,1,2020,6,1,23,153,504,10,0,441,3,84,2,0.002035,0.075193,0.165390,0.002209,0.063110,0.014036,0.772352,0.800000,32.000000,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,10
6,202,26940,1,2020,6,1,23,153,1258,4,0,487,3,44,1,0.000195,0.091557,0.165390,0.005268,0.063110,0.014088,0.815283,0.121951,0.000000,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,16
7,240,7318,1,2020,6,1,23,153,1335,6,0,421,3,6,1,0.000231,0.023561,0.165390,0.005136,0.063110,0.000559,0.616562,0.461538,0.000000,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,16
8,240,26645,1,2020,6,1,23,153,648,10,0,358,3,24,1,0.000203,0.075193,0.165390,0.001669,0.063110,0.001131,0.735157,0.265060,0.000000,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10,13
9,244,10341,1,2020,6,1,23,153,1025,6,0,198,0,17,1,0.000427,0.023561,0.165390,0.000199,0.124524,0.014102,0.787518,0.303797,0.000000,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,17
10,276,15667,1,2020,6,1,23,153,1201,4,0,30,0,163,6,0.002355,0.091557,0.165390,0.000229,0.124524,0.003325,0.931592,0.318182,39.333333,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,12


In [None]:
## !! Prediction on Predictionset

In [10]:
# multi-hot-encode categories
cats = df_test["categories"]
mlb = MultiLabelBinarizer(sparse_output=False) # Set to True if output binary array is desired in CSR sparse format
df_multi_hot = pd.DataFrame(mlb.fit_transform(cats), columns=mlb.classes_, index=df_test.index, dtype=np.int8).astype(pd.SparseDtype(np.uint8,0)) # NaN filled with 0

# drop fake rows from both dataframes (last row) & drop category '9999' standing for missing category
df_multi_hot.drop(index=df_test.index[-1], axis=0, inplace=True)
df_multi_hot = df_multi_hot.iloc[:,:-1]
df_test.drop(index=df_test.index[-1], axis=0, inplace=True)

# join new binarized columns with rest of dataframe
df_test = df_test.join(df_multi_hot, how='inner')

if (len(df_test[df_test.isnull().any(axis=1)]) > 0):
    raise RuntimeError('Join of multi-hot-encoded categories probably created missing values.')

# drop list of categories, since it's not needed anymore
df_test.drop('categories', axis=1, inplace=True)

# pop and append 'week' at end of dataframe
col = df_test.pop(label)
df_test.insert(len(df_test.columns), col.name, col)

del df_multi_hot
gc.collect()

#df_test

0

In [11]:
X_test, y_test = sep_X_y(df_test)
gc.collect()
y_pred = model.predict(X_test)

In [14]:
dtype_X = {'userID':np.uint16,
         'itemID':np.uint16,
         'order':np.uint8,
         'brand':np.int16,
         'feature_1':np.int8,
         'feature_2':np.uint8,
         'feature_3':np.int16,
         'feature_4':np.int8,
         'feature_5':np.int16,
         'TotalItemOrders(user)':np.uint16,
         'date(year)':np.uint16,
         'date(month)':np.uint8,
         #'date(weekOfMonth)':np.uint8,
         'date(dayOfMonth)':np.uint8,
         'date(weekOfYear)':np.uint8,
         'date(dayOfYear)':np.uint8
        }
dtype_y = {'nextBuyInWeeks(floor)':np.uint8}

y_pred = pd.DataFrame(y_pred, index=y_test.index).apply(lambda x: round(x)).astype(np.uint8)

y_pred.set_axis(['nextBuyIn_pred'], axis=1,inplace=True)

# concatenate X, y, y_pred (columns next to each other)
df_eval = pd.concat([X_test, y_test, y_pred], axis=1)

rowcount = len(df_eval)
should = rowcount
is_ = len(df_eval.loc[(df_eval['nextBuyInWeeks(floor)'] == df_eval.nextBuyIn_pred)]) 

print(f'\033[1mrow count of set:\t\t\t\t {rowcount}')
print(f'\033[1mrows where label was predicted 0:\t {is_} \t ({is_/should*100:.3f} % of rows)')

df_eval

[1mrow count of set:				 896426
[1mrows where label was predicted 0:	 3118 	 (0.348 % of rows)


Unnamed: 0,userID,itemID,order,date(year),date(month),date(dayOfMonth),date(weekOfYear),date(dayOfYear),brand,feature_1,feature_2,feature_3,feature_4,feature_5,TotalItemOrders(user),brandOrderRatio,feature1OrderRatio,feature2OrderRatio,feature3OrderRatio,feature4OrderRatio,feature5OrderRatio,TotalBFscore,RepeatCustomerProbability,MeanDiffToNxt(user),0,...,4277,4278,4279,4280,4281,4282,4283,4284,4285,4286,4287,4288,4289,4290,4291,4292,4293,4294,4295,4296,4297,4298,4299,nextBuyInWeeks(floor),nextBuyIn_pred
1,38769,3477,1,2020,6,1,23,153,186,6,0,196,0,45,1,0.042279,0.152436,0.826492,0.000599,0.640224,0.037693,0.809960,0.045802,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,205
2,42535,30474,1,2020,6,1,23,153,193,10,3,229,3,132,1,0.010567,0.369146,0.056881,0.000547,0.334600,0.019681,0.367741,0.011858,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,246
3,42535,15833,1,2020,6,1,23,153,1318,4,1,455,0,108,1,0.000028,0.466804,0.100607,0.002230,0.640224,0.019841,0.581138,0.000000,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36
4,42535,20131,1,2020,6,1,23,153,347,4,0,291,3,44,1,0.010043,0.466804,0.826492,0.035373,0.334600,0.059138,0.825894,0.233251,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,93
5,42535,4325,1,2020,6,1,23,153,539,6,0,303,0,45,1,0.028242,0.152436,0.826492,0.008133,0.640224,0.037693,0.806793,0.078923,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,150
6,42535,12919,1,2020,6,1,23,153,1338,10,0,26,0,39,1,0.007632,0.369146,0.826492,0.000543,0.640224,0.014561,0.887310,0.045455,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,140
7,29737,9139,1,2020,6,1,23,153,703,10,0,413,3,3,1,0.017302,0.369146,0.826492,0.000247,0.334600,0.023124,0.747246,0.142241,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,160
8,43683,18733,1,2020,6,1,23,153,1496,4,0,17,0,81,1,0.046229,0.466804,0.826492,0.004072,0.640224,0.015723,0.955932,0.076923,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,144
9,42535,15005,1,2020,6,1,23,153,361,10,0,505,0,152,1,0.013314,0.369146,0.826492,0.009239,0.640224,0.001438,0.887921,0.097561,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,231
10,43683,21343,1,2020,6,1,23,153,406,6,0,302,0,17,1,0.010606,0.152436,0.826492,0.001057,0.640224,0.078879,0.814814,0.190678,0.0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,145


In [None]:
#df_eval.loc[df_eval['date(month)'] == 12].tail()

In [None]:
### Calculating predicted weekOfYear for next purchase, then convert weekOfYear to week of Feb (1-4)

In [13]:
df_eval.drop(index=df_eval.index[-1], axis=0, inplace=True)

KeyboardInterrupt: 

In [15]:
df_eval['day'] = df_eval['date(dayOfMonth)'].astype(np.uint16)
df_eval['month'] = df_eval['date(month)'].astype(np.uint16)
df_eval['year'] = df_eval['date(year)'].astype(np.uint16)
df_eval['date'] = pd.to_datetime(df_eval[['year', 'month', 'day']])
df_eval['weekOfYear_pred'] = (df_eval['date'] + pd.to_timedelta(df_eval['nextBuyIn_pred'], unit='w')).dt.weekofyear
df_eval.drop(['date', 'year', 'month', 'day'], axis=1, inplace=True)

In [16]:
df_final = pd.DataFrame()
df_final['userID'] = df_eval['userID']
df_final['itemID'] = df_eval['itemID']
df_final['year'] = df_eval['date(year)']
df_final['month'] = df_eval['date(month)']
df_final['day'] = df_eval['date(dayOfMonth)']
df_final['weekOfYear'] = df_eval['date(weekOfYear)']
df_final['nextBuyIn_pred'] = df_eval['nextBuyIn_pred']
df_final['weekOfYear_pred'] = df_eval['weekOfYear_pred']
df_final['meanDiffWeeks'] = df_eval['MeanDiffToNxt(user)'].apply(lambda x: round(x/7))
df_final['meanDiffDays'] = df_eval['MeanDiffToNxt(user)']

In [17]:
subm_path = r'C:\Users\LEAND\Coding\Jupyter Notebooks\csv\submission.csv'
df_submission = pd.read_csv(subm_path, sep='|')
#df_submission

df_submission = df_submission.merge(df_final, how='left', on=['userID', 'itemID'])
#df_submission

In [18]:
# calculate week of February from predicted weekOfYear
def getFebWeek(weekOfYear):
    w = weekOfYear
    if w == 5:
        return 1
    elif w == 6:
        return 2
    elif w == 7:
        return 3
    elif w == 8:
        return 4
    else:
        return 0

In [19]:
df_submission['prediction'] = df_submission['weekOfYear_pred'].apply(getFebWeek)
df_submission

Unnamed: 0,userID,itemID,prediction,year,month,day,weekOfYear,nextBuyIn_pred,weekOfYear_pred,meanDiffWeeks,meanDiffDays
0,0,20664,0,2020,12,11,50,54,51,14,94.500000
1,0,28231,0,2021,1,25,4,42,46,5,33.000000
2,13,2690,0,2020,12,24,52,54,1,10,67.000000
3,15,1299,0,2021,1,14,2,76,26,6,42.333333
4,15,20968,0,2021,1,25,4,38,42,5,36.333333
5,20,8272,0,2020,10,27,44,249,32,8,59.500000
6,24,11340,0,2020,12,27,52,233,24,11,79.500000
7,34,21146,0,2020,11,13,46,57,50,8,52.666667
8,34,31244,0,2021,1,13,2,238,32,11,74.000000
9,46,31083,0,2021,1,6,1,253,46,12,82.000000


In [None]:
path = r'E:\OneDrive\Arbeit\Repos\DMC2022\Kevin\csv\220626_submission_01.csv'
df_submission.to_csv(path, index=False, sep='|')

In [None]:
df_submission

In [None]:
no_zeros = len(df_submission.loc[df_submission['prediction'] != 0])
print(f'{no_zeros} rows where no 0 was predicted')

duplicateRows = df_submission[df_submission.duplicated(['userID', 'itemID'])]
print(f'{len(duplicateRows)} duplicate rows')