In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [2]:
def multiclassScore(y: pd.Series, y_pred: pd.Series,normalize :bool = False) -> float:
    if (y.astype(int).apply(lambda x: x in [0,1,2,3,4]).all() == False) or (y_pred.astype(int).apply(lambda x: x in [0,1,2,3,4]).all() == False) :
        raise ValueError('y and y_pred are only allowed to contain the elements 0,1,2,3,4')


    behaviourY = np.where(y>0,1,0) #1 is buying
    behaviourPredY = np.where(y_pred>0,1,0)
    correctBehaviour = np.sum(behaviourY == behaviourPredY)

    correctPredictionsWeek = np.sum((y == y_pred) & (y > 0))

    score = (correctBehaviour-correctPredictionsWeek)*1+correctPredictionsWeek*3

    if normalize == True:
        maxScore = np.sum((y>0)*3 + (y==0)*1)
        return(score/maxScore)
    else:
        return(score)


# Import datasets

In [7]:
train_set = pd.read_csv('train_all_features_multiclass_heuristics_updated.csv', sep=';', decimal=',')
test_set = pd.read_csv('test_all_features_multiclass_heuristics_updated.csv', sep = ";", decimal = ",")
train_set

Unnamed: 0,userID,itemID,prediction,last14d,cumsum,similar.0COrderSumIncl,similar.0COrderCountIncl,similar.0COrderSumExcl,similar.0COrderCountExcl,avg_time_orders,...,IT_PCA_comp2,IT_PCA_comp3,IT_PCA_comp4,IT_PCA_comp5,IT_PCA_NA,UI_againmean2,UI_distmean2,Unnamed..0,useractivity,itemincidence
0,45301,461,2,0,3,19,15,16,12,83.5,...,-0.181485,-0.004367,-0.074567,-0.004844,False,0,8,29195,0.363636,0.783784
1,5898,461,0,0,2,15,12,13,10,102.0,...,-0.181485,-0.004367,-0.074567,-0.004844,False,0,7,3844,0.375000,0.783784
2,7168,461,0,0,4,18,14,14,12,147.0,...,-0.181485,-0.004367,-0.074567,-0.004844,False,0,12,4622,0.333333,0.783784
3,3757,461,0,0,2,42,33,40,31,71.0,...,-0.181485,-0.004367,-0.074567,-0.004844,False,1,0,2475,0.454545,0.783784
4,41695,461,2,0,2,31,14,29,12,62.0,...,-0.181485,-0.004367,-0.074567,-0.004844,False,0,6,26897,0.444444,0.783784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56707,44010,13756,0,0,2,4,4,2,2,2.0,...,0.242416,-0.218993,-0.469199,0.016486,False,0,-3,28367,1.000000,1.000000
56708,23835,20278,0,0,2,2,2,0,0,17.0,...,0.165381,-0.140319,0.036076,0.151156,False,0,-4,15363,0.375000,0.200000
56709,15195,7751,0,0,6,6,3,0,0,55.0,...,-0.529354,0.270354,-0.051241,-0.026983,False,2,0,9802,0.428571,0.500000
56710,19832,10535,0,0,2,9,9,7,7,79.0,...,-0.302991,0.203427,0.079796,0.029775,False,0,-1,12802,0.363636,0.333333


In [8]:
train = train_set[['userID', 'itemID', 'prediction']].copy()
test = test_set[['userID', 'itemID', 'prediction']].copy()

In [9]:
u_features = pd.read_csv('U_FEAT_till_3_1.csv', index_col=0)
i_features = pd.read_csv('item_features_TIMO.csv', index_col=0, sep=',')
ui_train_features = pd.read_csv('train_allFeatures.csv', index_col=0, sep='|')
ui_test_features = pd.read_csv('test_allFeatures.csv', index_col=0, sep='|')
i_features

Unnamed: 0,IT_performance_change_whole_branch_cb,itemID,IT_performance_change_whole_branch_fb,IT_performance_in_branch_fb_change,IT_performance_in_branch_cb_change,IT_performance_in_branch_fb,IT_performance_in_branch_cb,IT_rebought_rate_ft1,IT_rebought_rate_ft2,IT_rebought_rate_ft3,...,IT_boughts_total,IT_boughts_last6w,IT_prct_last6w,IT_not_avaible,IT_PCA_comp1,IT_PCA_comp2,IT_PCA_comp3,IT_PCA_comp4,IT_PCA_comp5,IT_PCA_NA
0,0.0,1,0.00463678516228748,-0.020663072006324057,-1.0,0.0,0.0,0.285652,0.287673,0.277664,...,8,0.0,0.000000,0,-0.354151,-0.418397,0.025024,0.344274,-0.172784,False
1,0.05454545454545454,6,-0.07226107226107226,0.026129962020457018,0.0747764799996038,0.09893312516263336,0.5897435897435898,0.291240,0.287673,0.400000,...,86,14.0,0.162791,0,0.358173,-0.587056,-0.192602,-0.141174,-0.087613,False
2,0.07191780821917808,8,0.037037037037037035,0.10754221087554419,0.014345587108794071,0.23737373737373738,0.04337393872277593,0.285652,0.287673,0.352814,...,16,5.0,0.312500,0,0.054571,-0.701858,-0.024435,-0.199242,-0.101711,False
3,-0.10679611650485436,9,-0.13513513513513514,0.0,-0.03339757266082727,1.0,0.3576923076923077,0.285652,0.269808,0.262343,...,96,16.0,0.166667,0,0.218399,0.690918,0.695077,-0.169999,-0.176093,False
4,0.42857142857142855,11,0.2,-1.0,-1.0,0.0,0.0,0.291240,0.269808,0.230769,...,2,1.0,0.500000,0,-0.144407,-0.635417,0.070115,-0.101826,-0.081411,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12297,mean,32764,mean,mean,mean,mean,mean,0.285652,0.255614,0.166667,...,8,0.0,0.000000,1,0.893846,0.072595,0.019580,0.166872,0.026330,False
12298,0.0036377560788818687,32766,-0.3333333333333333,-0.6428571428571429,-0.0020300012604868643,0.0,0.0,0.285652,0.269808,0.288660,...,8,1.0,0.125000,0,0.419114,0.601917,0.010060,-0.055097,0.149495,False
12299,-0.060291060291060294,32768,-0.3617021276595745,0.2790816326530613,0.006854993082102842,0.7833333333333333,0.048420698924731186,0.291240,0.287673,0.343590,...,48,10.0,0.208333,0,-0.430959,-0.644820,-0.260607,-0.197230,0.065124,False
12300,-0.13043478260869565,32769,-0.08847184986595175,0.03511235779319133,0.0,0.12043516800659657,1.0,0.260256,0.287673,0.267231,...,39,8.0,0.205128,0,-0.507310,0.693251,-0.116650,-0.114171,0.101169,False


In [10]:
ui_features = pd.concat([ui_train_features, ui_test_features], axis=0)
ui_features

Unnamed: 0,userID,itemID,cumsum,last7d,last14d,last21d,last28d,similar>0COrderSumIncl,similar>0COrderCountIncl,similar>0COrderSumExcl,similar>0COrderCountExcl,sameBrandOrderSumIncl,sameBrandOrderCountIncl,sameBrandOrderSumExcl,sameBrandOrderCountExcl,order,avg_time_orders,day_first_purchase,day_last_purchase,usage_time_items
0,1199,637,5,0.0,0.0,0.0,0.0,5,2,0,0,6,3,1,1,3,126.000000,14,140,42.000000
1,21557,3520,2,0.0,0.0,0.0,0.0,3,3,1,1,3,3,1,1,1,83.000000,72,155,83.000000
2,23387,1633,2,0.0,0.0,0.0,0.0,2,2,0,0,2,2,0,0,1,124.000000,31,155,124.000000
3,32069,29311,5,0.0,0.0,0.0,2.0,9,7,4,4,5,3,0,0,2,28.000000,135,191,21.250000
4,30359,11124,2,0.0,0.0,0.0,0.0,11,11,9,9,2,2,0,0,1,10.000000,119,129,10.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27529,2176,9773,2,0.0,0.0,0.0,0.0,5,5,3,3,4,4,2,2,1,42.000000,138,180,42.000000
27530,20041,19824,4,0.0,0.0,0.0,0.0,4,4,0,0,4,4,0,0,1,53.333333,25,185,53.333333
27531,39920,18714,4,0.0,0.0,0.0,0.0,4,2,0,0,4,2,0,0,2,159.000000,23,182,79.500000
27532,5127,1299,3,1.0,1.0,1.0,1.0,6,6,3,3,5,5,2,2,1,47.500000,118,213,47.500000


## Selecting only some important features

In [11]:
#item_feat = i_features[['itemID','IT_boughts_total','IT_boughts_last6w', 'IT_avg_time_btw_boughts']].copy()
ui_feat = ui_features.drop(columns = ['last7d', 'last14d', 'last21d', 'last28d', 'order'])
u_feat = u_features.drop(columns = ['u_EF'])
ui_feat

Unnamed: 0,userID,itemID,cumsum,similar>0COrderSumIncl,similar>0COrderCountIncl,similar>0COrderSumExcl,similar>0COrderCountExcl,sameBrandOrderSumIncl,sameBrandOrderCountIncl,sameBrandOrderSumExcl,sameBrandOrderCountExcl,avg_time_orders,day_first_purchase,day_last_purchase,usage_time_items
0,1199,637,5,5,2,0,0,6,3,1,1,126.000000,14,140,42.000000
1,21557,3520,2,3,3,1,1,3,3,1,1,83.000000,72,155,83.000000
2,23387,1633,2,2,2,0,0,2,2,0,0,124.000000,31,155,124.000000
3,32069,29311,5,9,7,4,4,5,3,0,0,28.000000,135,191,21.250000
4,30359,11124,2,11,11,9,9,2,2,0,0,10.000000,119,129,10.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27529,2176,9773,2,5,5,3,3,4,4,2,2,42.000000,138,180,42.000000
27530,20041,19824,4,4,4,0,0,4,4,0,0,53.333333,25,185,53.333333
27531,39920,18714,4,4,2,0,0,4,2,0,0,159.000000,23,182,79.500000
27532,5127,1299,3,6,6,3,3,5,5,2,2,47.500000,118,213,47.500000


In [12]:
X_train = (train.merge(ui_feat, how='left', on=['userID', 'itemID'])).merge(u_feat, how='left', on='userID')
X_test = (test.merge(ui_feat, how='left', on=['userID', 'itemID'])).merge(u_feat, how='left', on='userID')

In [13]:
X_test

Unnamed: 0,userID,itemID,prediction,cumsum,similar>0COrderSumIncl,similar>0COrderCountIncl,similar>0COrderSumExcl,similar>0COrderCountExcl,sameBrandOrderSumIncl,sameBrandOrderCountIncl,...,day_first_purchase,day_last_purchase,usage_time_items,u_avg_orders,u_avg_period,u_first_bought,u_last_bought,u_std_avg_period,u_mean_bought,u_std_bought
0,27630,29657,0,11,61,23,50,19,20,7,...,28,160,13.833333,2.18,13.29,1,187,8.77,72.87,56.04
1,4031,29657,0,12,16,6,4,2,12,4,...,27,192,16.138889,1.71,33.00,27,192,17.29,107.33,55.49
2,33675,29657,0,5,5,3,0,0,8,6,...,78,189,27.750000,1.78,19.86,78,217,16.01,150.12,43.31
3,1579,29657,0,26,30,8,4,3,26,5,...,31,186,7.475000,2.75,34.60,31,204,13.03,131.33,58.75
4,43716,29657,0,6,27,17,21,15,6,2,...,74,151,25.666667,1.52,19.70,19,216,17.34,132.73,54.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24322,26426,2929,1,4,4,2,0,0,4,2,...,21,153,66.000000,1.29,22.00,21,153,24.41,75.00,41.29
24323,32256,2924,0,2,9,9,7,7,4,4,...,83,149,66.000000,1.30,30.00,51,201,12.95,118.67,47.72
24324,44054,31153,0,8,8,4,0,0,8,4,...,4,200,32.666667,1.62,32.67,4,200,24.71,124.43,71.61
24325,30371,8206,0,2,2,2,0,0,2,2,...,5,140,135.000000,1.79,27.86,5,200,16.73,123.50,64.19


## Random oversampling

In [14]:
amount_0 = X_train[X_train['prediction'] == 0].shape[0]
amount_1 = X_train[X_train['prediction'] == 1].shape[0]
amount_2 = X_train[X_train['prediction'] == 2].shape[0]
amount_3 = X_train[X_train['prediction'] == 3].shape[0]
amount_4 = X_train[X_train['prediction'] == 4].shape[0]

unlike_0 = amount_1 + amount_2 + amount_3 + amount_4

ratio1 = amount_1 / unlike_0
ratio2 = amount_2 / unlike_0
ratio3 = amount_3 / unlike_0
ratio4 = amount_4 / unlike_0

print(f"0: {amount_0}\t\t1: {amount_1}\t\t2: {amount_2}\t\t3: {amount_3}\t\t4: {amount_4}")

df_1_under = X_train[X_train['prediction'] == 1].sample(round(ratio1 * amount_0), replace=True)
df_2_under = X_train[X_train['prediction'] == 2].sample(round(ratio2 * amount_0), replace=True)
df_3_under = X_train[X_train['prediction'] == 3].sample(round(ratio3 * amount_0), replace=True)
df_4_under = X_train[X_train['prediction'] == 4].sample(round(ratio4 * amount_0), replace=True)

X_train_balanced_over = pd.concat([df_1_under, df_2_under, df_3_under, df_4_under, X_train[X_train['prediction'] == 0]], axis=0)

X_train_balanced_over['prediction'].value_counts()

0: 45727		1: 2609		2: 2473		3: 2836		4: 3067


0    45727
4    12767
3    11805
1    10860
2    10294
Name: prediction, dtype: int64

In [105]:
y_train = X_train_balanced_over['prediction']
y_test = X_test['prediction']
y_train

41517    1
32199    1
42768    1
23556    1
5363     1
        ..
56706    0
56707    0
56708    0
56709    0
56710    0
Name: prediction, Length: 91453, dtype: int64

## I. Random Forest Classification Model

### Training the Random Forest Classification model on the Training set

In [106]:
clf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
clf.fit(X_train_balanced_over.drop('prediction', axis=1), y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=42)

### Predicting the test set result

In [107]:
y_pred = clf.predict(X_test.drop('prediction', axis=1))

### Confusion matrix and some evaluations

In [108]:
cm_forest = confusion_matrix(y_test, y_pred)
print('Confusion matrix:')
print(cm_forest)

ac_forest = accuracy_score(y_test, y_pred)
print(f"Accuracy score: {ac_forest}")

dmc_forest = multiclassScore(y_test, pd.Series(y_pred))
print(f"DMC score: {dmc_forest}")

Confusion matrix:
[[19277    94    67   116    93]
 [ 1013    67    22    15     9]
 [  990    11    57    19    17]
 [ 1029    19    18    74    13]
 [ 1176    18    20    28    65]]
Accuracy score: 0.8032227566078842
DMC score: 20275


### Applying GridSearchCV

In [109]:
params = {'criterion': ['gini', 'entropy'],
         'n_estimators': np.arange(100, 104),
         'min_samples_split': np.arange(5, 8),
         'max_depth': np.arange(16, 19)}
clf_over = GridSearchCV(estimator = clf, param_grid = params, error_score = 'raise', n_jobs=-1)
clf_over.fit(X_train_balanced_over.drop('prediction', axis=1), y_train)
clf_over.best_params_

{'criterion': 'entropy',
 'max_depth': 18,
 'min_samples_split': 5,
 'n_estimators': 101}

In [111]:
y_pred_cv = clf_over.predict(X_test.drop('prediction', axis=1))

In [112]:
cm_cv = confusion_matrix(y_test, y_pred_cv)
print('Confusion matrix:')
print(cm_cv)

ac_cv = accuracy_score(y_test, y_pred_cv)
print(f"Accuracy score: {ac_cv}")

dmc_cv = multiclassScore(y_test, pd.Series(y_pred_cv))
print(f"DMC score: {dmc_cv}")

Confusion matrix:
[[19499    38    20    48    42]
 [ 1037    52     7    20    10]
 [ 1003     9    43    19    20]
 [ 1049    13     8    71    12]
 [ 1192    14    12    26    63]]
Accuracy score: 0.8109507954125046
DMC score: 20356


# Training model based on the background data, not heuristic data

In [17]:
train_back = pd.read_csv('train_70_backgroundTrainTestsplit.csv')
test_back = pd.read_csv('test_30_backgroundTrainTestsplit.csv')
test_back

Unnamed: 0,userID,itemID,prediction
0,12727,3742,0.0
1,25360,30429,3.0
2,4725,13378,0.0
3,29897,461,0.0
4,18611,2761,0.0
...,...,...,...
27529,31511,22721,0.0
27530,37307,18630,0.0
27531,5659,24716,0.0
27532,3539,110,0.0


In [18]:
X_train_back = train_back.merge(ui_feat, how='left', on=['userID', 'itemID']).merge(u_feat, how='left', on='userID')
X_test_back = test_back.merge(ui_feat, how='left', on=['userID', 'itemID']).merge(u_feat, how='left', on='userID')
X_train_back

Unnamed: 0,userID,itemID,prediction,cumsum,similar>0COrderSumIncl,similar>0COrderCountIncl,similar>0COrderSumExcl,similar>0COrderCountExcl,sameBrandOrderSumIncl,sameBrandOrderCountIncl,...,day_first_purchase,day_last_purchase,usage_time_items,u_avg_orders,u_avg_period,u_first_bought,u_last_bought,u_std_avg_period,u_mean_bought,u_std_bought
0,31233,21596,0.0,2,13,7,11,5,6,4,...,72,189,117.000000,1.64,36.20,8,189,24.49,102.00,73.54
1,22721,31702,0.0,6,19,13,13,11,8,4,...,37,179,47.333333,1.76,13.92,33,214,10.22,116.14,63.50
2,33799,18498,0.0,3,14,10,11,7,3,3,...,14,150,68.000000,1.26,20.89,14,202,18.16,105.10,63.30
3,35553,2394,0.0,2,2,2,0,0,2,2,...,89,194,105.000000,1.19,27.33,30,194,14.14,102.14,53.13
4,21718,10958,0.0,2,8,8,6,6,2,2,...,22,168,146.000000,1.09,18.50,22,207,12.80,117.91,60.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64238,2541,16303,1.0,2,2,2,0,0,2,2,...,78,161,83.000000,1.26,17.25,3,210,14.67,107.23,68.90
64239,18879,15389,0.0,3,3,2,0,0,3,2,...,96,181,85.000000,1.22,28.00,13,181,17.52,104.14,54.52
64240,4751,11906,0.0,5,5,5,0,0,18,18,...,22,210,47.000000,1.42,17.00,6,210,6.79,112.00,62.34
64241,36896,28827,0.0,6,7,7,1,1,6,6,...,3,195,38.400000,1.43,16.00,3,195,7.71,105.85,60.06


### Random oversampling

In [19]:
amount_0 = X_train_back[X_train_back['prediction'] == 0].shape[0]
amount_1 = X_train_back[X_train_back['prediction'] == 1].shape[0]
amount_2 = X_train_back[X_train_back['prediction'] == 2].shape[0]
amount_3 = X_train_back[X_train_back['prediction'] == 3].shape[0]
amount_4 = X_train_back[X_train_back['prediction'] == 4].shape[0]

unlike_0 = amount_1 + amount_2 + amount_3 + amount_4

ratio1 = amount_1 / unlike_0
ratio2 = amount_2 / unlike_0
ratio3 = amount_3 / unlike_0
ratio4 = amount_4 / unlike_0

print(f"0: {amount_0}\t\t1: {amount_1}\t\t2: {amount_2}\t\t3: {amount_3}\t\t4: {amount_4}")

df_1_under = X_train_back[X_train_back['prediction'] == 1].sample(round(ratio1 * amount_0), replace=True)
df_2_under = X_train_back[X_train_back['prediction'] == 2].sample(round(ratio2 * amount_0), replace=True)
df_3_under = X_train_back[X_train_back['prediction'] == 3].sample(round(ratio3 * amount_0), replace=True)
df_4_under = X_train_back[X_train_back['prediction'] == 4].sample(round(ratio4 * amount_0), replace=True)

X_train_back_balanced_over = pd.concat([df_1_under, df_2_under, df_3_under, df_4_under, X_train_back[X_train_back['prediction'] == 0]], axis=0)

X_train_back_balanced_over['prediction'].value_counts()

0: 52883		1: 3030		2: 2677		3: 2798		4: 2855


0.0    52883
1.0    14105
4.0    13291
3.0    13025
2.0    12462
Name: prediction, dtype: int64

In [21]:
y_train_back = X_train_back_balanced_over['prediction']


In [29]:
y_test_back = X_test_back['prediction']

In [24]:
forest = RandomForestClassifier()
params = {'criterion': ['gini', 'entropy'],
         'n_estimators': np.arange(100, 104),
         'min_samples_split': np.arange(5, 8),
         'max_depth': np.arange(16, 19)}
clf = GridSearchCV(estimator = forest, param_grid = params, error_score = 'raise', n_jobs=-1)
clf.fit(X_train_back_balanced_over.drop('prediction', axis=1), y_train_back)
clf.best_params_

{'criterion': 'entropy',
 'max_depth': 18,
 'min_samples_split': 5,
 'n_estimators': 100}

In [25]:
y_pred_back = clf.predict(X_test_back.drop('prediction', axis=1))

In [30]:
cm_back = confusion_matrix(y_test_back, y_pred_back)
print('Confusion matrix:')
print(cm_back)

ac_back= accuracy_score(y_test_back, y_pred_back)
print(f"Accuracy score: {ac_back}")

dmc_back = multiclassScore(y_test_back, pd.Series(y_pred_back))
print(f"DMC score: {dmc_back}")

Confusion matrix:
[[22308   133    42    34    22]
 [ 1173   135    18     9     2]
 [ 1040    54    62     5     3]
 [ 1078    53    14    54     4]
 [ 1218    28     4     3    38]]
Accuracy score: 0.8206944141788335
DMC score: 23372


# Using model builted on background data to predict Submission file

In [80]:
submission = pd.read_csv('submission.csv', sep='|').drop('prediction', axis=1)
submission

Unnamed: 0,userID,itemID
0,0,20664
1,0,28231
2,13,2690
3,15,1299
4,15,20968
...,...,...
9995,46118,20106
9996,46124,19677
9997,46125,12878
9998,46127,7963


### import feature datasets up to 31.1.2021

In [86]:
ui_31 = pd.read_csv('wholeJanUserItemFeatures.csv', sep='|')
ui_feat_31 = ui_31.drop('buysLast14d', axis=1)
ui_feat_31.rename(columns={'usageTime': 'usage_time_items'}, inplace = True)
u_31 = pd.read_csv('U_FEAT_till_31_1.csv', index_col=0)
u_feat_31 = u_31.drop(columns = ['u_EF'])
ui_feat_31

Unnamed: 0,userID,itemID,similar>0COrderSumIncl,similar>0COrderCountIncl,similar>0COrderSumExcl,similar>0COrderCountExcl,sameBrandOrderSumIncl,sameBrandOrderCountIncl,sameBrandOrderSumExcl,sameBrandOrderCountExcl,cumsum,avg_time_orders,day_first_purchase,day_last_purchase,usage_time_items
0,0,1505,2,2,1,1,2,2,1,1,1,0.0,92,92,0
1,0,6446,3,3,1,1,2,2,0,0,2,35.0,193,228,35
2,0,9325,6,5,5,4,1,1,0,0,1,0.0,172,172,0
3,0,12468,1,1,0,0,1,1,0,0,1,0.0,63,63,0
4,0,12505,2,2,1,1,1,1,0,0,1,0.0,78,78,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
896421,46137,22403,1,1,0,0,2,2,1,1,1,0.0,231,231,0
896422,46137,22583,1,1,0,0,1,1,0,0,1,0.0,244,244,0
896423,46137,28343,3,2,2,1,1,1,0,0,1,0.0,68,68,0
896424,46137,28900,6,4,4,3,2,1,0,0,2,0.0,68,68,0


In [63]:
ui_feat_31.columns

Index(['userID', 'itemID', 'similar>0COrderSumIncl',
       'similar>0COrderCountIncl', 'similar>0COrderSumExcl',
       'similar>0COrderCountExcl', 'sameBrandOrderSumIncl',
       'sameBrandOrderCountIncl', 'sameBrandOrderSumExcl',
       'sameBrandOrderCountExcl', 'cumsum', 'avg_time_orders',
       'day_first_purchase', 'day_last_purchase', 'usage_time_items'],
      dtype='object')

In [64]:
ui_feat.columns

Index(['userID', 'itemID', 'cumsum', 'similar>0COrderSumIncl',
       'similar>0COrderCountIncl', 'similar>0COrderSumExcl',
       'similar>0COrderCountExcl', 'sameBrandOrderSumIncl',
       'sameBrandOrderCountIncl', 'sameBrandOrderSumExcl',
       'sameBrandOrderCountExcl', 'avg_time_orders', 'day_first_purchase',
       'day_last_purchase', 'usage_time_items'],
      dtype='object')

In [65]:
ui_feat

Unnamed: 0,userID,itemID,cumsum,similar>0COrderSumIncl,similar>0COrderCountIncl,similar>0COrderSumExcl,similar>0COrderCountExcl,sameBrandOrderSumIncl,sameBrandOrderCountIncl,sameBrandOrderSumExcl,sameBrandOrderCountExcl,avg_time_orders,day_first_purchase,day_last_purchase,usage_time_items
0,1199,637,5,5,2,0,0,6,3,1,1,126.000000,14,140,42.000000
1,21557,3520,2,3,3,1,1,3,3,1,1,83.000000,72,155,83.000000
2,23387,1633,2,2,2,0,0,2,2,0,0,124.000000,31,155,124.000000
3,32069,29311,5,9,7,4,4,5,3,0,0,28.000000,135,191,21.250000
4,30359,11124,2,11,11,9,9,2,2,0,0,10.000000,119,129,10.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27529,2176,9773,2,5,5,3,3,4,4,2,2,42.000000,138,180,42.000000
27530,20041,19824,4,4,4,0,0,4,4,0,0,53.333333,25,185,53.333333
27531,39920,18714,4,4,2,0,0,4,2,0,0,159.000000,23,182,79.500000
27532,5127,1299,3,6,6,3,3,5,5,2,2,47.500000,118,213,47.500000


In [87]:
sub_feat = (submission.merge(ui_feat_31, how='left', on=['userID', 'itemID'])).merge(u_feat_31, how='left', on='userID')
sub_feat

Unnamed: 0,userID,itemID,similar>0COrderSumIncl,similar>0COrderCountIncl,similar>0COrderSumExcl,similar>0COrderCountExcl,sameBrandOrderSumIncl,sameBrandOrderCountIncl,sameBrandOrderSumExcl,sameBrandOrderCountExcl,...,day_first_purchase,day_last_purchase,usage_time_items,u_avg_orders,u_avg_period,u_first_bought,u_last_bought,u_std_avg_period,u_mean_bought,u_std_bought
0,0,20664,3,3,0,0,3,3,0,0,...,4,193,94,2.00,26.00,5,239,16.98,139.40,73.45
1,0,28231,5,4,1,1,6,4,2,1,...,172,238,33,2.00,26.00,5,239,16.98,139.40,73.45
2,13,2690,8,6,4,2,4,4,0,0,...,5,206,67,5.58,21.64,6,244,14.04,118.50,72.11
3,15,1299,7,7,3,3,5,5,1,1,...,100,227,42,4.43,16.62,23,239,7.85,147.36,66.44
4,15,20968,17,15,13,11,4,4,0,0,...,129,238,36,4.43,16.62,23,239,7.85,147.36,66.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,46118,20106,6,6,1,1,5,5,0,0,...,36,205,42,3.55,12.21,3,235,9.51,101.45,73.71
9996,46124,19677,13,13,7,7,12,12,6,6,...,68,236,33,2.91,18.50,52,237,9.50,139.64,54.14
9997,46125,12878,3,3,0,0,5,5,2,2,...,3,244,120,2.83,48.20,4,245,32.85,151.33,87.69
9998,46127,7963,8,5,0,0,8,5,0,0,...,36,196,25,4.00,20.00,37,197,17.44,112.56,59.72


In [88]:
sub_feat.isnull().any()


userID                      False
itemID                      False
similar>0COrderSumIncl      False
similar>0COrderCountIncl    False
similar>0COrderSumExcl      False
similar>0COrderCountExcl    False
sameBrandOrderSumIncl       False
sameBrandOrderCountIncl     False
sameBrandOrderSumExcl       False
sameBrandOrderCountExcl     False
cumsum                      False
avg_time_orders             False
day_first_purchase          False
day_last_purchase           False
usage_time_items            False
u_avg_orders                False
u_avg_period                False
u_first_bought              False
u_last_bought               False
u_std_avg_period            False
u_mean_bought               False
u_std_bought                False
dtype: bool

In [89]:
submission['prediction_23372'] = clf.predict(sub_feat)

In [94]:
submission['prediction_ranfor_23372'] = submission.apply(lambda row: int(row['prediction_23372']), axis=1)
sub_df = submission.drop('prediction_23372', axis=1)
sub_df

Unnamed: 0,userID,itemID,prediction_ranfor_23372
0,0,20664,0
1,0,28231,0
2,13,2690,0
3,15,1299,0
4,15,20968,0
...,...,...,...
9995,46118,20106,0
9996,46124,19677,0
9997,46125,12878,0
9998,46127,7963,0


In [95]:
sub_df.prediction_ranfor_23372.value_counts()

0    9989
2       5
4       4
1       1
3       1
Name: prediction_ranfor_23372, dtype: int64

In [98]:
ui_sub = pd.read_csv('UI_submission_predictions.csv', sep='|', index_col=0)
ui_sub

Unnamed: 0,userID,itemID,prediction
0,0,20664,0.0
1,0,28231,0.0
2,13,2690,0.0
3,15,1299,0.0
4,15,20968,0.0
...,...,...,...
9995,46118,20106,3.0
9996,46124,19677,0.0
9997,46125,12878,0.0
9998,46127,7963,0.0


In [99]:
ui_sub.prediction.value_counts()

0.0    9243
4.0     287
3.0     248
1.0     131
2.0      91
Name: prediction, dtype: int64