In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

## Define scoring function

In [25]:
# Code from Lars Grönberg
def multiclassScore(y: pd.Series, y_pred: pd.Series, normalize: bool = False) -> float:
    """Score a multiclass prediction according to the DMC22 evaluation rules.

    Labels must be the integers 0, 1, 2, 3, 4 (integer-valued floats such as
    ``2.0`` are accepted). Label 0 counts as "not buying", every label > 0 as
    "buying". Each row earns

    * 3 points if the true label is > 0 and the prediction matches it exactly,
    * 1 point if only the buy / no-buy behaviour is predicted correctly
      (a correctly predicted 0 falls in this category),
    * 0 points otherwise.

    Rows are matched by position, never by index label, so ``y`` and
    ``y_pred`` may carry different indexes (e.g. a shuffled test split versus
    a freshly built prediction Series).

    Args:
        y: true labels (Series, array or list).
        y_pred: predicted labels (Series, array or list) of the same length.
        normalize: if equal to ``True``, divide the score by the maximum
            reachable score so the result lies in [0, 1]. Defaults to False.
            Any value that does not compare equal to ``True`` (including
            non-empty strings) leaves the score un-normalized.

    Returns:
        int: Score according to the DMC22 evaluation if normalize is false.
        float: Score according to the DMC22 evaluation if normalize is true
            (``nan`` for empty input, where the maximum score is 0).

    Raises:
        ValueError: if ``y`` and ``y_pred`` differ in length, or if either
            contains anything other than 0, 1, 2, 3, 4 (fractional values
            and missing values included).

    Example:
        >>> ytrue = pd.Series([3,4,0,1,4,2])
        >>> ypred = pd.Series([4,4,0.0,0,0,3])
        >>> multiclassScore(ytrue,ypred)
        6

        >>> ytrue = pd.Series([3,4,0,1,4,2])
        >>> ypred = pd.Series([4,4,0.0,0,0,3])
        >>> multiclassScore(ytrue,ypred,normalize=True)
        0.375

        >>> ytrue = pd.Series([3,4,0,1,4,2])
        >>> ypred = pd.Series([3,4,0,1,4,2])
        >>> multiclassScore(ytrue,ypred)
        16

        >>> ytrue = pd.Series([3,4,0,1,4,2])
        >>> ypred = pd.Series([3,4,0,1,4,2])
        >>> multiclassScore(ytrue,ypred,normalize=True)
        1.0

    """
    allowed_labels = [0, 1, 2, 3, 4]

    # pd.Series(...) accepts Series, numpy arrays and lists alike.
    true_labels = pd.Series(y)
    pred_labels = pd.Series(y_pred)

    if len(true_labels) != len(pred_labels):
        raise ValueError('y and y_pred must have the same length')

    # isin() checks the actual values; casting to int first would truncate
    # e.g. 2.7 to 2 and let invalid labels slip through.
    if not (true_labels.isin(allowed_labels).all() and pred_labels.isin(allowed_labels).all()):
        raise ValueError('y and y_pred are only allowed to contain the elements 0,1,2,3,4')

    # Plain arrays => purely positional comparison, independent of the index.
    true_arr = true_labels.to_numpy(dtype=float)
    pred_arr = pred_labels.to_numpy(dtype=float)

    buys_true = true_arr > 0  # True means "buying"
    buys_pred = pred_arr > 0
    behaviour_hits = int(np.sum(buys_true == buys_pred))

    # Exact label hits only count for rows that are actual purchases.
    exact_hits = int(np.sum((true_arr == pred_arr) & buys_true))

    score = (behaviour_hits - exact_hits) * 1 + exact_hits * 3

    # Deliberately `== True` rather than truthiness: keeps historic behaviour
    # for callers that pass non-boolean values.
    if normalize == True:  # noqa: E712
        # Best case: 3 points per purchase row, 1 point per non-purchase row.
        max_score = int(3 * np.sum(buys_true) + np.sum(~buys_true))
        if max_score == 0:
            return float('nan')
        return score / max_score
    return score


## Import dataset

In [26]:
# Read the pre-made 70/30 train/test split from the working directory.
train_set, test_set = map(
    pd.read_csv,
    ('train_70_backgroundTrainTestsplit.csv', 'test_30_backgroundTrainTestsplit.csv'),
)

In [27]:
# Show the training frame as the cell output for a quick visual check.
train_set

Unnamed: 0,userID,itemID,prediction
0,45301,461,2.0
1,21509,18569,0.0
2,26600,30712,0.0
3,2,23476,4.0
4,24698,16294,0.0
...,...,...,...
64238,20305,21387,0.0
64239,13714,19959,0.0
64240,22788,8771,0.0
64241,45935,25373,0.0


In [28]:
# Show the test frame as the cell output for a quick visual check.
test_set

Unnamed: 0,userID,itemID,prediction
0,27630,29657,0.0
1,45037,15445,0.0
2,40558,24405,0.0
3,40577,17489,0.0
4,2449,5557,0.0
...,...,...,...
27529,28263,2787,0.0
27530,16224,8302,0.0
27531,11328,7546,3.0
27532,8262,11434,2.0


## Split train_set and test_set into X_train, y_train, X_test and y_test

In [35]:
# Training split: every column except the label goes into X_t,
# the `prediction` label column becomes y_train.
X_t = train_set.drop('prediction', axis=1)
y_train = train_set['prediction']

# Test split, separated the same way.
X_te = test_set.drop('prediction', axis=1)
y_test = test_set['prediction']

## Merging X_train, X_test with features

In [30]:
# Load the user/item feature table (first CSV column is used as the index)
# and discard the two raw date columns.
features = (
    pd.read_csv('3_IU_FEAT_Average_Day.csv', index_col=0)
    .drop(['first_date_bought', 'last_date_bought'], axis=1)
)
features

Unnamed: 0,userID,itemID,average_cycle_days,first_day_delta_bought,last_day_delta_bought
0,0,1505,0.0,93,93
1,0,6446,0.0,194,194
2,0,9325,0.0,173,173
3,0,12468,0.0,64,64
4,0,12505,0.0,79,79
...,...,...,...,...,...
919701,46137,2667,0.0,109,109
919702,46137,20209,0.0,69,69
919703,46137,28343,0.0,69,69
919704,46137,28900,0.0,69,69


In [37]:
# Attach the per-(userID, itemID) features to the train and test rows.
# A left join keeps every row of X_t / X_te, so the result stays aligned
# row-for-row with y_train / y_test; rows without a match get NaN features.
# validate='many_to_one' makes pandas raise a MergeError if `features` ever
# contains duplicate (userID, itemID) keys, which would otherwise silently
# duplicate rows and break that alignment.
X_train = X_t.merge(features, how='left', on=['userID', 'itemID'], validate='many_to_one')
X_test = X_te.merge(features, how='left', on=['userID', 'itemID'], validate='many_to_one')
X_train

Unnamed: 0,userID,itemID,average_cycle_days,first_day_delta_bought,last_day_delta_bought
0,45301,461,83.50,18,185
1,21509,18569,125.00,78,203
2,26600,30712,63.00,13,76
3,2,23476,86.00,29,115
4,24698,16294,21.00,130,151
...,...,...,...,...,...
64238,20305,21387,161.00,13,174
64239,13714,19959,164.00,31,195
64240,22788,8771,48.00,79,127
64241,45935,25373,199.00,14,213


In [38]:
# Show the merged test features as the cell output.
X_test

Unnamed: 0,userID,itemID,average_cycle_days,first_day_delta_bought,last_day_delta_bought
0,27630,29657,44.0,28,160
1,45037,15445,82.0,81,163
2,40558,24405,74.0,30,104
3,40577,17489,92.0,7,99
4,2449,5557,74.0,98,172
...,...,...,...,...,...
27529,28263,2787,122.0,48,170
27530,16224,8302,66.0,49,115
27531,11328,7546,54.0,129,183
27532,8262,11434,70.0,92,162


# Training models

## 0. Feature scaling

In [39]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaler to both splits so the test set is transformed with training statistics.
sc = StandardScaler().fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)
X_train_sc

array([[ 1.66250589, -1.6993443 ,  0.17209164, -0.9365631 ,  0.56774257],
       [-0.12077432,  0.21022845,  1.16161041,  0.37871276,  0.97584994],
       [ 0.26081107,  1.49076423, -0.31670679, -1.04616942, -1.90357431],
       ...,
       [-0.02490951, -0.82301618, -0.67436418,  0.40063403, -0.74727009],
       [ 1.71002605,  0.92774187,  2.92605351, -1.02424816,  1.20257626],
       [ 1.09143949, -0.22508833, -1.47718579, -0.19124011, -1.47279431]])

In [40]:
# Show the scaled test features as the cell output.
X_test_sc

array([[ 3.38012593e-01,  1.37950958e+00, -7.69739483e-01,
        -7.17350457e-01,  9.26767608e-04],
       [ 1.64271832e+00, -1.19211865e-01,  1.36325896e-01,
         4.44476558e-01,  6.89446634e-02],
       [ 1.30700412e+00,  8.25661774e-01, -5.44247098e-02,
        -6.73507928e-01, -1.26874062e+00],
       ...,
       [-8.83870139e-01, -9.52198124e-01, -5.31301226e-01,
         1.49669725e+00,  5.22397302e-01],
       [-1.11367584e+00, -5.42190455e-01, -1.49800013e-01,
         6.85610467e-01,  4.62720314e-02],
       [-1.63587293e+00, -1.11913283e+00,  1.36325896e-01,
         6.63689203e-01,  2.95670983e-01]])

## I. Random Forest Classification Model

### 1. Training model based on X_train_sc

In [42]:
from sklearn.ensemble import RandomForestClassifier

# Small forest (10 trees, entropy split criterion) with a fixed seed so the
# run is repeatable; fit() returns the estimator itself.
clf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=10,
    random_state=0,
).fit(X_train_sc, y_train)
clf

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

### 2. Predicting the test set

In [43]:
# Predict a label for every row of the scaled test features with the fitted forest.
y_pred = clf.predict(X_test_sc)
# Echo the prediction array as the cell output.
y_pred

array([3., 0., 0., ..., 0., 0., 0.])

### 3. Making confusion matrix and calculating accuracy score, precision, recall, f1_score

In [52]:
# Confusion matrix of true vs. predicted labels on the test set.
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:", cm, sep="\n")

# Overall accuracy, then per-class precision / recall / F1
# (average=None returns one value per class instead of a single aggregate).
ac = accuracy_score(y_test, y_pred)
preci, recall, f1_sc = (
    metric(y_test, y_pred, average=None)
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Accuracy score: {ac}")

print(f"Precision score: {preci}")

print(f"Recall score: {recall}")

print(f"f1 score: {f1_sc}")



Confusion matrix:
[[22187   106    85   119   143]
 [ 1076    45    17    23    16]
 [ 1032    14    38    28    15]
 [ 1111    16    20    43    19]
 [ 1255    23    17    33    53]]
Accuracy score: 0.8123047868090361
Precision score: [0.83218934 0.22058824 0.21468927 0.17479675 0.21544715]
Recall score: [0.97999117 0.0382328  0.03371783 0.03556658 0.03837799]
f1 score: [0.90006288 0.06517017 0.05828221 0.05910653 0.06515058]


### 4. Applying multiclassScore() function to calculate the score of the model

In [48]:
# Un-normalized DMC22 score of the forest's predictions on the test set.
# `normalize` has to be the boolean False: the string 'False' is a non-empty
# (truthy) value and only behaved like False because the scorer tests
# `normalize == True`.
# Reusing y_test's index keeps truth and prediction identically labelled, so
# the comparison does not depend on y_test having a default RangeIndex.
score = multiclassScore(y_test, pd.Series(y_pred, index=y_test.index), normalize=False)
score

22965