In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [2]:
train_set = pd.read_csv('train_70_backgroundTrainTestsplit.csv')
test_set = pd.read_csv('test_30_backgroundTrainTestsplit.csv')
binary_train = pd.read_csv('binary_train_70_backgroundTrainTestsplit.csv')
binary_test = pd.read_csv('binary_test_30_backgroundTrainTestsplit.csv')
backdata = pd.read_csv('backgroundData_backGroundTrainTestsplit.csv')


### Merging training set and test set into one file only, so that we can merge the file with features easier

In [3]:
binary_train_test_df = pd.concat([binary_train, binary_test], axis = 0)
train_test_df = pd.concat([train_set, test_set], axis = 0)
train_test_df

Unnamed: 0,userID,itemID,prediction
0,30519,28724,2.0
1,22872,31702,0.0
2,37164,26663,0.0
3,35621,13228,0.0
4,23942,1781,0.0
...,...,...,...
27529,32155,10507,0.0
27530,326,27989,0.0
27531,10580,18630,0.0
27532,42548,352,0.0


In [4]:
check = train_test_df[train_test_df.duplicated(['userID', 'itemID'])]
check

Unnamed: 0,userID,itemID,prediction


In [5]:
features = pd.read_csv('average_cycle_first_last_weeks.csv', index_col = 0)
features

  mask |= (ar1 == a)


Unnamed: 0,userID,itemID,date,month,timeDeltaDays,timeDeltaWeeks,weeks_bought_item,average_cycle(weeks),first_week_bought,last_week_bought
0,0,1505,2020-09-01,9,93,14,[14],0.0,14,14
1,0,6446,2020-12-11,12,194,28,"[28, 33]",5.0,28,33
2,0,6446,2021-01-15,1,229,33,"[28, 33]",5.0,28,33
3,0,9325,2020-11-20,11,173,25,[25],0.0,25,25
4,0,12468,2020-08-03,8,64,10,[10],0.0,10,10
...,...,...,...,...,...,...,...,...,...,...
1071015,46137,22403,2021-01-18,1,232,34,[34],0.0,34,34
1071016,46137,22583,2021-01-31,1,245,35,[35],0.0,35,35
1071017,46137,28343,2020-08-08,8,69,10,[10],0.0,10,10
1071018,46137,28900,2020-08-08,8,69,10,[10],0.0,10,10


### Removing all duplicated data (userID, itemID) except for the first one, so that we can merge correctly with train_test_set

In [6]:
features_without_duplicate = features.drop_duplicates(['userID', 'itemID'], keep='first')
features_without_duplicate

Unnamed: 0,userID,itemID,date,month,timeDeltaDays,timeDeltaWeeks,weeks_bought_item,average_cycle(weeks),first_week_bought,last_week_bought
0,0,1505,2020-09-01,9,93,14,[14],0.0,14,14
1,0,6446,2020-12-11,12,194,28,"[28, 33]",5.0,28,33
3,0,9325,2020-11-20,11,173,25,[25],0.0,25,25
4,0,12468,2020-08-03,8,64,10,[10],0.0,10,10
5,0,12505,2020-08-18,8,79,12,[12],0.0,12,12
...,...,...,...,...,...,...,...,...,...,...
1071015,46137,22403,2021-01-18,1,232,34,[34],0.0,34,34
1071016,46137,22583,2021-01-31,1,245,35,[35],0.0,35,35
1071017,46137,28343,2020-08-08,8,69,10,[10],0.0,10,10
1071018,46137,28900,2020-08-08,8,69,10,[10],0.0,10,10


In [7]:
#IU_FEAT_AverageCycle = features_without_duplicate.drop(columns=['date', 'month', 'timeDeltaDays', 'timeDeltaWeeks', 'weeks_bought_item'])
#IU_FEAT_AverageCycle


Unnamed: 0,userID,itemID,average_cycle(weeks),first_week_bought,last_week_bought
0,0,1505,0.0,14,14
1,0,6446,5.0,28,33
3,0,9325,0.0,25,25
4,0,12468,0.0,10,10
5,0,12505,0.0,12,12
...,...,...,...,...,...
1071015,46137,22403,0.0,34,34
1071016,46137,22583,0.0,35,35
1071017,46137,28343,0.0,10,10
1071018,46137,28900,0.0,10,10


In [8]:
#IU_FEAT_AverageCycle.to_csv('IU_FEAT_AverageCycle.csv')

### Merging both train_test_df and features together, so that we get more features to build the model
Using train_test_df to build the model

In [35]:
df = train_test_df.merge(features_without_duplicate, how='inner', on=['userID', 'itemID'])
df

Unnamed: 0,userID,itemID,prediction,date,month,timeDeltaDays,timeDeltaWeeks,weeks_bought_item,average_cycle(weeks),first_week_bought,last_week_bought
0,30519,28724,2.0,2020-07-21,7,51,8,"[8, 27, 33]",12.5,8,33
1,22872,31702,0.0,2020-07-31,7,61,9,"[9, 18, 27]",9.0,9,27
2,37164,26663,0.0,2020-07-02,7,32,5,"[5, 23]",18.0,5,23
3,35621,13228,0.0,2020-08-30,8,91,13,"[13, 25, 26]",6.5,13,26
4,23942,1781,0.0,2020-06-30,6,30,5,"[5, 27]",22.0,5,27
...,...,...,...,...,...,...,...,...,...,...,...
91772,32155,10507,0.0,2020-07-04,7,34,5,"[5, 30]",25.0,5,30
91773,326,27989,0.0,2020-11-02,11,155,23,"[23, 24, 28]",2.5,23,28
91774,10580,18630,0.0,2020-06-22,6,22,4,"[4, 12, 27]",11.5,4,27
91775,42548,352,0.0,2020-08-10,8,71,11,"[11, 28]",17.0,11,28


# Training Model
- using features: userID, itemID, average_cycle(weeks),first_week_bought,last_week_bought
- y is prediction column

In [36]:
data = df.drop(columns=['date', 'month', 'timeDeltaDays', 'timeDeltaWeeks', 'weeks_bought_item'])
data

Unnamed: 0,userID,itemID,prediction,average_cycle(weeks),first_week_bought,last_week_bought
0,30519,28724,2.0,12.5,8,33
1,22872,31702,0.0,9.0,9,27
2,37164,26663,0.0,18.0,5,23
3,35621,13228,0.0,6.5,13,26
4,23942,1781,0.0,22.0,5,27
...,...,...,...,...,...,...
91772,32155,10507,0.0,25.0,5,30
91773,326,27989,0.0,2.5,23,28
91774,10580,18630,0.0,11.5,4,27
91775,42548,352,0.0,17.0,11,28


In [37]:
# move the column prediction to the end of dataframe and rename it with predictions
data_copy = data.copy()
data_copy['predictions'] = data_copy['prediction'].astype('int')
dataset = data_copy.drop(columns=['prediction'])
dataset

Unnamed: 0,userID,itemID,average_cycle(weeks),first_week_bought,last_week_bought,predictions
0,30519,28724,12.5,8,33,2
1,22872,31702,9.0,9,27,0
2,37164,26663,18.0,5,23,0
3,35621,13228,6.5,13,26,0
4,23942,1781,22.0,5,27,0
...,...,...,...,...,...,...
91772,32155,10507,25.0,5,30,0
91773,326,27989,2.5,23,28,0
91774,10580,18630,11.5,4,27,0
91775,42548,352,17.0,11,28,0


### Getting X and y values from dataset

In [38]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

### Splitting the dataset into the Training set and Test set

In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [40]:
print(X_train)

[[3.9038e+04 1.7268e+04 7.0000e+00 2.4000e+01 3.1000e+01]
 [3.1950e+03 1.6199e+04 4.0000e+00 1.5000e+01 1.9000e+01]
 [1.0990e+04 1.8630e+04 8.3300e+00 7.0000e+00 3.2000e+01]
 ...
 [1.5105e+04 4.5500e+03 1.0500e+01 1.4000e+01 3.5000e+01]
 [3.1405e+04 1.0770e+04 5.5000e+00 7.0000e+00 1.8000e+01]
 [3.9681e+04 8.2740e+03 5.5000e+00 2.0000e+00 3.5000e+01]]


In [41]:
print(X_test)

[[3.1821e+04 2.4280e+03 2.0000e+01 1.0000e+01 3.0000e+01]
 [1.9629e+04 5.0760e+03 2.3000e+01 8.0000e+00 3.1000e+01]
 [4.4305e+04 8.4620e+03 6.0000e+00 1.0000e+00 7.0000e+00]
 ...
 [4.0169e+04 1.1251e+04 1.0500e+01 4.0000e+00 2.5000e+01]
 [3.2041e+04 1.1009e+04 1.2000e+01 1.0000e+00 2.5000e+01]
 [3.6343e+04 1.1735e+04 0.0000e+00 7.0000e+00 7.0000e+00]]


In [42]:
print(y_train)

[0 0 1 ... 4 0 4]


In [43]:
print(y_test)

[0 0 0 ... 0 0 0]


### Feature scaling

In [44]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [45]:
print(X_train)

[[ 1.19210134  0.06959389 -0.66812204  2.27684235  0.837701  ]
 [-1.49819766 -0.04316086 -1.18173512  0.90097442 -0.85027317]
 [-0.91312161  0.21325336 -0.44042025 -0.3220193   0.97836551]
 ...
 [-0.6042585  -1.27186065 -0.06890679  0.7481002   1.40035905]
 [ 0.61918467 -0.61579466 -0.92492858 -0.3220193  -0.99093769]
 [ 1.24036355 -0.87906487 -0.92492858 -1.08639037  1.40035905]]


In [46]:
print(X_test)

[[ 0.65040874 -1.49568252  1.5575346   0.13660335  0.69703648]
 [-0.26469673 -1.21637983  2.07114767 -0.16914508  0.837701  ]
 [ 1.58743111 -0.85923522 -0.8393264  -1.23926459 -2.53824734]
 ...
 [ 1.27699179 -0.5650603  -0.06890679 -0.78064194 -0.00628609]
 [ 0.66692147 -0.59058569  0.18789974 -1.23926459 -0.00628609]
 [ 0.9898204  -0.5140095  -1.86655254 -0.3220193  -2.53824734]]


# I. Random Forest Classification Model

### Training the Random Forest Classification model on the Training set

In [47]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

### Predicting the test set result

In [48]:
y_pred = classifier.predict(X_test)
# concatenating y_pred and y_test together
pred_test_y = np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), axis = 1)
pred_test_y

array([[0, 0],
       [0, 0],
       [0, 0],
       ...,
       [0, 0],
       [0, 0],
       [0, 0]])

### Making confusion matrix and calculating accuracy score

In [49]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[18864     0     0     0     0]
 [    0   974     1     0     0]
 [    0     0   901     0     0]
 [    0     0     0  1012     2]
 [    0     0     0     0  1191]]


0.9998692525604707

# II. Decision Tree Classification Model 

### Training the Decision Tree Classification model on the Training set

In [50]:
from sklearn.tree import DecisionTreeClassifier
cl_tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
cl_tree.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

### Predicting the test set result

In [51]:
y_pred_tree = cl_tree.predict(X_test)

matrix_pred_test = np.concatenate((y_pred_tree.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), axis=1)
print(matrix_pred_test)

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


### Making the confusion matrix

In [52]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm_tree = confusion_matrix(y_test, y_pred_tree)
ac_tree = accuracy_score(y_test, y_pred_tree)
print(cm_tree)
print(ac_tree)

[[18864     0     0     0     0]
 [    0   975     0     0     0]
 [    0     0   901     0     0]
 [    0     0     0  1014     0]
 [    0     0     0     0  1191]]
1.0


# III. Support Vector Machine Model

### Training the SVM model on the Training set

In [53]:
from sklearn.svm import SVC
cl_svm = SVC(kernel='linear', random_state=0)
cl_svm.fit(X_train, y_train)

SVC(kernel='linear', random_state=0)

### Predicting the test set result

In [54]:
y_pred_svm = cl_svm.predict(X_test)

### Making confusion matrix and calculating accuracy score

In [55]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm_svm = confusion_matrix(y_test, y_pred_svm)
ac_svm = accuracy_score(y_test, y_pred_svm)
print(cm_svm)
print(ac_svm)

[[18864     0     0     0     0]
 [    0   975     0     0     0]
 [    0     0   901     0     0]
 [    0     0     0  1014     0]
 [    0     0     0     0  1191]]
1.0
