In [1]:
# Data manipulation
import pandas as pd
import numpy as np
# Manipulation with dates
from datetime import date
from dateutil.relativedelta import relativedelta
# Machine learning
import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
data = pd.DataFrame({'Date': ['2022-08-01','2022-06-29',
                     '2022-06-12','2022-05-15','2022-04-18',
                     '2022-03-18','2022-02-19','2022-01-21',
                     '2021-12-21','2021-11-22','2021-10-17',
                     '2021-09-20','2021-08-25','2021-07-29',
                     '2021-06-30','2021-06-03','2021-05-08',
                     '2021-04-03','2021-03-03','2021-02-03',
                     '2021-01-05','2020-12-07','2020-11-09', 
                     '2020-10-10','2020-09-13','2020-08-17',
                     '2020-07-20','2020-06-21','2020-05-25',
                     '2020-04-27','2020-03-30','2020-02-29',
                     '2020-01-29','2019-12-23','2019-11-21',
                     '2019-10-24','2019-09-27','2019-08-29']})

In [3]:
data['Date'] = pd.to_datetime(data['Date'])
data['Release'] = 1

In [4]:
r = pd.date_range(start=data['Date'].min(), end=data['Date'].max())
data = data.set_index('Date').reindex(r).fillna(0.0).rename_axis('Date').reset_index()

In [5]:
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data['Workday_N'] = np.busday_count(
                    data['Date'].values.astype('datetime64[M]'),
                    data['Date'].values.astype('datetime64[D]'))
data['Week_day'] = data['Date'].dt.weekday
data['Week_of_month'] = (data['Date'].dt.day 
                         - data['Date'].dt.weekday - 2) // 7 + 2
data['Weekday_order'] = (data['Date'].dt.day + 6) // 7
data = data.set_index('Date')

In [6]:
x_train, x_test, y_train, y_test = train_test_split(data.drop(['Release'], axis=1), data['Release'],
                 test_size=0.3, random_state=1, shuffle=False)

In [7]:
DM_train = xgb.DMatrix(data=x_train, label=y_train)
grid_param = {"learning_rate": [0.01, 0.1],
              "n_estimators": [100, 150, 200],
              "alpha": [0.1, 0.5, 1],
              "max_depth": [2, 3, 4]}
model = xgb.XGBRegressor()
grid_mse = GridSearchCV(estimator=model, param_grid=grid_param,
                       scoring="neg_mean_squared_error",
                       cv=4, verbose=1)
grid_mse.fit(x_train, y_train)
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

Fitting 4 folds for each of 54 candidates, totalling 216 fits
Best parameters found:  {'alpha': 1, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100}
Lowest RMSE found:  0.1875476520514971


In [8]:
xgb_model = xgb.XGBClassifier(objective ='reg:squarederror', 
                            colsample_bytree = 1, 
                            learning_rate = 0.1,
                            max_depth = 2, 
                            alpha = 1, 
                            n_estimators = 100)
xgb_model.fit(x_train, y_train)
xgb_prediction = xgb_model.predict(x_test)

In [9]:
print(xgb_prediction)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [10]:
knn = KNeighborsClassifier(n_neighbors = 3, algorithm = 'auto',     
                           weights = 'distance') 
knn.fit(x_train, y_train)  
knn_prediction = knn.predict(x_test)

In [11]:
print(knn_prediction)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0.]


In [12]:
random_forest = RandomForestClassifier(n_estimators=50,
                                       max_depth=10, random_state=1)
random_forest.fit(x_train, y_train)
rf_prediction = random_forest.predict(x_test)

In [13]:
xgb_matrix = metrics.confusion_matrix(xgb_prediction, y_test)
print(f"""
Confusion matrix for XGBoost model:
TN:{xgb_matrix[0][0]}    FN:{xgb_matrix[0][1]}
FP:{xgb_matrix[1][0]}    TP:{xgb_matrix[1][1]}""")
knn_matrix = metrics.confusion_matrix(knn_prediction, y_test)
print(f"""
Confusion matrix for KNN model:
TN:{knn_matrix[0][0]}    FN:{knn_matrix[0][1]}
FP:{knn_matrix[1][0]}    TP:{knn_matrix[1][1]}""")
rf_matrix = metrics.confusion_matrix(rf_prediction, y_test)
print(f"""
Confusion matrix for Random Forest model:
TN:{rf_matrix[0][0]}    FN:{rf_matrix[0][1]}
FP:{rf_matrix[1][0]}    TP:{rf_matrix[1][1]}""")


Confusion matrix for XGBoost model:
TN:309    FN:12
FP:0    TP:0

Confusion matrix for KNN model:
TN:306    FN:12
FP:3    TP:0

Confusion matrix for Random Forest model:
TN:306    FN:12
FP:3    TP:0


In [14]:
x_predict = pd.DataFrame(pd.date_range(date.today(), (date.today() +
            relativedelta(years=1)),freq='d'), columns=['Date'])
x_predict['Day'] = x_predict['Date'].dt.day
x_predict['Workday_N'] = np.busday_count(
                x_predict['Date'].values.astype('datetime64[M]'),
                x_predict['Date'].values.astype('datetime64[D]'))
x_predict['Week_day'] = x_predict['Date'].dt.weekday
x_predict['Week_of_month'] = (x_predict['Date'].dt.day - 
                              x_predict['Date'].dt.weekday - 2)//7+2
x_predict['Weekday_order'] = (x_predict['Date'].dt.day + 6) // 7
x_predict['Month'] = x_predict['Date'].dt.month
x_predict = x_predict.set_index('Date')
prediction = xgb_model.predict(x_predict)

In [15]:
print(prediction)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [16]:
print(x_predict)

            Day  Workday_N  Week_day  Week_of_month  Weekday_order  Month
Date                                                                     
2022-08-26   26         19         4              4              4      8
2022-08-27   27         20         5              4              4      8
2022-08-28   28         20         6              4              4      8
2022-08-29   29         20         0              5              5      8
2022-08-30   30         21         1              5              5      8
...         ...        ...       ...            ...            ...    ...
2023-08-22   22         15         1              4              4      8
2023-08-23   23         16         2              4              4      8
2023-08-24   24         17         3              4              4      8
2023-08-25   25         18         4              4              4      8
2023-08-26   26         19         5              4              4      8

[366 rows x 6 columns]


In [17]:
x_predict['preds'] = prediction

In [18]:
display(x_predict)

Unnamed: 0_level_0,Day,Workday_N,Week_day,Week_of_month,Weekday_order,Month,preds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-08-26,26,19,4,4,4,8,0
2022-08-27,27,20,5,4,4,8,0
2022-08-28,28,20,6,4,4,8,0
2022-08-29,29,20,0,5,5,8,0
2022-08-30,30,21,1,5,5,8,0
...,...,...,...,...,...,...,...
2023-08-22,22,15,1,4,4,8,0
2023-08-23,23,16,2,4,4,8,0
2023-08-24,24,17,3,4,4,8,0
2023-08-25,25,18,4,4,4,8,0


In [19]:
x_predict.to_csv('preds', sep='\t')

In [20]:
knn_predict = pd.DataFrame(pd.date_range(date.today(), (date.today() +
            relativedelta(years=1)),freq='d'), columns=['Date'])
knn_predict['Day'] = knn_predict['Date'].dt.day
knn_predict['Workday_N'] = np.busday_count(
                knn_predict['Date'].values.astype('datetime64[M]'),
                knn_predict['Date'].values.astype('datetime64[D]'))
knn_predict['Week_day'] = knn_predict['Date'].dt.weekday
knn_predict['Week_of_month'] = (knn_predict['Date'].dt.day - 
                              knn_predict['Date'].dt.weekday - 2)//7+2
knn_predict['Weekday_order'] = (knn_predict['Date'].dt.day + 6) // 7
knn_predict['Month'] = knn_predict['Date'].dt.month
knn_predict = knn_predict.set_index('Date')
prediction = knn.predict(knn_predict)

Feature names must be in the same order as they were in fit.



In [21]:
knn_predict['preds'] = prediction
display(knn_predict)

Unnamed: 0_level_0,Day,Workday_N,Week_day,Week_of_month,Weekday_order,Month,preds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-08-26,26,19,4,4,4,8,0.0
2022-08-27,27,20,5,4,4,8,0.0
2022-08-28,28,20,6,4,4,8,0.0
2022-08-29,29,20,0,5,5,8,0.0
2022-08-30,30,21,1,5,5,8,0.0
...,...,...,...,...,...,...,...
2023-08-22,22,15,1,4,4,8,0.0
2023-08-23,23,16,2,4,4,8,0.0
2023-08-24,24,17,3,4,4,8,0.0
2023-08-25,25,18,4,4,4,8,0.0


In [22]:
knn_predict.to_csv('knn-preds', sep='\t')

In [23]:
rf_predict = pd.DataFrame(pd.date_range(date.today(), (date.today() +
            relativedelta(years=1)),freq='d'), columns=['Date'])
rf_predict['Day'] = rf_predict['Date'].dt.day
rf_predict['Workday_N'] = np.busday_count(
                rf_predict['Date'].values.astype('datetime64[M]'),
                rf_predict['Date'].values.astype('datetime64[D]'))
rf_predict['Week_day'] = rf_predict['Date'].dt.weekday
rf_predict['Week_of_month'] = (rf_predict['Date'].dt.day - 
                              rf_predict['Date'].dt.weekday - 2)//7+2
rf_predict['Weekday_order'] = (rf_predict['Date'].dt.day + 6) // 7
rf_predict['Month'] = rf_predict['Date'].dt.month
rf_predict = rf_predict.set_index('Date')
prediction = random_forest.predict(rf_predict)

Feature names must be in the same order as they were in fit.



In [24]:
rf_predict['preds'] = prediction
display(rf_predict)

Unnamed: 0_level_0,Day,Workday_N,Week_day,Week_of_month,Weekday_order,Month,preds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-08-26,26,19,4,4,4,8,0.0
2022-08-27,27,20,5,4,4,8,0.0
2022-08-28,28,20,6,4,4,8,0.0
2022-08-29,29,20,0,5,5,8,0.0
2022-08-30,30,21,1,5,5,8,0.0
...,...,...,...,...,...,...,...
2023-08-22,22,15,1,4,4,8,0.0
2023-08-23,23,16,2,4,4,8,0.0
2023-08-24,24,17,3,4,4,8,0.0
2023-08-25,25,18,4,4,4,8,0.0


In [25]:
rf_predict.to_csv('rf-preds', sep='\t')