In [33]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import warnings

warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")


In [34]:
#df = pd.read_csv("../Data/Raw/new_approach/Weekly_Covid_Data_Population_Normalized.csv")
df = pd.read_csv("../Data/Raw/new_approach/Weekly_Covid_Data.csv")

In [35]:
print(df.columns)
print(df.shape)

Index(['week_no', 'iso_code', 'new_cases', 'new_deaths', 'new_vaccinations',
       'new_people_vaccinated', 'reproduction_rate', 'stringency_index',
       'excess_mortality', 'population_density', 'median_age', 'aged_65_older',
       'aged_70_older', 'cardiovasc_death_rate', 'diabetes_prevalence',
       'female_smokers', 'male_smokers', 'hospital_beds_per_thousand',
       'life_expectancy', 'human_development_index', 'population'],
      dtype='object')
(41184, 21)


In [36]:

df.drop(["iso_code", "week_no"], axis=1, inplace=True)
#df.drop(columns=df.columns[0:1], axis=1, inplace=True)
pop = df.iloc[0]["population"]
last_country = df.iloc[234*176 - 1]
print(last_country)
cnt = df["population"].value_counts()

new_cases                     7.500000e+01
new_deaths                    2.000000e+00
new_vaccinations              0.000000e+00
new_people_vaccinated         0.000000e+00
reproduction_rate             9.500000e-01
stringency_index              8.796000e+01
excess_mortality             -1.000000e+03
population_density            4.272900e+01
median_age                    1.960000e+01
aged_65_older                 2.822000e+00
aged_70_older                 1.882000e+00
cardiovasc_death_rate         3.078460e+02
diabetes_prevalence           1.820000e+00
female_smokers                1.600000e+00
male_smokers                  3.070000e+01
hospital_beds_per_thousand    1.700000e+00
life_expectancy               6.149000e+01
human_development_index       5.710000e-01
population                    1.632054e+07
Name: 41183, dtype: float64


In [37]:
# now we want to join datas together
prediction_days_count = 2
features_list = []
X_list = []
Y_list = []

for i in range(234):  # 234 is number of countries
    for j in range(176):  # 176 is number of weeks we have
        row_list = df.loc[i * 176 + j, :].values.flatten().tolist()
        features_list.append(row_list)

for i in range((len(features_list) - prediction_days_count) + 1):
    fl = features_list[i]
    fl_len = len(features_list[i])
    this_population = features_list[i][fl_len - 1]
    for j in range(1, prediction_days_count):
        if this_population == features_list[i + j][fl_len - 1]:
            fl.extend(features_list[i + j])
        else:
            this_population = -1
            break
    if this_population != -1:
        X_list.append(fl)
        Y_list.append(features_list[(i + prediction_days_count) - 1][0])


In [38]:
print(len(X_list))
print(len(Y_list))
np.shape(X_list)

40950
40950


(40950, 38)

In [39]:
# Here we are going to normalize the data.
from sklearn import preprocessing
x_df = pd.DataFrame(X_list)
min_max_scaler_X = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler_X.fit_transform(x_df)

y_df = pd.DataFrame(Y_list)
min_max_scaler_Y = preprocessing.MinMaxScaler()
y_scaled = min_max_scaler_Y.fit_transform(y_df)

X_scaled_list = x_scaled.tolist()
Y_scaled_list = y_scaled.tolist()

#df = pd.DataFrame(x_scaled)

In [40]:
import math

def train_test_splitter(X, Y, prediction_days_count, batch_count, batch_length):
    xlen = len(X)
    xs_count_for_each_country = int(xlen / 234)
    X_train = []
    X_test = []
    Y_train = []
    Y_test = []
    steps = math.floor((xs_count_for_each_country - (batch_count * batch_length)) / (batch_count + 1)) 
    i = 0
    for i in range(234):
        j = 0
        for j in range((i * xs_count_for_each_country), (i * xs_count_for_each_country) + steps):
            X_train.append(X[j])
            Y_train.append(Y[j])
        while(j + batch_length + steps < (i+1) * xs_count_for_each_country):
            for k in range(j + prediction_days_count, j + batch_length - prediction_days_count):
                X_test.append(X[k])
                Y_test.append(Y[k])
            j += batch_length
            for k in range(j, j + steps):
                X_train.append(X[k])
                Y_train.append(Y[k])
            j += steps + 1
            while j < (i+1) * xs_count_for_each_country:
                X_train.append(X[j])
                Y_train.append(Y[j])
                j += 1
    return X_train, X_test, Y_train, Y_test
        

In [41]:
X_train, X_test, Y_train, Y_test = train_test_splitter(X_scaled_list, Y_scaled_list, prediction_days_count, 3, 10)

In [42]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

In [43]:
print("X_train:", np.shape(X_train),
      ",X_val:" , np.shape(X_val),
      ",X_test:", np.shape(X_test),

      ",Y_train:" ,np.shape(Y_train),
      ",Y_val:", np.shape(Y_val),
      ",Y_test:", np.shape(Y_test)
      )

X_train: (30888, 38) ,X_val: (7722, 38) ,X_test: (1404, 38) ,Y_train: (30888, 1) ,Y_val: (7722, 1) ,Y_test: (1404, 1)


In [44]:
X_tr = np.array(X_train)
y_tr = np.array(Y_train)
X_te = np.array(X_test)
y_te = np.array(Y_test)
X_val = np.array(X_val)
y_val = np.array(Y_val)

In [45]:
# Rescale predictions and true values back to original scale
def rescale_data(scaler, data):
    return scaler.inverse_transform(np.array(data).reshape(-1, 1)).flatten()

# Loss functions

In [46]:
def mean_square_error_sqrt (y_test, y_pred):
    return np.sqrt((np.sum(np.power(y_test - y_pred, 2)))/len(y_test))
def mean_absolute_error_self_defined (y_test, y_pred):
    return np.sum(np.abs(y_test - y_pred))/len(y_test)

# Modelling

## Importing loss functions

In [47]:
#from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error
from sklearn.metrics import mean_absolute_percentage_error,mean_absolute_error, mean_squared_error, d2_tweedie_score

## Model Evaluation

In [48]:
def evaluate_model(model, X_train, y_train, X_test, y_test, scaler, model_name):
    y_pred_scaled = model.fit(X_train, y_train).predict(X_test)
    y_pred = rescale_data(scaler, y_pred_scaled)
    y_test_rescaled = rescale_data(scaler, y_test)

    # deviding by 10:
    y_pred /= 10
    y_test_rescaled /= 10

    
    mae = mean_absolute_error_self_defined (y_test_rescaled, y_pred)
    mse = mean_square_error_sqrt(y_test_rescaled, y_pred)
    # mape = mean_absolute_percentage_error(y_test_rescaled, y_pred)
    # rmse = mean_squared_error(y_test_rescaled, y_pred, squared=False)

    print(f"Evaluating {model_name}")
    print(f"MAE: {mae}")
    print(f"MSE: {mse}")
    # print(f"MAPE: {mape}")
    # print(f"RMSE: {rmse}")
    # print(y_test_rescaled.type)
    print()
    print("Y_test:")
    print(y_test_rescaled[:10])
    print("Y_pred:")
    print(y_pred[:10])
    # plt.figure(figsize=(12, 6))
    # plt.plot(np.arange(len(y_pred)), y_pred, label='Predicted')
    # plt.plot(np.arange(len(y_pred)), y_test_rescaled, alpha=0.6, label='True')
    # plt.title(f"{model_name} Predictions vs True Values")
    # plt.legend()
    # plt.show()

## SVR

In [49]:
from sklearn.svm import SVR

In [50]:
# Train and evaluate SVR models
svrs = [SVR(kernel="rbf", C=50, gamma=0.9, epsilon=0.001),
        SVR(kernel="linear", C=100, gamma="auto"),
        SVR(kernel="poly", C=100, gamma="auto", degree=3, epsilon=0.1, coef0=1)]

for svr in svrs:
    evaluate_model(svr, X_tr, y_tr, X_val, y_val, min_max_scaler_Y, model_name=f"SVR ({svr.kernel})")


  y = column_or_1d(y, warn=True)


Evaluating SVR (rbf)
MAE: 2122.807113167494
MSE: 2561.145374187262

Y_test:
[3.6000e+00 7.2198e+03 3.3600e+01 6.3500e+01 3.1400e+01 3.8900e+01
 0.0000e+00 9.4700e+01 7.4700e+01 2.1166e+03]
Y_pred:
[ 1123.8884186   5163.17726779 -1691.07791568  -643.97556179
  3111.556483    3058.71310399  4361.75938726  -521.22916114
  3885.20390206   163.85044334]
Evaluating SVR (linear)
MAE: 365722.6127873043
MSE: 366311.52536822995

Y_test:
[3.6000e+00 7.2198e+03 3.3600e+01 6.3500e+01 3.1400e+01 3.8900e+01
 0.0000e+00 9.4700e+01 7.4700e+01 2.1166e+03]
Y_pred:
[355828.30601359 361783.57512209 354629.86934506 352732.94632289
 354006.14206784 352966.04225913 355557.55798945 354642.59763536
 404475.90866142 354417.86592499]
Evaluating SVR (poly)
MAE: 374298.7171622779
MSE: 374717.545660939

Y_test:
[3.6000e+00 7.2198e+03 3.3600e+01 6.3500e+01 3.1400e+01 3.8900e+01
 0.0000e+00 9.4700e+01 7.4700e+01 2.1166e+03]
Y_pred:
[363967.94169601 369643.19816453 362987.60584412 361339.21074755
 362548.0101547  36935

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


## KNN

In [51]:
from sklearn.neighbors import KNeighborsRegressor

In [52]:
# Train and evaluate KNeighborsRegressor model
neigh = KNeighborsRegressor(n_neighbors=2, p=2)
evaluate_model(neigh, X_tr, y_tr, X_te, y_te, min_max_scaler_Y, model_name="KNeighborsRegressor")


Evaluating KNeighborsRegressor
MAE: 506.6818376068376
MSE: 1788.2716983926352

Y_test:
[ 27.3  10.5  40.6  43.8  62.7  65.7  97.2  92.  110.1 143.5]
Y_pred:
[ 16.85  28.65  92.45  13.75  13.75  22.65 208.95 133.1  386.7  386.7 ]


## Polynomial

In [53]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import LinearSVR

In [54]:
poly = PolynomialFeatures(degree=2)
X_poly_tr = poly.fit_transform(X_tr)
X_poly_te = poly.transform(X_te)

linear_model = LinearSVR(dual="auto", random_state=42)
evaluate_model(linear_model, X_poly_tr, y_tr, X_poly_te, y_te, min_max_scaler_Y, model_name="Polynomial Regression")

  y = column_or_1d(y, warn=True)


Evaluating Polynomial Regression
MAE: 46.87033423509543
MSE: 124.53693744684797

Y_test:
[ 27.3  10.5  40.6  43.8  62.7  65.7  97.2  92.  110.1 143.5]
Y_pred:
[  7.72651021  -8.02528545  16.38545864  21.40789511  38.0327891
  40.41368189 126.18954868 113.49023137 141.45102646 173.82930481]




## AdaBoost

In [55]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor

In [56]:
# Train and evaluate AdaBoostRegressor model
regr = AdaBoostRegressor(random_state=0, n_estimators=100)
evaluate_model(regr, X_tr, y_tr, X_te, y_te, min_max_scaler_Y, model_name="AdaBoostRegressor")

  y = column_or_1d(y, warn=True)


Evaluating AdaBoostRegressor
MAE: 2546.0747916946098
MSE: 3900.1011898066135

Y_test:
[ 27.3  10.5  40.6  43.8  62.7  65.7  97.2  92.  110.1 143.5]
Y_pred:
[2501.71635148 2501.71635148 2501.71635148 2501.71635148 2501.71635148
 2501.71635148 2501.71635148 2501.71635148 2501.71635148 2501.71635148]


## Random Forest

In [57]:
from sklearn.ensemble import RandomForestRegressor

In [58]:
# Train and evaluate RandomForestRegressor model
regr = RandomForestRegressor(max_depth=4, random_state=0)
evaluate_model(regr, X_tr, y_tr, X_te, y_te, min_max_scaler_Y, model_name="RandomForestRegressor")

  return fit_method(estimator, *args, **kwargs)


Evaluating RandomForestRegressor
MAE: 455.9947120995119
MSE: 751.0454963440693

Y_test:
[ 27.3  10.5  40.6  43.8  62.7  65.7  97.2  92.  110.1 143.5]
Y_pred:
[308.58380774 308.58380774 308.58380774 308.58380774 308.58380774
 308.58380774 308.58380774 308.58380774 308.58380774 308.58380774]


## Stacking

In [59]:
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import StackingRegressor

In [60]:
# Train and evaluate StackingRegressor model
estimators = [('lr', RidgeCV()), ('svr', LinearSVR(dual="auto", random_state=42))]
reg = StackingRegressor(estimators=estimators,
                        final_estimator=RandomForestRegressor(n_estimators=10, random_state=42))
evaluate_model(reg, X_tr, y_tr, X_te, y_te, min_max_scaler_Y, model_name="StackingRegressor")

  y = column_or_1d(y, warn=True)


Evaluating StackingRegressor
MAE: 22.143143083224654
MSE: 67.15826408618388

Y_test:
[ 27.3  10.5  40.6  43.8  62.7  65.7  97.2  92.  110.1 143.5]
Y_pred:
[ 13.62274656  13.62274656  13.62274656  13.62274656  13.62274656
  13.62274656  72.25065755  23.04662252  85.643296   143.14166667]


In [61]:
#check to see if my rescaling is working as intended

# Test scaling and rescaling
sample_y = [y_scaled[0], y_scaled[10], y_scaled[100]]
print("Sample scaled values:", sample_y)

rescaled_y = rescale_data(min_max_scaler_Y, sample_y)
print("Rescaled values:", rescaled_y)

original_y = [Y_list[0], Y_list[10], Y_list[100]]
print("Original values:", original_y)

# Check if rescaled values match the original values
print("Match:", np.allclose(rescaled_y, original_y))

Sample scaled values: [array([0.]), array([4.20007403e-07]), array([4.44713721e-06])]
Rescaled values: [  0.  17. 180.]
Original values: [0.0, 17.0, 180.0]
Match: True


# TEST

In [62]:
# ns_list, train_scores, validation_scores = learning_curve(
#                                                    estimator = SVR(**opt_svr_param), 
#                                                    X = X, y = Y, 
#                                                    train_sizes = ns_list, cv = 5,
#                                                    scoring = 'neg_mean_squared_error')


In [63]:
# train_scores, valid_scores = validation_curve(KNeighborsRegressor(), X, Y, 
#                                               param_name="n_neighbors",
#                                               param_range=k_list , cv=20, 
#                                               scoring = 'neg_mean_squared_error',
#                                               verbose=1, n_jobs=-1
#                                              )
