In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from datetime import datetime, timedelta
import pandas as pd

# Load the monthly data
data = pd.read_csv('/home/jojo/Windows Shared Folder/Monthly_covid19deathsCanada.csv')

# Sort the data by date in case it's not already sorted
data['date'] = pd.to_datetime(data['date'])
data = data.sort_values('date')

# Create a new column for the target variable (sum of deaths two months ahead)
data['target_deaths'] = data['total_deaths'].shift(-2)

# Drop rows with NaN values in the target_deaths column
data.dropna(subset=['target_deaths'], inplace=True)

# Features and target variable
features = ['total_deaths', 'total_tests', 'stringency_index']
target = 'target_deaths'

# Filter data from March 2022
data_from_march_2022 = data[data['date'] >= datetime(2022, 3, 1)]

# Get the date two months ahead from March 2022
future_date = datetime(2022, 3, 1) + timedelta(days=61)

# Get features for March 2022
features_march_2022 = data_from_march_2022[data_from_march_2022['date'] == datetime(2022, 3, 1)][features]

# Impute missing values with the mean
imputer = SimpleImputer(strategy='mean')
data_from_march_2022_imputed = imputer.fit_transform(data_from_march_2022[features])

# Create and fit the linear regression model with imputed data
model = LinearRegression()
model.fit(data_from_march_2022_imputed, data_from_march_2022[target])

# Predict for two months ahead from March 2022 using imputed features
future_predictions = model.predict(imputer.transform(features_march_2022))

# Get the date two months ahead in string format
future_date_string = future_date.strftime('%B %Y')

# Print the predicted deaths for the future date
print(f"Predicted deaths for {future_date_string}: {future_predictions[0]}")


Predicted deaths for May 2022: 1229514.4436394877


In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from datetime import timedelta

# Load the monthly data
data = pd.read_csv('/home/jojo/Windows Shared Folder/Monthly_covid19deathsCanada.csv')

# Sort the data by date in case it's not already sorted
data['date'] = pd.to_datetime(data['date'])
data = data.sort_values('date')

# Create a new column for the target variable (sum of deaths two months ahead)
data['target_deaths'] = data['total_deaths'].shift(-2)

# Drop rows with NaN values in the target_deaths column
data.dropna(subset=['target_deaths'], inplace=True)

# Features and target variable
features = ['total_deaths', 'total_tests', 'stringency_index']
target = 'target_deaths'

# Filter data from March 2022
data_from_march_2022 = data[data['date'] >= datetime(2022, 3, 1)]

# Get the date two months ahead from March 2022
future_date = datetime(2022, 3, 1) + timedelta(days=61)

# Get features for March 2022
features_march_2022 = data_from_march_2022[data_from_march_2022['date'] == datetime(2022, 3, 1)][features]

# Create and fit the decision tree regressor model
model = DecisionTreeRegressor(random_state=42)
model.fit(data_from_march_2022[features], data_from_march_2022[target])

# Predict for two months ahead from March 2022
future_predictions = model.predict(features_march_2022)

# Get the date two months ahead in string format
future_date_string = future_date.strftime('%B %Y')

# Print the prediction for two months ahead from March 2022
print(f"For {future_date_string}, Predicted sum of deaths two months ahead: {future_predictions[0]}")
# Iterate through each month in the test set
test_months = data['date'].unique()


For May 2022, Predicted sum of deaths two months ahead: 1315659.4285714286


In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, f1_score, accuracy_score
from sklearn.impute import SimpleImputer
from datetime import datetime, timedelta
import pandas as pd
import numpy as np

# Load the monthly data
data = pd.read_csv('/home/jojo/Windows Shared Folder/Monthly_covid19deathsCanada.csv')

# Sort the data by date in case it's not already sorted
data['date'] = pd.to_datetime(data['date'])
data = data.sort_values('date')

# Create a new column for the target variable (sum of deaths two months ahead)
data['target_deaths'] = data['total_deaths'].shift(-2)

# Drop rows with NaN values in the target_deaths column
data.dropna(subset=['target_deaths'], inplace=True)

# Features and target variable
features = ['total_deaths', 'total_tests', 'stringency_index']
target = 'target_deaths'

# Filter data from March 2022
data_from_march_2022 = data[data['date'] >= datetime(2022, 3, 1)]

# Split the data into training and testing sets
train_data = data_from_march_2022[data_from_march_2022['date'] < datetime(2023, 9, 1)].copy()
test_data = data_from_march_2022[data_from_march_2022['date'] >= datetime(2023, 9, 1)].copy()

# Impute missing values in the training set only
imputer = SimpleImputer(strategy='mean')
train_data.loc[:, features] = imputer.fit_transform(train_data[features])
test_data.loc[:, features] = imputer.transform(test_data[features])

# Create and fit the linear regression model with training data
model = LinearRegression()
model.fit(train_data[features], train_data[target])

# Predict for the test set
test_predictions = model.predict(test_data[features])

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(test_data[target], test_predictions)
print(f"Mean Absolute Error: {mae}")

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(test_data[target], test_predictions)
print(f"Mean Squared Error: {mse}")

# Calculate Mean Absolute Percentage Error (MAPE)
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape = mean_absolute_percentage_error(test_data[target], test_predictions)
print(f"Mean Absolute Percentage Error: {mape}")

# Convert predictions to binary values for classification (assuming a threshold)
threshold = 100  # Set your threshold value
predicted_classes = np.where(test_predictions > threshold, 1, 0)
actual_classes = np.where(test_data[target] > threshold, 1, 0)

# Calculate F1 Score
f1 = f1_score(actual_classes, predicted_classes)
print(f"F1 Score: {f1}")

# Calculate Accuracy
accuracy = accuracy_score(actual_classes, predicted_classes)
print(f"Accuracy: {accuracy}")



Mean Absolute Error: 766302.2532631285
Mean Squared Error: 587219143356.1478
Mean Absolute Percentage Error: 89.86226397160351
F1 Score: 1.0
Accuracy: 1.0


In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error, mean_absolute_error, f1_score, accuracy_score
import numpy as np

# Load the monthly data
data = pd.read_csv('/home/jojo/Windows Shared Folder/Monthly_covid19deathsCanada.csv')

# Sort the data by date in case it's not already sorted
data['date'] = pd.to_datetime(data['date'])
data = data.sort_values('date')

# Create a new column for the target variable (sum of deaths two months ahead)
data['target_deaths'] = data['total_deaths'].shift(-2)

# Drop rows with NaN values in the target_deaths column
data.dropna(subset=['target_deaths'], inplace=True)

# Features and target variable
features = ['total_deaths', 'total_tests', 'stringency_index']
target = 'target_deaths'

# Filter data from March 2022
data_from_march_2022 = data[data['date'] >= datetime(2022, 3, 1)]

# Get the date two months ahead from March 2022
future_date = datetime(2022, 3, 1) + timedelta(days=61)

# Get features for March 2022
features_march_2022 = data_from_march_2022[data_from_march_2022['date'] == datetime(2022, 3, 1)][features]

# Create and fit the decision tree regressor model
model = DecisionTreeRegressor(random_state=42)
model.fit(data_from_march_2022[features], data_from_march_2022[target])

# Predict for two months ahead from March 2022
future_predictions = model.predict(features_march_2022)

# Get the date two months ahead in string format
future_date_string = future_date.strftime('%B %Y')

# Print the prediction for two months ahead from March 2022
print(f"For {future_date_string}, Predicted sum of deaths two months ahead: {future_predictions[0]}")

# Now, let's calculate the evaluation metrics using the model

# Get test data for evaluation
test_data = data[data['date'] >= future_date]
test_data = test_data[test_data['date'] < future_date + timedelta(days=30)]  # Considering a one-month window

# Predictions for the test data
test_predictions = model.predict(test_data[features])

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(test_data[target], test_predictions)
print(f"Mean Absolute Error: {mae}")

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(test_data[target], test_predictions)
print(f"Mean Squared Error: {mse}")

# Calculate Mean Absolute Percentage Error (MAPE)
def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

mape = mean_absolute_percentage_error(test_data[target], test_predictions)
print(f"Mean Absolute Percentage Error: {mape}")

# Convert predictions to binary values for classification (assuming a threshold)
threshold = 100  # Set your threshold value
predicted_classes = np.where(test_predictions > threshold, 1, 0)
actual_classes = np.where(test_data[target] > threshold, 1, 0)

# Calculate F1 Score
f1 = f1_score(actual_classes, predicted_classes)
print(f"F1 Score: {f1}")

# Calculate Accuracy
accuracy = accuracy_score(actual_classes, predicted_classes)
print(f"Accuracy: {accuracy}")


For May 2022, Predicted sum of deaths two months ahead: 1315659.4285714286
Mean Absolute Error: 26459.428571428638
Mean Squared Error: 700101360.3265342
Mean Absolute Percentage Error: 2.052391294712119
F1 Score: 1.0
Accuracy: 1.0
