In [30]:
!pip install xgboost



In [31]:
import pandas as pd
# import numpy as np

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.multioutput import MultiOutputRegressor

import xgboost as xgb

import os

In [32]:
!pip install ucimlrepo



In [33]:
# Disable chained assignment warning
pd.options.mode.chained_assignment = None  # default: 'warn'

In [34]:
# path to project: Edit here to point to your Drive if using colab
drive = '/content/drive/MyDrive/'
project_dir = 'Colab Notebooks/IronHacks2024_Data_Analytics/DA_Week_7_ML/'
COLAB_PROJECT_PATH = drive+project_dir+'ML_Project/Hoang_Changes/'

In [35]:
using_colab = False
if 'google.colab' in str(get_ipython()):
    from google.colab import drive
    # Mount Google Drive
    drive.mount('/content/drive', force_remount=True)
    using_colab = True

if using_colab and os.path.exists(COLAB_PROJECT_PATH):
    print('found colab path; redirecting to main project directory')
    os.chdir(COLAB_PROJECT_PATH)

Mounted at /content/drive
found colab path; redirecting to main project directory


In [36]:
# Vars representing folders to store our Metrics:
METRICS_PICKLE_DF = './Generated_Files'
FINAL_RESULTS = './Models_Results'

In [37]:
def create_folder_if_not_exists(folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print("Folder created:", folder_path)

create_folder_if_not_exists(METRICS_PICKLE_DF) # folder for storing metrics as pickle files
create_folder_if_not_exists(FINAL_RESULTS) # folder for storing all Models metrics

Folder created: ./Generated_Files
Folder created: ./Models_Results


In [38]:
from data_cleaning import fetch_and_clean_data

tetouan = fetch_and_clean_data()

In [39]:
tetouan['Month'].value_counts()

Month
1     4464
3     4464
5     4464
7     4464
8     4464
10    4464
4     4320
6     4320
9     4320
11    4320
12    4320
2     4032
Name: count, dtype: int64

In [40]:
tetouan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52416 entries, 0 to 52415
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Month                  52416 non-null  int32  
 1   Monday                 52416 non-null  int64  
 2   Tuesday                52416 non-null  int64  
 3   Wednesday              52416 non-null  int64  
 4   Thursday               52416 non-null  int64  
 5   Friday                 52416 non-null  int64  
 6   Saturday               52416 non-null  int64  
 7   Sunday                 52416 non-null  int64  
 8   Morning                52416 non-null  int64  
 9   Afternoon              52416 non-null  int64  
 10  Evening                52416 non-null  int64  
 11  Night                  52416 non-null  int64  
 12  Temp                   52416 non-null  float64
 13  Humidity               52416 non-null  float64
 14  Wind_Speed             52416 non-null  float64
 15  ge

In [41]:
features = tetouan.drop(columns = ["Zone_1_PC","Zone_2_PC","Zone_3_PC"])
target = tetouan[["Zone_1_PC","Zone_2_PC","Zone_3_PC"]]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=0)

In [79]:
# scale feature data
def scale_data(train_feature_data, test_feature_data, normalizer):
    normalizer.fit(train_feature_data)

    train_norm_arr = normalizer.transform(train_feature_data)
    test_norm_arr = normalizer.transform(test_feature_data)

    train_feature_data_norm = pd.DataFrame(train_norm_arr, columns = train_feature_data.columns)
    test_feature_data_norm = pd.DataFrame(test_norm_arr, columns = test_feature_data.columns)

    return train_feature_data_norm, test_feature_data_norm

#  Eval model function
def eval_model(model, X_test_data, y_test_labels, get_metrics:bool=False):
    pred = model.predict(X_test_data)

    mae = mean_absolute_error(pred, y_test_labels)
    rmse = mean_squared_error(pred, y_test_labels, squared=False)
    r2 = model.score(X_test_data, y_test_labels)

    print("MAE: ", mae)
    print("RMSE: ", rmse)
    print("R2 Score: ", r2)
    print('-'*45)

    if get_metrics:
        return mae, rmse, r2

In [80]:
def test_model(model, train_data, train_labels, test_data, test_labels):
    # fit model to data
    model.fit(train_data, train_labels)
    # Test performance
    eval_model(model, test_data, test_labels)

def store_pickle(df, file_name):
    pickle_file = os.path.join(METRICS_PICKLE_DF, f'{file_name}.pkl')
    df.to_pickle(pickle_file)

def model_metrics(model, train_data, train_labels, test_data, test_labels, model_name:str='Model'):
    model.fit(train_data, train_labels)
    # Test performance
    mae, rmse, r2 = eval_model(model, test_data, test_labels, get_metrics=True)

    # Perform cross-validation and calculate mean cross-validated MAE
    cv_mae = -cross_val_score(model, train_data, train_labels, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1).mean()

    # Get dataframe
    metrics_df = pd.DataFrame({
    'Model': model_name,
    'MAE': mae,
    'RMSE': rmse,
    'R2 Score': r2,
    'Cross-validated MAE': cv_mae
    }, index = [0])

    # store dataframe onto pickle file:
    store_pickle(metrics_df, model_name)

    return metrics_df

In [81]:
def store_all_results(pickle_path:str=None):
    if not pickle_path:
        print('No files to store: Please enter file dir and/or files')
        return

    assert os.path.isdir(pickle_path), f"Folder {pickle_path} doesn't exist"

    pkl_files = [os.path.join(pickle_path, pkl_file) for pkl_file in os.listdir(pickle_path)]

    all_metrics = [pd.read_pickle(pkl_file) for pkl_file in pkl_files]

    result_metrics = pd.concat(all_metrics, ignore_index=True)

    csv_file = os.path.join(FINAL_RESULTS, 'ML_Models_Metrics.csv')
    result_metrics.to_csv(csv_file, index=False)

In [82]:
scaler = MinMaxScaler()
X_train_scaled, X_test_scaled = scale_data(X_train, X_test, normalizer=scaler)

X_train_scaled.head()

Unnamed: 0,Month,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Morning,Afternoon,Evening,Night,Temp,Humidity,Wind_Speed,general diffuse flows,diffuse flows
0,0.909091,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.322961,0.901749,0.003575,3.1e-05,8.8e-05
1,0.363636,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.583549,0.382219,0.004974,0.77386,0.069434
2,0.272727,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.340641,0.94728,0.003575,4.7e-05,0.000135
3,0.363636,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.483448,0.736401,0.003109,0.653481,0.304265
4,0.818182,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.404836,0.895758,0.756257,4.6e-05,9.1e-05


In [83]:
X_test_scaled.head()

Unnamed: 0,Month,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday,Morning,Afternoon,Evening,Night,Temp,Humidity,Wind_Speed,general diffuse flows,diffuse flows
0,0.454545,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.468215,0.938893,0.756412,4.4e-05,0.000139
1,0.818182,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.471751,0.690391,0.757034,6.3e-05,0.000111
2,0.727273,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.646384,0.424635,0.757345,0.674118,0.070983
3,0.545455,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.591709,0.60496,0.755324,0.51453,0.106507
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.17961,0.712437,0.00513,4e-05,0.000111


In [84]:
# KNN Regression with 10 Neighbors
knn_regress = KNeighborsRegressor(n_neighbors=10)

# K Nearest Neighbors Test for unscaled data
test_model(knn_regress, X_train, y_train, X_test, y_test)
# K Nearest Neighbors Test for scale data
model_metrics(knn_regress, X_train_scaled, y_train, X_test_scaled, y_test, 'KNNeighborsRegressor')

MAE:  2432.8949565880557
RMSE:  3439.233583706278
R2 Score:  0.702652544343792
---------------------------------------------
MAE:  1202.9575022402605
RMSE:  1876.7414229397734
R2 Score:  0.9113606511292204
---------------------------------------------


Unnamed: 0,Model,MAE,RMSE,R2 Score,Cross-validated MAE
0,KNNeighborsRegressor,1202.957502,1876.741423,0.911361,1377.008317


In [85]:
# linear regression
lin_reg = LinearRegression() # define linear regression model
model_metrics(lin_reg, X_train_scaled, y_train, X_test_scaled, y_test, 'Linear_Regression')

# This R2 of 0.50 is way lower than our K Nearest Neighbors model strength, so this is not the way to go.

MAE:  3540.9856711677785
RMSE:  4422.782751901611
R2 Score:  0.5095904667571047
---------------------------------------------


Unnamed: 0,Model,MAE,RMSE,R2 Score,Cross-validated MAE
0,Linear_Regression,3540.985671,4422.782752,0.50959,3541.723772


In [86]:
# Decision Tree
tree = DecisionTreeRegressor(max_depth=10)
model_metrics(tree, X_train_scaled, y_train, X_test_scaled, y_test, "DecisionTreeRegressor")

# 0.77 is better than the linear regression, but still not nearly as good as K Nearest Neighbors

MAE:  2049.366812364697
RMSE:  2964.658400906108
R2 Score:  0.7755605695105464
---------------------------------------------


Unnamed: 0,Model,MAE,RMSE,R2 Score,Cross-validated MAE
0,DecisionTreeRegressor,2049.366812,2964.658401,0.775561,2069.523736


## Ensemble Methods

In [87]:
# Bagging and Pasting

# Define Model
bagging_reg = BaggingRegressor(DecisionTreeRegressor(max_depth=20),
                               n_estimators=100,
                               max_samples = 1000)

model_metrics(bagging_reg, X_train_scaled, y_train, X_test_scaled, y_test,"BaggingRegressor")

# Slightly less accurate than Decision Tree

MAE:  2143.814550009894
RMSE:  2977.950667973093
R2 Score:  0.7736112926975592
---------------------------------------------


Unnamed: 0,Model,MAE,RMSE,R2 Score,Cross-validated MAE
0,BaggingRegressor,2143.81455,2977.950668,0.773611,2146.352055


In [88]:
# Also df output + Cross-validated MAE

# Initialize Random Forest
forest = RandomForestRegressor(n_estimators=100, max_depth=20)
model_metrics(forest, X_train_scaled, y_train, X_test_scaled, y_test, model_name='RandomForestRegressor')

# # Best Model yet - 0.920037

MAE:  1068.664135363537
RMSE:  1775.3895354555796
R2 Score:  0.9194194954757228
---------------------------------------------




Unnamed: 0,Model,MAE,RMSE,R2 Score,Cross-validated MAE
0,RandomForestRegressor,1068.664135,1775.389535,0.919419,1160.940767


In [94]:
# AdaBoost
# Initialize AdaBoost model
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=20),
                            n_estimators=100)

# AdaBoost Model needs to have just one regressor variable to look at, not 3
# Create subsets
y_train_1 = y_train.drop(columns=['Zone_2_PC','Zone_3_PC'])
y_train_2 = y_train.drop(columns=['Zone_1_PC','Zone_3_PC'])
y_train_3 = y_train.drop(columns=['Zone_1_PC','Zone_2_PC'])
y_test_1 = y_test.drop(columns=['Zone_2_PC','Zone_3_PC'])
y_test_2 = y_test.drop(columns=['Zone_1_PC','Zone_3_PC'])
y_test_3 = y_test.drop(columns=['Zone_1_PC','Zone_2_PC'])

print('AdaBoost Model 1:')
ada_1_df = model_metrics(ada_reg, X_train_scaled, y_train_1, X_test_scaled, y_test_1, model_name='AdaBoost_Model_1')
print('AdaBoost Model 2:')
ada_2_df = model_metrics(ada_reg, X_train_scaled, y_train_2, X_test_scaled, y_test_2, model_name='AdaBoost_Model_2')
print('AdaBoost Model 3:')
ada_3_df = model_metrics(ada_reg, X_train_scaled, y_train_3, X_test_scaled, y_test_3, model_name='AdaBoost_Model_3')

print(ada_1_df)
print(ada_2_df)
ada_3_df
# Model 1 R2 is 0.9277, best yet.
# Model 2 R2 is 0.9418, best again.
# Model 3 R2 is 0.9654, highest yet.

AdaBoost Model 1:


  y = column_or_1d(y, warn=True)


MAE:  1105.4661638464183
RMSE:  1922.8818516236524
R2 Score:  0.9277873901437182
---------------------------------------------
AdaBoost Model 2:


  y = column_or_1d(y, warn=True)


MAE:  700.8165470844125
RMSE:  1232.6526229824879
R2 Score:  0.9436680461340283
---------------------------------------------
AdaBoost Model 3:


  y = column_or_1d(y, warn=True)


MAE:  700.3799967318384
RMSE:  1255.9049301211403
R2 Score:  0.9648511312169551
---------------------------------------------
              Model          MAE         RMSE  R2 Score  Cross-validated MAE
0  AdaBoost_Model_1  1105.466164  1922.881852  0.927787          1214.489074
              Model         MAE         RMSE  R2 Score  Cross-validated MAE
0  AdaBoost_Model_2  700.816547  1232.652623  0.943668           787.559385


Unnamed: 0,Model,MAE,RMSE,R2 Score,Cross-validated MAE
0,AdaBoost_Model_3,700.379997,1255.90493,0.964851,769.569973


In [95]:
# Less code for AdaBoost + df output

# Initialize AdaBoost model
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=20), n_estimators=100)

# Create subsets
zones = ['Zone_1_PC', 'Zone_2_PC', 'Zone_3_PC']
# y_train_1 = y_train.drop(columns=['Zone_2_PC','Zone_3_PC'])
y_train_subsets = [y_train.drop(columns=[zone for zone in zones if zone != target]) for target in zones]
y_test_subsets = [y_test.drop(columns=[zone for zone in zones if zone != target]) for target in zones]

# DataFrame to store results
results_df = pd.DataFrame(columns=[
    'Model', 'MAE_Zone_1', 'MAE_Zone_2', 'MAE_Zone_3',
    'RMSE_Zone_1', 'RMSE_Zone_2', 'RMSE_Zone_3', 'R2 Score_Zone_1',
    'R2 Score_Zone_2', 'R2 Score_Zone_3', 'Cross-validated R2', 'Cross-validated MAE'
])

# Initialize lists to store results
mae_list, rmse_list, r2_list = [], [], []

# Train and evaluate models
for i, (y_train_subset, y_test_subset) in enumerate(zip(y_train_subsets, y_test_subsets)):
    ada_reg.fit(X_train_scaled, y_train_subset)


    mae, rmse, r2 = eval_model(ada_reg, X_test_scaled, y_test_subset, get_metrics=True)

    # pred = ada_reg.predict(X_test_scaled)
    # mae = mean_absolute_error(pred, y_test_subset)
    # rmse = mean_squared_error(pred, y_test_subset, squared=False)
    # r2 = ada_reg.score(X_test_scaled, y_test_subset)

    mae_list.append(mae)
    rmse_list.append(rmse)
    r2_list.append(r2)

# Calculate Cross-validated MAE as the mean of MAEs
cross_validated_mae = sum(mae_list) / len(mae_list)
# Calculate Cross-validated R2 as the mean of R2s
cross_validated_R2 = sum(r2_list) / len(r2_list)

# Add results to DataFrame
results_df = pd.concat([results_df, pd.DataFrame({
    'Model': ['AdaBoostRegressor'],
    'MAE_Zone_1': [mae_list[0]], 'MAE_Zone_2': [mae_list[1]], 'MAE_Zone_3': [mae_list[2]],
    'RMSE_Zone_1': [rmse_list[0]], 'RMSE_Zone_2': [rmse_list[1]], 'RMSE_Zone_3': [rmse_list[2]],
    'R2 Score_Zone_1': [r2_list[0]], 'R2 Score_Zone_2': [r2_list[1]], 'R2 Score_Zone_3': [r2_list[2]],
    'Cross-validated R2': [cross_validated_R2],
    'Cross-validated MAE': [cross_validated_mae]
})], ignore_index=True)

store_pickle(results_df, 'AdaBoostRegressor_Zones_CV')

results_df

  y = column_or_1d(y, warn=True)


MAE:  1122.0634396795474
RMSE:  1952.0029246608165
R2 Score:  0.9255835808683526
---------------------------------------------


  y = column_or_1d(y, warn=True)


MAE:  693.9851023325034
RMSE:  1228.8636683718553
R2 Score:  0.9440138226736401
---------------------------------------------


  y = column_or_1d(y, warn=True)


MAE:  698.2403965741727
RMSE:  1262.609029340991
R2 Score:  0.964474875924572
---------------------------------------------


Unnamed: 0,Model,MAE_Zone_1,MAE_Zone_2,MAE_Zone_3,RMSE_Zone_1,RMSE_Zone_2,RMSE_Zone_3,R2 Score_Zone_1,R2 Score_Zone_2,R2 Score_Zone_3,Cross-validated R2,Cross-validated MAE
0,AdaBoostRegressor,1122.06344,693.985102,698.240397,1952.002925,1228.863668,1262.609029,0.925584,0.944014,0.964475,0.944691,838.096313


In [91]:
# Gradient Boosting
# ChatGPT says this could take from 1 to 3 hours to run. Fun!

# Import package
from sklearn.multioutput import MultiOutputRegressor

# Initialize model
gb_reg = GradientBoostingRegressor(max_depth=20,
                                   n_estimators=100)

# Set up a multi-output regressor since our target has multiple columns
mor = MultiOutputRegressor(gb_reg)

# Train Model
mor.fit(X_train_scaled, y_train)

# Set Predictor
pred = mor.predict(X_test_scaled)

# Evaluate Model
mae = mean_absolute_error(pred, y_test, multioutput='raw_values')
rmse = mean_squared_error(pred, y_test, multioutput='raw_values', squared=False)
r2 = mor.score(X_test_scaled, y_test)

# Perform cross-validation
scores = cross_val_score(mor, X_train_scaled, y_train, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)

# Create a DataFrame to store the metrics
metrics_df = pd.DataFrame({
    'Model': "GradientBoostingRegressor",
    'MAE_Zone_1': [mae[0]],
    'MAE_Zone_2': [mae[1]],
    'MAE_Zone_3': [mae[2]],
    'RMSE_Zone_1': [rmse[0]],
    'RMSE_Zone_2': [rmse[1]],
    'RMSE_Zone_3': [rmse[2]],
    'R2 Score': [r2],
    'Cross-validated MAE': [-scores.mean()]
})

store_pickle(metrics_df, 'GradientBoostingRegressor_Zones_CV')

metrics_df
# I have no idea why it took Rosemary 30 minutes to run this, cus it ran for me in 2 minutes.
# R2 of 0.9029 puts this in like 3rd place overall.



Unnamed: 0,Model,MAE_Zone_1,MAE_Zone_2,MAE_Zone_3,RMSE_Zone_1,RMSE_Zone_2,RMSE_Zone_3,R2 Score,Cross-validated MAE
0,GradientBoostingRegressor,1389.559768,918.596911,901.502442,2520.995403,1673.309492,1694.575504,0.902693,1181.338469


In [92]:
# Initialize XGBoost model
xgb_reg = xgb.XGBRegressor(max_depth=20, n_estimators=100)

# Set up a multi-output regressor
mor = MultiOutputRegressor(xgb_reg)

# Train Model
mor.fit(X_train_scaled, y_train)

# Set Predictor
pred = mor.predict(X_test_scaled)

# Evaluate Model
mae = mean_absolute_error(pred, y_test, multioutput='raw_values')
rmse = mean_squared_error(pred, y_test, multioutput='raw_values', squared=False)
r2 = mor.score(X_test_scaled, y_test)

# Perform cross-validation
scores = cross_val_score(mor, X_train_scaled, y_train, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)

# Create a DataFrame to store the metrics
metrics_df = pd.DataFrame({
    'Model': "XGBRegressor",
    'MAE_Zone_1': [mae[0]],
    'MAE_Zone_2': [mae[1]],
    'MAE_Zone_3': [mae[2]],
    'RMSE_Zone_1': [rmse[0]],
    'RMSE_Zone_2': [rmse[1]],
    'RMSE_Zone_3': [rmse[2]],
    'R2 Score': [r2],
    'Cross-validated MAE': [-scores.mean()]
})

store_pickle(metrics_df, 'XGBRegressor_Zones_CV')

metrics_df
# Cross-validated MAE 1085.601088 meaning generally prediction would be 1085.601088 unit from the actual PC, considering ~20k actual



Unnamed: 0,Model,MAE_Zone_1,MAE_Zone_2,MAE_Zone_3,RMSE_Zone_1,RMSE_Zone_2,RMSE_Zone_3,R2 Score,Cross-validated MAE
0,XGBRegressor,1281.968658,860.88221,825.08289,2223.664733,1490.362046,1449.666319,0.92475,1085.601088


In [96]:
store_all_results(METRICS_PICKLE_DF)