In [None]:
# %%

# import labraries in the process if need 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# %%

# read data 
data = pd.read_excel("fp-historical-wildfire-data-2006-2023.xlsx")
data.head()

Unnamed: 0,fire_year,fire_number,fire_name,current_size,size_class,fire_location_latitude,fire_location_longitude,fire_origin,general_cause_desc,industry_identifier_desc,...,distance_from_water_source,first_bucket_drop_date,bh_fs_date,bh_hectares,uc_fs_date,uc_hectares,to_fs_date,to_hectares,ex_fs_date,ex_hectares
0,2006,PWF001,,0.1,A,56.249956,-117.18196,Private Land,Resident,,...,,,2006-04-02 22:00:00,0.01,2006-04-02 22:00:00,0.01,,,2006-04-03 10:20:00,0.1
1,2006,EWF002,,0.2,B,53.606367,-115.915733,Provincial Land,Incendiary,,...,,,2006-04-03 13:20:00,0.2,2006-04-03 13:20:00,0.2,,,2006-04-03 14:00:00,0.2
2,2006,EWF001,,0.5,B,53.610933,-115.594267,Provincial Land,Incendiary,,...,,,2006-04-03 13:23:00,0.5,2006-04-03 13:23:00,0.5,,,2006-04-03 15:00:00,0.5
3,2006,EWF003,,0.01,A,53.608867,-115.609467,Provincial Land,Incendiary,,...,,,2006-04-03 14:08:00,0.01,2006-04-03 14:08:00,0.01,,,2006-04-03 15:05:00,0.01
4,2006,PWF002,,0.1,A,56.249956,-117.050249,Provincial Land,Other Industry,Waste Disposal,...,,,2006-04-03 19:57:00,0.1,2006-04-03 20:19:00,0.1,2006-04-03 20:20:00,0.1,2006-04-05 10:18:00,0.1


In [None]:
# %%

# data preparation
# Create a mapping table for forest_area
mapping = {
    'C': 'Calgary',
    'E': 'Edson',
    'G': 'Grande Prairie',
    'H': 'High Level',
    'L': 'Lac La Biche',
    'M': 'Fort McMurray',
    'P': 'Peace River',
    'R': 'Rocky Mountain House',
    'S': 'Slave Lake',
    'W': 'Whitecourt'
}
# Convert the dictionary to a DataFrame
mapping_df = pd.DataFrame(list(mapping.items()), columns=['initial', 'forest_area_full'])
# Extract the first letter of fire_number to create forest_area
data['forest_area'] = data['fire_number'].str[0]
# Merge the original data with the mapping DataFrame
data = data.merge(mapping_df, left_on='forest_area', right_on='initial', how='left')
# Replace the forest_area with the full name
data['forest_area'] = data['forest_area_full']
# Drop the temporary columns used for merging
data.drop(columns=['initial', 'forest_area_full'], inplace=True)

data['reported_date'] = pd.to_datetime(data['reported_date'], errors='coerce')
data['reported_month'] = data['reported_date'].dt.month

print(data['forest_area'].head())
print(data[['reported_date', 'reported_month']].head())

0    Peace River
1          Edson
2          Edson
3          Edson
4    Peace River
Name: forest_area, dtype: object
        reported_date  reported_month
0 2006-04-02 20:46:00               4
1 2006-04-03 12:27:00               4
2 2006-04-03 12:36:00               4
3 2006-04-03 13:23:00               4
4 2006-04-03 19:12:00               4


In [None]:
# %%

# select features
selected_columns = [
    'reported_month', 'forest_area', 'fire_location_latitude', 'fire_location_longitude',
    'fire_origin','general_cause_desc', 'weather_conditions_over_fire', 
    'temperature', 'relative_humidity', 'wind_direction',
    'wind_speed', 'size_class'
]

data_relevant = data[selected_columns]

In [None]:
# %%

# Data cleansing
numerical_columns = data_relevant.select_dtypes(include=[np.number]).columns.tolist()
# numerical_columns .remove('fire_location_latitude')
categorical_columns = data_relevant.select_dtypes(include=[object]).columns.tolist()
# categorical_columns.remove('size_class')

# Fill null value by median for numerical_columns
imputer = SimpleImputer(strategy='median')
data_relevant[numerical_columns] = imputer.fit_transform(data_relevant[numerical_columns])
# fill null value by using "missing" for categorical_columns
data_relevant[categorical_columns] = data_relevant[categorical_columns].fillna('missing')

# Create a feature combination
data_relevant['forest_cause_combined'] = data_relevant['forest_area'] + '_' + data_relevant['general_cause_desc']
# Append to categorical_columns
categorical_columns.append('forest_cause_combined')

# check the result
print(data_relevant.isnull().sum())

reported_month                  0
forest_area                     0
fire_location_latitude          0
fire_location_longitude         0
fire_origin                     0
general_cause_desc              0
weather_conditions_over_fire    0
temperature                     0
relative_humidity               0
wind_direction                  0
wind_speed                      0
size_class                      0
forest_cause_combined           0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_relevant[numerical_columns] = imputer.fit_transform(data_relevant[numerical_columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_relevant[categorical_columns] = data_relevant[categorical_columns].fillna('missing')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_relevant['forest_ca

In [None]:
#%% Check latitude_range in the dataset

# latitude_range = data['fire_location_latitude'].describe()
# print(latitude_range)
# Divide the latitude into groups
# def categorize_latitude(fire_location_latitude):
#     if fire_location_latitude > 56.78:
#         return 'high'
#     elif fire_location_latitude >= 53.19:
#         return 'mid'
#     else:
#         return 'low'

# data_relevant['latitude_category'] = data_relevant['fire_location_latitude'].apply(categorize_latitude)
# categorical_columns.append('latitude_category')

In [None]:
# %%

# Encoding categorical_columns
# data_encoded = pd.get_dummies(data_relevant, columns=categorical_columns)
# check
# print(data_encoded.head())
# Encode categorical features with LabelEncoder

from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    data_relevant[column] = le.fit_transform(data_relevant[column])
    label_encoders[column] = le

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_relevant[column] = le.fit_transform(data_relevant[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_relevant[column] = le.fit_transform(data_relevant[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_relevant[column] = le.fit_transform(data_relevant[column])
A value is tr

In [None]:
# %%

from sklearn.preprocessing import StandardScaler
# numerical_columns standration 
scaler = StandardScaler()
data_relevant[numerical_columns] = scaler.fit_transform(data_relevant[numerical_columns])
# check
print(data_relevant.head())

   reported_month  forest_area  fire_location_latitude  \
0        -1.30896            6                0.484101   
1        -1.30896            1               -0.526989   
2        -1.30896            1               -0.525242   
3        -1.30896            1               -0.526033   
4        -1.30896            6                0.484101   

   fire_location_longitude  fire_origin  general_cause_desc  \
0                -0.843053            4                  11   
1                -0.299861            5                   3   
2                -0.161957            5                   3   
3                -0.168477            5                   3   
4                -0.786551            5                   6   

   weather_conditions_over_fire  temperature  relative_humidity  \
0                             2     0.002880          -1.965742   
1                             2    -0.831584          -1.287494   
2                             2    -0.831584          -1.287494   
3   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_relevant[numerical_columns] = scaler.fit_transform(data_relevant[numerical_columns])


In [None]:
# %%

# Separation of features and target variables
X = data_relevant.drop(columns=['size_class'])
y = data_relevant['size_class']

# Split to training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# using random forest classifier 
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Get the importance of features
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

print("Rank of Features:")
for i in range(X.shape[1]):
    print(f"{i + 1}. Feature {X.columns[indices[i]]} ({importances[indices[i]]})")

Rank of Features:
1. Feature fire_location_latitude (0.18383832155888782)
2. Feature fire_location_longitude (0.150362033488555)
3. Feature relative_humidity (0.12239827194180786)
4. Feature temperature (0.11009873984066249)
5. Feature wind_speed (0.08656205803791009)
6. Feature wind_direction (0.07687606207326275)
7. Feature forest_cause_combined (0.06109546976918299)
8. Feature reported_month (0.05841920878427568)
9. Feature weather_conditions_over_fire (0.04689692617491824)
10. Feature general_cause_desc (0.04588915061354143)
11. Feature forest_area (0.03417944365277759)
12. Feature fire_origin (0.02338431406421805)


In [None]:
# %%

# based on the rank from random forest, drop some features.
# keep top 11 features
# data_new=data_relevant.drop(columns=['fire_origin'])
# categorical_columns.remove('fire_origin')
# categorical_columns.remove('size_class')

# keep top 9 features
data_new=data_relevant.drop(columns=['fire_origin','general_cause_desc','forest_area'])
categorical_columns.remove('fire_origin')
categorical_columns.remove('general_cause_desc')
categorical_columns.remove('forest_area')
categorical_columns.remove('size_class')

print(data_new)
print(categorical_columns)

       reported_month  fire_location_latitude  fire_location_longitude  \
0           -1.308960                0.484101                -0.843053   
1           -1.308960               -0.526989                -0.299861   
2           -1.308960               -0.525242                -0.161957   
3           -1.308960               -0.526033                -0.168477   
4           -1.308960                0.484101                -0.786551   
...               ...                     ...                      ...   
25316       -1.308960                0.563232                -0.369242   
25317       -0.741053               -0.006889                -0.151911   
25318       -0.741053                0.140115                -0.816182   
25319        0.962667                0.456460                -0.678895   
25320        2.098481                0.762989                -0.904215   

       weather_conditions_over_fire  temperature  relative_humidity  \
0                                 2     

In [None]:
# %%

# pip install pycaret
from pycaret.classification import *
# setting
exp_clf = setup(data=data_new, target='size_class', 
                numeric_features=numerical_columns, 
                categorical_features=categorical_columns,
                session_id=123, 
                verbose=False)
# compre all the models
best_model = compare_models()
top3_models = compare_models(n_select=3)
second_best_model = top3_models[1]
third_best_model = top3_models[2]

# get the best model
print("Best Mode:",best_model)
print("Second Best Model:", second_best_model)
print("Third Best Model:", third_best_model)



Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.6848,0.7816,0.6848,0.6297,0.6425,0.244,0.2605,0.862
xgboost,Extreme Gradient Boosting,0.6784,0.7773,0.6784,0.6295,0.6441,0.2502,0.26,0.682
rf,Random Forest Classifier,0.6779,0.7696,0.6779,0.6193,0.6313,0.2161,0.2334,0.931
gbc,Gradient Boosting Classifier,0.6768,0.0,0.6768,0.6032,0.6191,0.1896,0.2147,5.346
ada,Ada Boost Classifier,0.669,0.0,0.669,0.5878,0.6,0.1458,0.176,0.59
ridge,Ridge Classifier,0.6681,0.0,0.6681,0.5613,0.583,0.1036,0.1401,0.104
et,Extra Trees Classifier,0.6676,0.7487,0.6676,0.6093,0.6249,0.2019,0.2157,0.938
lr,Logistic Regression,0.6662,0.0,0.6662,0.5739,0.5958,0.1315,0.159,3.661
dummy,Dummy Classifier,0.6635,0.5,0.6635,0.4402,0.5293,0.0,0.0,0.108
lda,Linear Discriminant Analysis,0.6619,0.0,0.6619,0.5713,0.5969,0.1378,0.1608,0.144


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.6848,0.7816,0.6848,0.6297,0.6425,0.244,0.2605,0.909
xgboost,Extreme Gradient Boosting,0.6784,0.7773,0.6784,0.6295,0.6441,0.2502,0.26,0.75
rf,Random Forest Classifier,0.6779,0.7696,0.6779,0.6193,0.6313,0.2161,0.2334,0.942
gbc,Gradient Boosting Classifier,0.6768,0.0,0.6768,0.6032,0.6191,0.1896,0.2147,5.566
ada,Ada Boost Classifier,0.669,0.0,0.669,0.5878,0.6,0.1458,0.176,0.525
ridge,Ridge Classifier,0.6681,0.0,0.6681,0.5613,0.583,0.1036,0.1401,0.112
et,Extra Trees Classifier,0.6676,0.7487,0.6676,0.6093,0.6249,0.2019,0.2157,0.932
lr,Logistic Regression,0.6662,0.0,0.6662,0.5739,0.5958,0.1315,0.159,2.373
dummy,Dummy Classifier,0.6635,0.5,0.6635,0.4402,0.5293,0.0,0.0,0.101
lda,Linear Discriminant Analysis,0.6619,0.0,0.6619,0.5713,0.5969,0.1378,0.1608,0.148


Best Mode: LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=123, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)
Second Best Model: XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device='cpu', early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, m

In [None]:
# %%

# Tune the hyperparameters of the best model
tuned_model = tune_model(best_model)
# Evaluate the tuned model
evaluate_model(tuned_model)
# Evaluate the model with cross-validation
cv_results = pull()
print(cv_results)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6785,0.7807,0.6785,0.6129,0.6263,0.206,0.2274
1,0.6898,0.7839,0.6898,0.6145,0.6361,0.2283,0.2542
2,0.6813,0.7637,0.6813,0.6034,0.6302,0.2186,0.2383
3,0.6881,0.7823,0.6881,0.632,0.6427,0.2448,0.2632
4,0.6783,0.7765,0.6783,0.6079,0.622,0.1965,0.2201
5,0.6885,0.7833,0.6885,0.6389,0.6385,0.2339,0.2566
6,0.6778,0.7814,0.6778,0.6348,0.6276,0.2086,0.2281
7,0.6817,0.7747,0.6817,0.6145,0.6309,0.215,0.2363
8,0.6992,0.7988,0.6992,0.6344,0.6489,0.2581,0.2837
9,0.6817,0.7746,0.6817,0.6047,0.6277,0.2096,0.2326


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.6785  0.7807  0.6785  0.6129  0.6263  0.2060  0.2274
1       0.6898  0.7839  0.6898  0.6145  0.6361  0.2283  0.2542
2       0.6813  0.7637  0.6813  0.6034  0.6302  0.2186  0.2383
3       0.6881  0.7823  0.6881  0.6320  0.6427  0.2448  0.2632
4       0.6783  0.7765  0.6783  0.6079  0.6220  0.1965  0.2201
5       0.6885  0.7833  0.6885  0.6389  0.6385  0.2339  0.2566
6       0.6778  0.7814  0.6778  0.6348  0.6276  0.2086  0.2281
7       0.6817  0.7747  0.6817  0.6145  0.6309  0.2150  0.2363
8       0.6992  0.7988  0.6992  0.6344  0.6489  0.2581  0.2837
9       0.6817  0.7746  0.6817  0.6047  0.6277  0.2096  0.2326
Mean    0.6845  0.7800  0.6845  0.6198  0.6331  0.2219  0.2441
Std     0.0065  0.0085  0.0065  0.0130  0.0079  0.0182  0.0188


In [None]:
# %%

# Tune the hyperparameters of the second_best model
tuned_model = tune_model(second_best_model)
# Evaluate the tuned model
evaluate_model(tuned_model)
# Evaluate the model with cross-validation
cv_results = pull()
print(cv_results)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6858,0.7848,0.6858,0.6355,0.638,0.2301,0.2509
1,0.696,0.7893,0.696,0.6454,0.6505,0.2625,0.282
2,0.6785,0.766,0.6785,0.6118,0.6316,0.2196,0.2362
3,0.6802,0.7882,0.6802,0.6208,0.6369,0.2338,0.2478
4,0.6783,0.7799,0.6783,0.6136,0.6283,0.2101,0.2293
5,0.6868,0.7894,0.6868,0.6369,0.6422,0.241,0.2591
6,0.6721,0.7788,0.6721,0.6043,0.6258,0.2081,0.223
7,0.6812,0.7773,0.6812,0.6242,0.6369,0.229,0.2455
8,0.7009,0.801,0.7009,0.6418,0.6545,0.2714,0.2936
9,0.684,0.7812,0.684,0.6369,0.6399,0.2329,0.2507


Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.6858  0.7848  0.6858  0.6355  0.6380  0.2301  0.2509
1       0.6960  0.7893  0.6960  0.6454  0.6505  0.2625  0.2820
2       0.6785  0.7660  0.6785  0.6118  0.6316  0.2196  0.2362
3       0.6802  0.7882  0.6802  0.6208  0.6369  0.2338  0.2478
4       0.6783  0.7799  0.6783  0.6136  0.6283  0.2101  0.2293
5       0.6868  0.7894  0.6868  0.6369  0.6422  0.2410  0.2591
6       0.6721  0.7788  0.6721  0.6043  0.6258  0.2081  0.2230
7       0.6812  0.7773  0.6812  0.6242  0.6369  0.2290  0.2455
8       0.7009  0.8010  0.7009  0.6418  0.6545  0.2714  0.2936
9       0.6840  0.7812  0.6840  0.6369  0.6399  0.2329  0.2507
Mean    0.6844  0.7836  0.6844  0.6271  0.6385  0.2338  0.2518
Std     0.0082  0.0088  0.0082  0.0134  0.0086  0.0194  0.0209


In [None]:
# %%

# Tune the hyperparameters of the third_best model
tuned_model = tune_model(third_best_model)
# Evaluate the tuned model
evaluate_model(tuned_model)
# Evaluate the model with cross-validation
cv_results = pull()
print(cv_results)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6712,0.724,0.6712,0.5592,0.5723,0.0872,0.1366
1,0.6678,0.7305,0.6678,0.5612,0.5796,0.0957,0.1343
2,0.6661,0.7007,0.6661,0.5739,0.5818,0.1004,0.1351
3,0.6644,0.7183,0.6644,0.5475,0.5699,0.0821,0.1199
4,0.6738,0.7281,0.6738,0.5696,0.5826,0.1061,0.1538
5,0.6789,0.7282,0.6789,0.5748,0.5887,0.1213,0.1735
6,0.6682,0.7321,0.6682,0.5532,0.569,0.0792,0.1236
7,0.6716,0.7101,0.6716,0.5779,0.5911,0.1198,0.1586
8,0.6699,0.7294,0.6699,0.5617,0.5703,0.077,0.1246
9,0.6716,0.7319,0.6716,0.572,0.5875,0.1108,0.1512


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.6712  0.7240  0.6712  0.5592  0.5723  0.0872  0.1366
1       0.6678  0.7305  0.6678  0.5612  0.5796  0.0957  0.1343
2       0.6661  0.7007  0.6661  0.5739  0.5818  0.1004  0.1351
3       0.6644  0.7183  0.6644  0.5475  0.5699  0.0821  0.1199
4       0.6738  0.7281  0.6738  0.5696  0.5826  0.1061  0.1538
5       0.6789  0.7282  0.6789  0.5748  0.5887  0.1213  0.1735
6       0.6682  0.7321  0.6682  0.5532  0.5690  0.0792  0.1236
7       0.6716  0.7101  0.6716  0.5779  0.5911  0.1198  0.1586
8       0.6699  0.7294  0.6699  0.5617  0.5703  0.0770  0.1246
9       0.6716  0.7319  0.6716  0.5720  0.5875  0.1108  0.1512
Mean    0.6703  0.7233  0.6703  0.5651  0.5793  0.0980  0.1411
Std     0.0039  0.0100  0.0039  0.0096  0.0080  0.0156  0.0166


In [None]:
# %%

# Best parameters for lightgbm

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier
# Define the model
model = LGBMClassifier(random_state=2024)
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}
# Setup GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
# Fit the model
grid_search.fit(X_train, y_train)
# Get the best estimator
best_model = grid_search.best_estimator_
# Predict on the test set
y_pred = best_model.predict(X_test)
# Evaluate the model
print(classification_report(y_test, y_pred))
# Print the best parameters
print("Best parameters for lighgbm found: ", grid_search.best_params_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002689 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1066
[LightGBM] [Info] Number of data points in the train set: 20256, number of used features: 12
[LightGBM] [Info] Start training from score -0.410513
[LightGBM] [Info] Start training from score -1.351748
[LightGBM] [Info] Start training from score -3.052403
[LightGBM] [Info] Start training from score -4.256724
[LightGBM] [Info] Start training from score -4.105065
              precision    recall  f1-score   support

           0       0.74      0.91      0.81      3365
           1       0.43      0.29      0.35      1282
           2       0.36      0.02      0.03       261
           3       0.30      0.04      0.08        69
           4       0.40      0.19      0.26        88

    accuracy                           0.68      50

In [None]:
# %%

# Best parameters for xgboost

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
# Define the model
model = XGBClassifier(random_state=2024)
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}
# Setup GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
# Fit the model
grid_search.fit(X_train, y_train)
# Get the best estimator
best_model = grid_search.best_estimator_
# Predict on the test set
y_pred = best_model.predict(X_test)
# Evaluate the model
print(classification_report(y_test, y_pred))
# Print the best parameters
print("Best parameters for xgboost found: ", grid_search.best_params_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
              precision    recall  f1-score   support

           0       0.74      0.91      0.82      3365
           1       0.45      0.32      0.37      1282
           2       0.18      0.02      0.03       261
           3       0.33      0.03      0.05        69
           4       0.46      0.15      0.22        88

    accuracy                           0.69      5065
   macro avg       0.43      0.28      0.30      5065
weighted avg       0.63      0.69      0.64      5065

Best parameters for xgboost found:  {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.9}
