Necessary imports are vast, but generally either: standard DS library or particular ML algorithm

In [51]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import *
from sklearn.model_selection import *
from sklearn.metrics import *

from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from xgboost.sklearn import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor


In [77]:
%matplotlib qt
rng = np.random.default_rng(100)

SMALL_SIZE = 20
MEDIUM_SIZE = 24
BIGGER_SIZE = 32
CHONK_SIZE = 38
font = {'family' : 'DIN Condensed',
        'weight' : 'bold',
        'size'   : SMALL_SIZE}
plt.rc('font', **font)
plt.rc('axes', titlesize=BIGGER_SIZE, labelsize=MEDIUM_SIZE, facecolor="xkcd:white")
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=CHONK_SIZE, facecolor="xkcd:white", edgecolor="xkcd:black") #  powder blue

drop_lst = ["MU CAPE", "MU CIN", "MU LCL", "MU LFC", "MU EL", "MU LI", "MU hght0c", "MU cap", "MU b3km", "MU brn", "SB CAPE", "SB CIN", "SB LCL", "SB LFC", "SB EL", "SB LI", "SB hght0c",
"SB cap", "SB b3km", "SB brn", "sb_tlcl", "mu_tlcl"]

col_names = ['CAPE', 'CIN', 'LCL', 'LFC', 'EL', 'LI', 'HGHT0C',
            'CAP', 'B3KM', 'BRN', 'SHEAR 0-1 KM', 'SHEAR 0-6 KM',
            'EFF INFLOW', 'EBWD', 'SRH 0-1 KM', 'SRH 0-3 KM', 'EFF SRH', 'SCP',
            'STP-FIXED', 'STP-MIXED', 'SHIP', 'PWAT', 'DCAPE', 'MLMR', 'LRAT',
            'TEI', 'TLCL', 'T500', 'SWEAT', 'K-INDEX', 'CRAV', 'HAIL SIZE IN']

Limited preprocessing here, just reading in data, scaling it, and going with train:test split of 3:1.

In [2]:
path = "/Users/joshuaelms/Desktop/github_repos/CSCI-B365/Meteorology_Modeling_Project/data/pretty_data.csv"
df = pd.read_csv(path)

df_X = df.drop(labels = "Hailstone Size", axis = 1)
df_y = df["Hailstone Size"]

X = df_X.values
y = df_y.values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.25, random_state = 1879)


Perform 5-fold cross validation for 10 different algorithms, sort list by which algorithm performs best.

Note: should have kept time variable in equations here, as cross-validation can be aided by time (not IID otherwise)

In [29]:
algorithms = [LinearRegression(), LGBMRegressor(), XGBRegressor(), CatBoostRegressor(), SGDRegressor(), KernelRidge(), ElasticNet(), BayesianRidge(), GradientBoostingRegressor(), SVR()]
names = ["LinearRegression", "LGBMRegressor", "XGBRegressor", "CatBoostRegressor", "SGDRegressor", "KernelRidge", "ElasticNet", "BayesianRidge", "GradientBoostingRegressor", "SVR"]

performance_dict = {}

for i, algorithm in enumerate(algorithms):
    kwargs = {"estimator": algorithm, "X": X_train, "y": y_train, "cv": 5, "return_estimator": True}
    results_dict = cross_validate(**kwargs)
    # test_scores = 
    performance_dict[names[i]] = results_dict

performance_dict



Learning rate set to 0.064331
0:	learn: 0.5222489	total: 53.1ms	remaining: 53s
1:	learn: 0.5206851	total: 68.1ms	remaining: 34s
2:	learn: 0.5192904	total: 79.2ms	remaining: 26.3s
3:	learn: 0.5178247	total: 94.9ms	remaining: 23.6s
4:	learn: 0.5166476	total: 106ms	remaining: 21.2s
5:	learn: 0.5155200	total: 116ms	remaining: 19.3s
6:	learn: 0.5144794	total: 129ms	remaining: 18.2s
7:	learn: 0.5135903	total: 142ms	remaining: 17.6s
8:	learn: 0.5126695	total: 153ms	remaining: 16.8s
9:	learn: 0.5117973	total: 164ms	remaining: 16.2s
10:	learn: 0.5109748	total: 177ms	remaining: 15.9s
11:	learn: 0.5103081	total: 206ms	remaining: 17s
12:	learn: 0.5095891	total: 236ms	remaining: 17.9s
13:	learn: 0.5090480	total: 277ms	remaining: 19.5s
14:	learn: 0.5084058	total: 312ms	remaining: 20.5s
15:	learn: 0.5078921	total: 325ms	remaining: 20s
16:	learn: 0.5074114	total: 337ms	remaining: 19.5s
17:	learn: 0.5068474	total: 349ms	remaining: 19s
18:	learn: 0.5064018	total: 362ms	remaining: 18.7s
19:	learn: 0.5059

{'LinearRegression': {'fit_time': array([0.14953303, 0.07193899, 0.05760002, 0.05400515, 0.05890703]),
  'score_time': array([0.00774693, 0.00129485, 0.0017302 , 0.00156784, 0.00129318]),
  'estimator': [LinearRegression(),
   LinearRegression(),
   LinearRegression(),
   LinearRegression(),
   LinearRegression()],
  'test_score': array([0.05954167, 0.0610645 , 0.06681388, 0.06713877, 0.03307559])},
 'LGBMRegressor': {'fit_time': array([1.51356411, 0.72931314, 0.79235482, 0.87956095, 0.77128386]),
  'score_time': array([0.01518106, 0.02098298, 0.01409626, 0.02378798, 0.01248813]),
  'estimator': [LGBMRegressor(),
   LGBMRegressor(),
   LGBMRegressor(),
   LGBMRegressor(),
   LGBMRegressor()],
  'test_score': array([0.0815282 , 0.07432976, 0.10912707, 0.09496173, 0.0884023 ])},
 'XGBRegressor': {'fit_time': array([11.263273  , 12.05481601, 12.52139115, 12.31602383, 12.7584362 ]),
  'score_time': array([0.0170188 , 0.01834702, 0.01343894, 0.0134871 , 0.01376104]),
  'estimator': [XGBRegr

In [49]:
best_models = performance_dict["ElasticNet"]["estimator"]

best_preds = [model.predict(X_test) for model in best_models]

len(best_preds[0])

# mae = [mean_absolute_error(y_test, pred) for pred in best_preds]
# mae

# ENresiduals = [pred - y_test for pred in best_preds]

# new = [0 if ENresiduals[i] < 0 else ENresiduals[i] for i in range(len(ENresiduals))]






7276

In [79]:
df_lst = [pd.DataFrame(value) for key, value in performance_dict.items()]

for df in df_lst:
    df = df.sort_values(by = ['test_score'])

scores = {}
for name in names:
    scores[name] = performance_dict[name]["test_score"]

df_scores = pd.DataFrame(scores)
sorted = df_scores.mean().abs().sort_values()

fig, ax = plt.subplots()
ax = df_scores.mean().abs().sort_values().plot.barh(**{"logx": True})
fig.suptitle("5-Fold CV MAE on Log Scale ")

plt.show()
# sns.barplot( )

# scores = {performance_dict[name] for name in names}
plt.subplots_adjust(
top=0.89,
bottom=0.09,
left=0.32,
right=0.96,
hspace=0.185,
wspace=0.2)
# scores



In [53]:
clf = SGDRegressor()
clf2 = KNeighborsRegressor()

knns = cross_validate(**{"estimator": clf2, "X": X_train, "y": y_train, "cv": 5, "return_estimator": True})

# scores = cross_val_predict(clf, X_scaled, y, cv = 5)
# scores
# residuals = scores - y
# residuals

# sns.pointplot(x = y, y = residuals)

In [56]:
pred_knn = [model.predict(X_test) for model in knns["estimator"]]


mae = [mean_absolute_error(y_test, pred) for pred in pred_knn]
mae


[0.3856797691039033,
 0.38531088510170425,
 0.38661737218251785,
 0.3848007146783947,
 0.385153106102254]

In [57]:
knns

{'fit_time': array([0.01050401, 0.0071559 , 0.00668073, 0.00648499, 0.00835204]),
 'score_time': array([2.57653499, 1.82548714, 2.07405305, 2.01010728, 1.94949031]),
 'estimator': [KNeighborsRegressor(),
  KNeighborsRegressor(),
  KNeighborsRegressor(),
  KNeighborsRegressor(),
  KNeighborsRegressor()],
 'test_score': array([-0.03891767, -0.01724103, -0.01372886, -0.01705853, -0.0237274 ])}