# Computational Time Analysis

Task: Analyze the computational time needed for model training and deployment.

In [1]:
import os, sys
sys.path.append(os.path.dirname(os.path.abspath(os.getcwd())))

import random
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
import pandas as pd
import deep_snow.dataset
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import os
from tqdm import tqdm

In [2]:
# Location of the validation parquet file. The original absolute path is kept
# as the default so existing runs behave exactly as before; set the
# DEEP_SNOW_VAL_PARQUET environment variable to run this notebook on another
# machine without editing the cell.
DATA_PATH = os.environ.get(
    'DEEP_SNOW_VAL_PARQUET',
    '/home/ayushg12/ML_GEO2024_ayushg12/mlgeo-2024-deep-snow/final_data/mlgeo-final-data/classic_ml_val_v1.parquet',
)
df = pd.read_parquet(DATA_PATH)

# Min-max scale every column except 'aso_sd', which is left in its raw units.
# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split done further down, so held-out rows influence the scaling.
scaler = MinMaxScaler()
cols_to_normalize = [col for col in df.columns if col != 'aso_sd']
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])
print("Length of dataset:", len(df))

Length of dataset: 5763835


In [3]:
# Preview the first five rows of the scaled frame (head() defaults to n=5).
df.head()

Unnamed: 0,aso_sd,snowon_vv,aerosol_optical_thickness,coastal_aerosol,blue,green,red,red_edge1,red_edge2,red_edge3,...,fcf,elevation,slope,aspect,curvature,tri,tpi,latitude,longitude,dowy
0,0.0,0.034741,0.451524,0.361875,0.353116,0.365153,0.375045,0.381241,0.439588,0.464821,...,0.25,0.444744,0.140537,0.827236,0.450719,0.124839,0.455258,0.785057,0.019264,0.056818
1,0.0,0.018444,0.451524,0.59553,0.752536,0.75572,0.760649,0.789616,0.806356,0.801144,...,0.22,0.44639,0.023371,0.905367,0.770828,0.138894,0.722182,0.785054,0.019303,0.056818
2,0.0,0.168401,0.451524,0.77461,0.882065,0.891728,0.89893,0.947582,0.97059,0.955312,...,0.27,0.444428,0.228046,0.230994,0.449177,0.195033,0.473952,0.78505,0.019342,0.056818
3,0.0,0.048596,0.451524,0.522883,0.694983,0.710806,0.722003,0.752439,0.792577,0.797974,...,0.29,0.44109,0.289067,0.203127,0.166511,0.294096,0.210729,0.785047,0.019382,0.056818
4,0.0,0.023445,0.451524,0.378733,0.435581,0.452939,0.464694,0.479667,0.541448,0.564868,...,0.29,0.43925,0.280976,0.16561,0.40758,0.260839,0.361191,0.785044,0.019421,0.056818


In [4]:
# Build the target vector and feature matrix from the entire dataset, then
# hold out 20% of the rows for testing (80% train / 20% test).
# The target is selected by name rather than by position so the split does not
# silently pick the wrong column if the parquet's column order ever changes.
y = df['aso_sd'].values
data = df.drop(columns='aso_sd').values

print(f"There are {data.shape[0]} data samples")
# A fixed random_state makes the shuffled split, and therefore every error
# reported in the cells that follow, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, shuffle=True, random_state=42)

There are 5763835 data samples


In [6]:
# ElasticNet Regressor
en_reg = ElasticNet()
%timeit -r 50 en_reg.fit(X_train, y_train)
en_prediction = en_reg.predict(X_test)
print("Ridge Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=en_prediction))

3.13 s ± 92.2 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)
Ridge Mean Absolute Error: 0.6307025779907183


### Ridge Regression Computational Time

In [7]:
# Ridge Regressor
ridge_reg = Ridge(fit_intercept=False)
%timeit -r 50 ridge_reg.fit(X_train, y_train)
ridge_prediction = ridge_reg.predict(X_test)
print("Ridge Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=ridge_prediction))

796 ms ± 48.9 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)
Ridge Mean Absolute Error: 0.4806266618751161


In [6]:
# Ridge Regressor checking the effect of fit_intercept option
ridge_reg = Ridge(alpha=0, fit_intercept=True)
%timeit -r 50 ridge_reg.fit(X_train, y_train)
ridge_prediction = ridge_reg.predict(X_test)
print("Ridge Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=ridge_prediction))

1.22 s ± 101 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)
Ridge Mean Absolute Error: 0.479965903107131


In [7]:
# Ridge Regressor checking the effect of fit_intercept option by setting True
ridge_reg = Ridge(fit_intercept=True)
%timeit -r 50 ridge_reg.fit(X_train, y_train)
ridge_prediction = ridge_reg.predict(X_test)
print("Ridge Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=ridge_prediction))

1.2 s ± 125 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)
Ridge Mean Absolute Error: 0.47996603276151834


From the above runs it can be observed that when ``fit_intercept`` is set to True the runtime increases significantly compared to when it is False (about 1.2 s versus about 0.8 s per fit). Moreover, ``alpha`` has little effect on runtime: a change in ``alpha`` mostly changes the number of iterations, which varies only slightly and therefore changes the runtime only slightly.

### Dummy Regression Computational Time

In [9]:
# Dummy Regressor for median strategy
dummy_reg = DummyRegressor(strategy="median")
%timeit -r 50 dummy_reg.fit(X_train, y_train)
dummy_prediction = dummy_reg.predict(X_test)
print("Ridge Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=dummy_prediction))

58.3 ms ± 2.59 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)
Ridge Mean Absolute Error: 0.46820807


In [14]:
# Dummy Regressor for mean strategy
dummy_reg = DummyRegressor(strategy="quantile", quantile=0.1)
%timeit -r 50 dummy_reg.fit(X_train, y_train)
dummy_prediction = dummy_reg.predict(X_test)
print("Ridge Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=dummy_prediction))

40.7 ms ± 1.27 ms per loop (mean ± std. dev. of 50 runs, 10 loops each)
Ridge Mean Absolute Error: 0.4682079001899735


In [None]:
# Dummy Regressor for mean strategy
dummy_reg = DummyRegressor(strategy="mean")
%timeit -r 50 dummy_reg.fit(X_train, y_train)
dummy_prediction = dummy_reg.predict(X_test)
print("Ridge Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=dummy_prediction))

3.63 ms ± 258 μs per loop (mean ± std. dev. of 50 runs, 100 loops each)
Ridge Mean Absolute Error: 0.630073


From the above runs it can be clearly observed that when ``strategy`` is set to mean the runtime is significantly lower (3.63 ms) than when it is set to median (58.3 ms) or when a quantile is used (40.7 ms).

### Lasso Regression Computational Time

In [16]:
# Lasso Regressor
lasso_reg = Lasso(fit_intercept=False)
%timeit -r 50 lasso_reg.fit(X_train, y_train)
lasso_prediction = lasso_reg.predict(X_test)
print("Ridge Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=lasso_prediction))

3.41 s ± 138 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)
Ridge Mean Absolute Error: 0.4682079001899735


In [15]:
# Lasso Regressor checking the effect of fit_intercept option
lasso_reg = Lasso(fit_intercept=True)
%timeit -r 50 lasso_reg.fit(X_train, y_train)
lasso_prediction = lasso_reg.predict(X_test)
print("Ridge Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=lasso_prediction))

3.54 s ± 201 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)
Ridge Mean Absolute Error: 0.6300731065574366


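From the above runs it can be observed that setting ``fit_intercept`` to True increases the runtime slightly (3.54 s versus 3.41 s). However, the accuracy is much higher when ``fit_intercept`` is False than when it is True (mean absolute error of 0.468 versus 0.630). Note that these two errors match the Dummy regressor results above (0.468 for the median and quantile strategies, 0.630 for the mean strategy), which suggests that with the default regularization strength Lasso is effectively predicting a constant rather than learning from the features.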

The runs for Ridge, Lasso, and Dummy regression show that Dummy regression has the lowest runtime, followed by Ridge regression, while Lasso regression has the highest runtime.

### Voting Regression

In [17]:
# Voting Regressor (Dummy and Ridge)
dummy_reg = DummyRegressor(strategy="median")
ridge_reg = Ridge(alpha=0.193, fit_intercept=True)
voting_reg = VotingRegressor(estimators=[('dummy', dummy_reg), ('ridge', ridge_reg)])
%timeit -r 50 voting_reg.fit(X_train, y_train)
voting_prediction = voting_reg.predict(X_test)
print("Voting Regressor (Dummy + Ridge) Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=voting_prediction))

1.21 s ± 78.7 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)
Voting Regressor (Dummy + Ridge) Mean Absolute Error: 0.4425146906852208


In [19]:
# Voting Regressor (Dummy and Lasso)
dummy_reg = DummyRegressor(strategy="median")
lasso_reg = Lasso(fit_intercept=True)
voting_reg = VotingRegressor(estimators=[('dummy', dummy_reg), ('lasso', lasso_reg)])
%timeit -r 50 voting_reg.fit(X_train, y_train)
voting_prediction = voting_reg.predict(X_test)
print("Voting Regressor (Dummy + Lasso) Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=voting_prediction))

3.35 s ± 174 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)
Voting Regressor (Dummy + Lasso) Mean Absolute Error: 0.5348634118506119


In [21]:
# Voting Regressor (Ridge and Lasso)
ridge_reg = Ridge(alpha=0.193, fit_intercept=True)
lasso_reg = Lasso(fit_intercept=True)
voting_reg = VotingRegressor(estimators=[('ridge', ridge_reg), ('lasso', lasso_reg)])
%timeit -r 50 voting_reg.fit(X_train, y_train)
voting_prediction = voting_reg.predict(X_test)
print("Voting Regressor (Ridge + Lasso) Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=voting_prediction))

4.36 s ± 213 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)
Voting Regressor (Ridge + Lasso) Mean Absolute Error: 0.512781753975057


In [None]:
# Voting Regressor (Dummy, Ridge and Lasso)
dummy_reg = DummyRegressor(strategy="median")
ridge_reg = Ridge(alpha=0.193, fit_intercept=True)
lasso_reg = Lasso(fit_intercept=True)
voting_reg = VotingRegressor(estimators=[('dummy', dummy_reg), ('ridge', ridge_reg), ('lasso', lasso_reg)])
%timeit -r 50 voting_reg.fit(X_train, y_train)
voting_prediction = voting_reg.predict(X_test)
print("Voting Regressor (Dummy + Ridge + Lasso) Mean Absolute Error:", metrics.mean_absolute_error(y_true=y_test, y_pred=voting_prediction))

4.48 s ± 124 ms per loop (mean ± std. dev. of 50 runs, 1 loop each)
Voting Regressor (Dummy + Ridge + Lasso) Mean Absolute Error: 0.4804221462351734


From the above runs for Voting regression it can be observed that the runtime of a voting regressor depends on the runtimes of the individual models being used. As a result, the dummy + ridge + lasso ensemble has the highest runtime, while the dummy + ridge ensemble has the lowest, since those two models have the lowest individual runtimes compared to the other models.