## 1. Import necessary libraries 

In [1]:
# import pandas
import pandas as pd


## 2.Load the dataset

In [6]:
# Load the dataset test.csv
df = pd.read_csv('test.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,31,32,33,34,35,36,37,38,39,label
0,0,-0.217681,0.357015,-0.6227,-0.828995,-0.493001,0.8996,0.61037,0.747294,1.586017,...,0.681953,-0.589365,-0.151785,-0.310267,0.849602,0.3073,1.305479,-0.591571,0.324166,75.003235
1,1,-1.773032,2.644343,1.958347,0.308051,0.496699,0.25374,-0.343192,0.269127,-1.072743,...,-2.135674,-0.46531,-2.530288,3.137749,-0.105948,0.467693,-0.595661,-0.029263,1.056057,20.4962
2,2,0.484733,-0.18048,-0.252354,0.11327,-1.563191,0.298753,-0.668144,0.919229,-0.645964,...,-0.955123,0.88311,0.12267,0.423599,-0.077837,-0.751791,-0.334775,1.281016,2.062525,6.252874
3,3,1.441273,-1.51937,0.404982,-0.92693,0.917862,1.266911,-1.024388,-3.241267,0.504987,...,0.19906,2.122156,-0.474945,-0.600217,1.032465,-0.707669,-0.981509,-1.430141,0.069802,-107.491031
4,4,1.09131,1.227669,-1.555896,0.558327,0.833529,1.672572,-0.920674,0.538756,-0.581681,...,2.31933,-1.993736,0.034083,0.393318,0.374057,0.419019,1.213098,-0.903908,0.192049,-332.121927


## 3. Split the dependent variables(y) and independent variables(X)

In [10]:
# Spliting the data into dependent variables(y) and independent variables(X)
X = df.drop(columns='label')
X.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,30,31,32,33,34,35,36,37,38,39
0,0,-0.217681,0.357015,-0.6227,-0.828995,-0.493001,0.8996,0.61037,0.747294,1.586017,...,2.133033,0.681953,-0.589365,-0.151785,-0.310267,0.849602,0.3073,1.305479,-0.591571,0.324166
1,1,-1.773032,2.644343,1.958347,0.308051,0.496699,0.25374,-0.343192,0.269127,-1.072743,...,2.074083,-2.135674,-0.46531,-2.530288,3.137749,-0.105948,0.467693,-0.595661,-0.029263,1.056057
2,2,0.484733,-0.18048,-0.252354,0.11327,-1.563191,0.298753,-0.668144,0.919229,-0.645964,...,-0.482744,-0.955123,0.88311,0.12267,0.423599,-0.077837,-0.751791,-0.334775,1.281016,2.062525
3,3,1.441273,-1.51937,0.404982,-0.92693,0.917862,1.266911,-1.024388,-3.241267,0.504987,...,-1.200296,0.19906,2.122156,-0.474945,-0.600217,1.032465,-0.707669,-0.981509,-1.430141,0.069802
4,4,1.09131,1.227669,-1.555896,0.558327,0.833529,1.672572,-0.920674,0.538756,-0.581681,...,-0.649278,2.31933,-1.993736,0.034083,0.393318,0.374057,0.419019,1.213098,-0.903908,0.192049


In [12]:
y = df['label']
y.head()

0     75.003235
1     20.496200
2      6.252874
3   -107.491031
4   -332.121927
Name: label, dtype: float64

## 4. Train Test Split


In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)


## 5. Train the model 

In [18]:
# import the LinearRegression model from sklearn
from sklearn.linear_model import LinearRegression

In [20]:
# Train the model 
model = LinearRegression()
model.fit(X_train, y_train)

## 6. Predicting the Model on the Test Set

In [22]:
# Predicting the dependent variable values for the test set
y_pred = model.predict(X_test)

## 7. Evaluate the model 

In [25]:
# compute mse, rmse, r2_score
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score, mean_absolute_error
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print('The mean squared error is:', mse)
print('The root mean squared error is:', rmse)
print('The mean absolute error is:', mae)
print('The r2_score (accuracy) is:', r2)

The mean squared error is: 10027.564713341211
The root mean squared error is: 100.13772872070352
The mean absolute error is: 76.93628725970812
The r2_score (accuracy) is: 0.7457602741619299


### Is the Model Overfitting? Is the RÂ² Score Low?

If your model performs very well on the training set but poorly on the test set, itâ€™s likely overfitting, meaning the model has learned the noise and details of the training data instead of the general pattern.

When the RÂ² score on the test set is low, it shows that the model isnâ€™t generalizing well to new data.

### Itâ€™s time to apply Regularization!
Regularization techniques like L1 (Lasso) and L2 (Ridge) help reduce overfitting by penalizing large coefficients, leading to a simpler and more stable model.

In [27]:
# importing the necessary libraries
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV


In [29]:
# Define Ridge object
ridge = Ridge()

In [31]:
# Define parameters
parameters = {'alpha': [1,2,3,4,5,6,7,8,9,10,11,12,13,15,18,20]}

### Create GridSearchCV
- Use ``GridSearchCV(estimator=ridge, parameters, cv=5, scoring='neg_mean_squared_error')``

In [35]:
# creating GridSearchCV
grid = GridSearchCV(estimator=ridge, param_grid=parameters, cv=5, scoring='neg_mean_squared_error')

### Fit GridSearchCV
- ``grid.fit(X_train, y_train)`` â€” this finds best alpha by cross-validation.

In [38]:
# Train the model
grid.fit(X_train, y_train)

### Evaluate on test set
- Use ``grid.predict(X_test)`` and evaluate **MSE**, **RMSE**,**MAE** **RÂ²**

In [41]:
# Predict on X_test
y_pred_ridge = grid.predict(X_test)

In [43]:
# Compute metrics
mse = mean_squared_error(y_test, y_pred_ridge)
rmse = root_mean_squared_error(y_test, y_pred_ridge)
mae = mean_absolute_error(y_test, y_pred_ridge)
r2 = r2_score(y_test, y_pred_ridge)

In [45]:
print('The mean squared error is:', mse)
print('The root mean squared error is:', rmse)
print('The mean absolute error is:', mae)
print('The r2_score (accuracy) is:', r2)

The mean squared error is: 3876.9152456071106
The root mean squared error is: 62.26487971245998
The mean absolute error is: 55.86889873074985
The r2_score (accuracy) is: 0.9017043621938233


_**Now see the difference how model imporves.**_ ðŸŽ‰

|

**Can you repeat same process for Lasso (L1)?** <br/>
-> Yes sir, I can!

In [48]:
lasso = Lasso()

In [50]:
# Define parameters
parameters_lasso = {'alpha': [1,2,3,4,5,6,7,8,9,10,11,12,13,15,18,20]}

In [52]:
# creating GridSearchCV
grid_lasso = GridSearchCV(estimator=lasso, param_grid=parameters_lasso, cv=5, scoring='neg_mean_squared_error')

In [54]:
# Train the model
grid_lasso.fit(X_train, y_train)

In [56]:
# Predict on X_test
y_pred_lasso = grid_lasso.predict(X_test)


In [58]:
# Compute metrics
mse = mean_squared_error(y_test, y_pred_lasso)
rmse = root_mean_squared_error(y_test, y_pred_lasso)
mae = mean_absolute_error(y_test, y_pred_lasso)
r2 = r2_score(y_test, y_pred_lasso)

In [60]:
print('The mean squared error is:', mse)
print('The root mean squared error is:', rmse)
print('The mean absolute error is:', mae)
print('The r2_score (accuracy) is:', r2)

The mean squared error is: 721.8632335024025
The root mean squared error is: 26.867512603559014
The mean absolute error is: 22.39658517317427
The r2_score (accuracy) is: 0.9816978183811609


In [62]:
print('=== Model Comparison Summary ===')
print('The difference how models improve! ðŸŽ‰')
print(f'Basic Linear Regression RÂ²: {r2_score(y_test, model.predict(X_test)):.4f}')
print(f'Ridge Regression RÂ²: {r2_score(y_test, y_pred_ridge):.4f}')
print(f'Lasso Regression RÂ²: {r2_score(y_test, y_pred_lasso):.4f}')

=== Model Comparison Summary ===
The difference how models improve! ðŸŽ‰
Basic Linear Regression RÂ²: 0.7458
Ridge Regression RÂ²: 0.9017
Lasso Regression RÂ²: 0.9817
