In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, HuberRegressor, Lasso, RANSACRegressor, TheilSenRegressor, Ridge, ElasticNet
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, PolynomialFeatures, StandardScaler, RobustScaler
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')


## Reading Train/Test Values

In [2]:
# Load the training frame, the test frame and the sample submission, in that order.
df1, test, subm = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)
# Display the training frame as the cell output.
df1

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,0.5500,0.4125,0.1625,1.715145,0.609514,0.396893,0.566990,4.0
1,1,F,1.5125,1.2125,0.4000,31.312023,13.395139,6.265239,8.930093,10.0
2,2,F,1.5125,1.1750,0.4125,31.552993,14.670866,6.279414,9.922325,11.0
3,3,I,0.8000,0.6000,0.2000,4.620969,3.019222,0.978058,1.417475,7.0
4,4,I,1.3875,1.0875,0.3625,24.323871,11.651644,5.712424,5.386405,8.0
...,...,...,...,...,...,...,...,...,...,...
14995,14995,M,1.3750,1.0625,0.3500,24.734939,11.169703,6.208540,6.095142,9.0
14996,14996,I,1.0750,0.8125,0.2500,12.998246,5.896696,2.338834,2.919999,7.0
14997,14997,I,1.3750,1.1125,0.3625,23.544260,8.674947,4.791066,7.796112,15.0
14998,14998,M,1.4375,1.1125,0.3500,28.448723,14.571643,6.279414,6.803880,10.0


## Dropping the id column as it's unnecessary

In [3]:
# Work on a copy of the training frame without the "id" column; df1 itself is left untouched.
df = df1.drop("id", axis=1)
df

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,I,0.5500,0.4125,0.1625,1.715145,0.609514,0.396893,0.566990,4.0
1,F,1.5125,1.2125,0.4000,31.312023,13.395139,6.265239,8.930093,10.0
2,F,1.5125,1.1750,0.4125,31.552993,14.670866,6.279414,9.922325,11.0
3,I,0.8000,0.6000,0.2000,4.620969,3.019222,0.978058,1.417475,7.0
4,I,1.3875,1.0875,0.3625,24.323871,11.651644,5.712424,5.386405,8.0
...,...,...,...,...,...,...,...,...,...
14995,M,1.3750,1.0625,0.3500,24.734939,11.169703,6.208540,6.095142,9.0
14996,I,1.0750,0.8125,0.2500,12.998246,5.896696,2.338834,2.919999,7.0
14997,I,1.3750,1.1125,0.3625,23.544260,8.674947,4.791066,7.796112,15.0
14998,M,1.4375,1.1125,0.3500,28.448723,14.571643,6.279414,6.803880,10.0


## Encoding the "Sex" column from object (categorical) to numerical

In [4]:
# One-hot encode the "Sex" column of the training and test frames with integer 0/1 indicators.
# Each frame is encoded independently, so the resulting dummy columns follow the
# categories present in that frame.
df, test = (
    pd.get_dummies(frame, columns=['Sex'], dtype=int)
    for frame in (df, test)
)


In [5]:
# Remove the "Sex_Diameter" indicator column from the training frame only.
# NOTE(review): presumably this dummy comes from a stray "Diameter" value in the
# training "Sex" column — confirm against train.csv. The test frame is not changed here.
df = df.drop("Sex_Diameter", axis=1)

## Determining Dependent and Independent Columns/Features

In [6]:
# Target: the "Age" column. Features: every remaining column of the encoded training frame.
y = df["Age"]
X = df.drop(columns="Age")

## Calculating Metrics using Linear Regression

In [7]:

# Baseline: fit LinearRegression on the full training frame and score it on that
# same data, so the numbers below are in-sample metrics.
regression = LinearRegression().fit(X, y)
y_pred = regression.predict(X)

# r2 is computed alongside the other metrics but is not printed in this cell.
rmse, mae, r2 = (
    metric(y, y_pred)
    for metric in (root_mean_squared_error, mean_absolute_error, r2_score)
)

print(f"RMSE: {rmse:.4f}", f"MAE: {mae:.4f}", sep="\n")



RMSE: 2.0337
MAE: 1.4051


## Calculating Metrics using Huber Regression

In [8]:

# HuberRegressor with library defaults, fitted and evaluated on the same training data.
regression = HuberRegressor()
y_pred = regression.fit(X, y).predict(X)

# In-sample error metrics; r2 is stored but not printed in this cell.
rmse = root_mean_squared_error(y_true=y, y_pred=y_pred)
mae = mean_absolute_error(y_true=y, y_pred=y_pred)
r2 = r2_score(y_true=y, y_pred=y_pred)

for line in (f"RMSE: {rmse:.4f}", f"MAE: {mae:.4f}"):
    print(line)
del line  # keep the notebook namespace free of the loop helper


RMSE: 2.0716
MAE: 1.3701


## Calculating Metrics using Lasso Regression

In [9]:
# Lasso with library defaults, fitted and scored on the full training frame (in-sample).
regression = Lasso().fit(X, y)
y_pred = regression.predict(X)

# r2 is computed but only RMSE and MAE are reported here.
rmse, mae, r2 = (
    root_mean_squared_error(y, y_pred),
    mean_absolute_error(y, y_pred),
    r2_score(y, y_pred),
)

print(f"RMSE: {rmse:.4f}", f"MAE: {mae:.4f}", sep="\n")

RMSE: 2.3985
MAE: 1.6779


## Calculating Metrics using ElasticNet Regression

In [10]:
# ElasticNet with library defaults; predictions are made on the same rows used for fitting.
regression = ElasticNet()
regression.fit(X, y)
y_pred = regression.predict(X)

# Evaluate all three metrics in one pass; r2 is kept but not printed.
rmse, mae, r2 = [
    score_fn(y, y_pred)
    for score_fn in (root_mean_squared_error, mean_absolute_error, r2_score)
]

print(f"RMSE: {rmse:.4f}", f"MAE: {mae:.4f}", sep="\n")

RMSE: 2.2508
MAE: 1.5640


## Calculating Metrics using Ridge Regression

In [11]:
# Ridge with library defaults, trained on the whole training frame and scored in-sample.
regression = Ridge().fit(X, y)
y_pred = regression.predict(X)

# r2 is assigned for completeness; only RMSE and MAE are printed below.
r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
rmse = root_mean_squared_error(y, y_pred)

print(f"RMSE: {rmse:.4f}", f"MAE: {mae:.4f}", sep="\n")

RMSE: 2.0342
MAE: 1.4050


## Calculating Metrics using Ransac Regression

In [12]:
# RANSAC fitted on the full training frame and scored on that same data (in-sample).
# RANSAC fits on randomly drawn subsets, so without a fixed seed the metrics change on
# every run. The seed is pinned to 42, the same value the RandomizedSearchCV calls in
# this notebook use, so Restart & Run All reproduces the printed numbers.
regression = RANSACRegressor(random_state=42)
regression.fit(X, y)

y_pred = regression.predict(X)

# r2 is computed alongside the other metrics but is not printed in this cell.
rmse = root_mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

RMSE: 2.1485
MAE: 1.4142


## Calculating Metrics using TheilSen Regression

In [13]:
# Theil-Sen fitted on the full training frame and scored on that same data (in-sample).
# The estimator can fall back to random subset sampling on larger datasets, so the seed
# is pinned to 42 (the value the RandomizedSearchCV calls in this notebook use) to keep
# the printed metrics reproducible across runs.
regression = TheilSenRegressor(random_state=42)
regression.fit(X, y)

y_pred = regression.predict(X)

# r2 is computed alongside the other metrics but is not printed in this cell.
rmse = root_mean_squared_error(y, y_pred)
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

RMSE: 2.0507
MAE: 1.3781


## Polynomial Features with Degree 2

In [14]:
# Expand the feature matrix with degree-2 polynomial terms (no constant column),
# then fit a plain LinearRegression on the expanded features.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

regression = LinearRegression().fit(X_poly, y)
y_pred = regression.predict(X_poly)

# In-sample metrics on the expanded training features; this cell also reports R^2.
rmse, mae, r2 = (
    metric(y, y_pred)
    for metric in (root_mean_squared_error, mean_absolute_error, r2_score)
)

print(f"RMSE: {rmse:.4f}", f"MAE: {mae:.4f}", f"R^2: {r2:.4f}", sep="\n")


RMSE: 1.9667
MAE: 1.3546
R^2: 0.6238


## Dropping ID column from Test.csv


In [15]:
# Feature frame for the test set: the encoded test data without its "id" column.
X_new = test.drop(columns=["id"])

## Creating a Pipeline for each regressor, including PolynomialFeatures where applicable

In [16]:
# One pipeline per base regressor. All except Huber expand the inputs with
# PolynomialFeatures first (degree 2 or 3, constant column included).

# Degree-2 polynomial features feeding ordinary linear regression.
linear_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=True)),
    ('model', LinearRegression()),
])

# Degree-3 polynomial features with L2 regularisation.
ridge_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=3, include_bias=True)),
    ('model', Ridge(alpha=10.0)),
])

# Degree-3 polynomial features with a very small L1 penalty.
lasso_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=3, include_bias=True)),
    ('model', Lasso(alpha=0.0001)),
])

# Huber regression on the raw features (no polynomial expansion step).
huber_pipeline = Pipeline([
    ('model', HuberRegressor()),
])

# RANSAC and Theil-Sen both rely on random subsampling; the seed is pinned to 42
# (the value the RandomizedSearchCV calls in this notebook use) so that every model
# built from these pipelines gives the same result on each run.
ransac_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=True)),
    ('model', RANSACRegressor(random_state=42)),
])

theilsen_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=True)),
    ('model', TheilSenRegressor(fit_intercept=True, random_state=42)),
])


## Checking the best alpha and polynomial degree for Ridge and Lasso (this cell uses RandomizedSearchCV, not a grid search)

In [17]:

# Search space shared by both models: polynomial degree of the 'poly' step and
# 50 log-spaced alpha values between 1e-4 and 1e4 for the 'model' step.
param_dist = dict(
    poly__degree=[2, 3],
    model__alpha=np.logspace(-4, 4, num=50),
)

# Randomized search for the Ridge pipeline: 20 sampled settings, 5-fold CV,
# scored by negative MAE (hence the sign flip when printing).
ridge_random = RandomizedSearchCV(
    estimator=ridge_pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=5,
    random_state=42,
).fit(X, y)

print(
    f"Best Ridge alpha: {ridge_random.best_params_['model__alpha']}",
    f"Best MAE for Ridge: {-ridge_random.best_score_:.4f}",
    sep="\n",
)

# Same search configuration applied to the Lasso pipeline.
lasso_random = RandomizedSearchCV(
    estimator=lasso_pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=5,
    random_state=42,
).fit(X, y)

print(
    f"Best Lasso alpha: {lasso_random.best_params_['model__alpha']}",
    f"Best MAE for Lasso: {-lasso_random.best_score_:.4f}",
    sep="\n",
)

Best Ridge alpha: 1.7575106248547894
Best MAE for Ridge: 1.3512
Best Lasso alpha: 0.0001
Best MAE for Lasso: 1.3652


## Checking the best alpha and polynomial degree for Ridge and Lasso using RandomizedSearchCV

In [18]:
# Candidate hyper-parameters: degree of the polynomial expansion and the
# regularisation strength alpha (50 values, log-spaced from 1e-4 to 1e4).
param_dist = {
    'poly__degree': [2, 3],
    'model__alpha': np.logspace(start=-4, stop=4, num=50),
}

# Ridge: sample 20 settings, evaluate each with 5-fold CV on negative MAE.
ridge_random = RandomizedSearchCV(
    ridge_pipeline,
    param_dist,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=5,
    random_state=42,
)
ridge_random.fit(X, y)

# best_score_ is a negated MAE, so flip the sign for reporting.
print(f"Best Ridge alpha: {ridge_random.best_params_['model__alpha']}")
print(f"Best MAE for Ridge: {-ridge_random.best_score_:.4f}")

# Lasso: identical search settings.
lasso_random = RandomizedSearchCV(
    lasso_pipeline,
    param_dist,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=5,
    random_state=42,
)
lasso_random.fit(X, y)

print(f"Best Lasso alpha: {lasso_random.best_params_['model__alpha']}")
print(f"Best MAE for Lasso: {-lasso_random.best_score_:.4f}")

Best Ridge alpha: 1.7575106248547894
Best MAE for Ridge: 1.3512
Best Lasso alpha: 0.0001
Best MAE for Lasso: 1.3652


## Using Voting Regression 

In [19]:
# Stand-alone estimator instances; they are not referenced by the ensemble in this cell.
model_lr, model_huber, model_ridge = LinearRegression(), HuberRegressor(), Ridge()
model_TheilSen, model_ransac = TheilSenRegressor(), RANSACRegressor()

# Average the predictions of five of the pipelines defined earlier
# (the RANSAC pipeline is not part of the blend).
voting_regressor = VotingRegressor(
    estimators=[
        ('linear', linear_pipeline),
        ('ridge', ridge_pipeline),
        ('lasso', lasso_pipeline),
        ('huber', huber_pipeline),
        ('theilsen', theilsen_pipeline),
    ]
).fit(X, y)

# In-sample predictions and metrics on the training frame.
blend_predictions = voting_regressor.predict(X)
rmse, mae = (
    metric(y, blend_predictions)
    for metric in (root_mean_squared_error, mean_absolute_error)
)

print(f"RMSE: {rmse:.4f}", f"MAE: {mae:.4f}", sep="\n")

RMSE: 1.9617
MAE: 1.3336


## Stacking Regression with HuberRegressor as the final estimator, as it has the lowest MAE

In [20]:
# Stack the five pipelines under a HuberRegressor meta-model. The base models feed it
# 5-fold cross-validated predictions, and passthrough=True also hands it the original features.
base_estimators = [
    ('linear', linear_pipeline),
    ('ridge', ridge_pipeline),
    ('lasso', lasso_pipeline),
    ('huber', huber_pipeline),
    ('theilsen', theilsen_pipeline),
]
stacking_regressor = StackingRegressor(
    estimators=base_estimators,
    final_estimator=HuberRegressor(),
    cv=5,
    passthrough=True,
).fit(X, y)
del base_estimators  # keep the notebook namespace unchanged

# Predict on the training frame and round to the nearest whole number before scoring.
stack_predictions = np.round(stacking_regressor.predict(X))

rmse, mae = (
    metric(y, stack_predictions)
    for metric in (root_mean_squared_error, mean_absolute_error)
)

print(f"The best RMSE: {rmse:.4f}", f"The best MAE: {mae:.4f}", sep="\n")

The best RMSE: 1.9991
The best MAE: 1.2783
