# <center>Model Building</center>

In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from joblib import dump
from sklearn.model_selection import GridSearchCV,cross_val_predict
from sklearn.linear_model import Ridge,Lasso,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [3]:
df= pd.read_csv("Datasets/final_dataset.csv")

In [4]:
df.head()

Unnamed: 0,brand,model,year,km_driven,fuel_type,transmission_type,mileage,selling_price
0,maruti,wagon r,2010,72000,cng,manual,26.6,175000
1,hyundai,creta,2015,41000,diesel,manual,19.67,1250000
2,honda,jazz,2011,46000,petrol,manual,18.2,450000
3,maruti,ertiga,2012,87000,diesel,manual,20.77,600000
4,audi,a4,2013,40670,diesel,automatic,15.2,1773999


In [5]:
X = df.drop('selling_price', axis=1)
y = df['selling_price']

In [6]:
categorical = ['brand', 'model', 'fuel_type', 'transmission_type']
numerical = ['year', 'km_driven', 'mileage']

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', StandardScaler(), numerical)
])

## Linear Regression Model

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
linear_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
linear_pipe.fit(X_train, y_train)

In [12]:
y_pred = linear_pipe.predict(X_test)

print("üìä Linear Regression Performance:")
# R¬≤ score
r2_scores = cross_val_score(linear_pipe, X, y, cv=5, scoring='r2')
print(f"Average R¬≤ Score: {r2_scores.mean():.4f}")
print(f"All R¬≤ Scores: {r2_scores}")
print(f"RMSE    : {np.sqrt(mean_squared_error(y_test, y_pred)):,.2f}")
print(f"MAE     : {mean_absolute_error(y_test, y_pred):,.2f}")

üìä Linear Regression Performance:
Average R¬≤ Score: 0.6943
All R¬≤ Scores: [0.72475708 0.74447556 0.52470959 0.67601423 0.80158978]
RMSE    : 790,586.21
MAE     : 226,679.27


In [13]:
dump(linear_pipe, "models/linear_model.pkl")

['models/linear_model.pkl']

## Ridge Regression

In [14]:
ridge_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', Ridge())
])

In [15]:
param_grid = {
    'model__alpha': [0.01, 0.1, 1, 10, 50, 100]
}

grid_ridge = GridSearchCV(ridge_pipe, param_grid, cv=5, scoring='r2')
grid_ridge.fit(X, y)

In [16]:
print("üîç Best Alpha:", grid_ridge.best_params_['model__alpha'])
print("üìä Best R¬≤ Score (CV):", grid_ridge.best_score_)

üîç Best Alpha: 0.1
üìä Best R¬≤ Score (CV): 0.6947767158542815


In [17]:
# Predict with CV
y_pred = cross_val_predict(grid_ridge.best_estimator_, X, y, cv=5)

In [18]:
# Metrics
r2 = r2_score(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))
mae = mean_absolute_error(y, y_pred)

In [19]:
print(f"‚úÖ Final Ridge Regression Performance:")
print(f"R¬≤ Score: {r2:.4f}")
print(f"RMSE    : ‚Çπ{rmse:,.2f}")
print(f"MAE     : ‚Çπ{mae:,.2f}")

‚úÖ Final Ridge Regression Performance:
R¬≤ Score: 0.6962
RMSE    : ‚Çπ532,214.91
MAE     : ‚Çπ225,817.48


In [20]:
dump(grid_ridge, "models/ridge_model.pkl")

['models/ridge_model.pkl']

## Lasso Regression

In [21]:
lasso_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', Lasso(max_iter=20000,alpha=10))
])

In [22]:
lasso_pipe.fit(X_train,y_train)

  model = cd_fast.sparse_enet_coordinate_descent(


In [23]:
y_pred = lasso_pipe.predict(X_test)

In [24]:
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

In [25]:
print("‚úÖ Final Lasso Regression Performance:")
print(f"R¬≤ Score: {r2:.4f}")
print(f"RMSE    : ‚Çπ{rmse:,.2f}")
print(f"MAE     : ‚Çπ{mae:,.2f}")

‚úÖ Final Lasso Regression Performance:
R¬≤ Score: 0.4994
RMSE    : ‚Çπ809,272.45
MAE     : ‚Çπ227,337.11


In [26]:
dump(lasso_pipe, "models/lasso_model.pkl")

['models/lasso_model.pkl']

## ElasticNet Model

In [27]:
elasticnet_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ElasticNet(alpha=1.0, l1_ratio=0.5, max_iter=10000, random_state=42))
])

In [28]:
param_grid = {
    'model__alpha': [0.01, 0.1, 1, 10, 100],
    'model__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

In [29]:
grid_elastic = GridSearchCV(elasticnet_pipe, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_elastic.fit(X_train, y_train)

In [30]:
best_elastic_model = grid_elastic.best_estimator_
y_pred = best_elastic_model.predict(X_test)

In [31]:
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("üìä ElasticNet Regression Performance:")
print(f"R¬≤ Score: {r2:.4f}")
print(f"RMSE    : ‚Çπ{rmse:,.2f}")
print(f"MAE     : ‚Çπ{mae:,.2f}")

üìä ElasticNet Regression Performance:
R¬≤ Score: 0.5014
RMSE    : ‚Çπ807,633.97
MAE     : ‚Çπ244,101.88


In [32]:
dump(best_elastic_model, "models/elasticnet_model.pkl")

['models/elasticnet_model.pkl']

## Decsicion Tree Regressor

In [33]:
dt_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(random_state=42))
])

In [34]:
param_grid = {
    'model__max_depth': [5, 10, 15, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

In [35]:
grid_dt = GridSearchCV(dt_pipe, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_dt.fit(X_train, y_train)

In [36]:
# Best model
best_dt = grid_dt.best_estimator_
y_pred = best_dt.predict(X_test)

In [37]:
# Run 5-fold cross-validation
scores = cross_val_score(best_dt, X, y, cv=5, scoring='r2')

In [38]:
scores

array([0.67447698, 0.71116399, 0.55012339, 0.75665931, 0.85845171])

In [39]:
r2 = scores.mean()
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("üìä Decision Tree Regression Performance:")
print(f"R¬≤ Score: {r2:.4f}")
print(f"RMSE    : ‚Çπ{rmse:,.2f}")
print(f"MAE     : ‚Çπ{mae:,.2f}")

üìä Decision Tree Regression Performance:
R¬≤ Score: 0.7102
RMSE    : ‚Çπ724,817.79
MAE     : ‚Çπ184,818.99


In [40]:
dump(grid_dt, "models/dt_model.pkl")

['models/dt_model.pkl']

## Random Forest Regression

In [41]:
rf_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

In [42]:
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2]
}


In [44]:
grid_rf = GridSearchCV(
    rf_pipe,
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)
grid_rf.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [45]:
best_rf = grid_rf.best_estimator_
y_pred = best_rf.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("üìä Random Forest Regressor Performance:")
print(f"R¬≤ Score: {r2:.4f}")
print(f"RMSE    : ‚Çπ{rmse:,.2f}")
print(f"MAE     : ‚Çπ{mae:,.2f}")

üìä Random Forest Regressor Performance:
R¬≤ Score: 0.6506
RMSE    : ‚Çπ676,037.42
MAE     : ‚Çπ152,040.56


In [46]:
dump(best_rf, "models/rf_model.pkl", compress=3)

['models/rf_model.pkl']

## XGBoost Regressor

In [47]:
xgb_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1))
])

In [48]:
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__subsample': [0.8, 1.0]
}

In [49]:
grid_xgb = GridSearchCV(
    xgb_pipe,
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)
grid_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [50]:
best_xgb = grid_xgb.best_estimator_
y_pred = best_xgb.predict(X_test)

In [51]:
cv_scores = cross_val_score(best_xgb, X, y, cv=5, scoring='r2')

print("üìä Cross-Validation R¬≤ Scores:", cv_scores)
print(f"üìà Average R¬≤ Score          : {cv_scores.mean():.4f}")

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print(f"RMSE    : ‚Çπ{rmse:,.2f}")
print(f"MAE     : ‚Çπ{mae:,.2f}")

üìä Cross-Validation R¬≤ Scores: [0.79464155 0.82650534 0.69621055 0.8174302  0.91035402]
üìà Average R¬≤ Score          : 0.8090
RMSE    : ‚Çπ664,623.90
MAE     : ‚Çπ156,211.75


In [52]:
dump(best_xgb, "models/xgb_model.pkl")

['models/xgb_model.pkl']

## Gradient Boosting Regressor

In [54]:
gbr_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42))
])

In [55]:
param_grid = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.05, 0.1],
    'model__max_depth': [3, 5],
    'model__subsample': [0.8, 1.0]
}

In [56]:
grid_gbr = GridSearchCV(
    gbr_pipe,
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)
grid_gbr.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [57]:
best_gbr = grid_gbr.best_estimator_
y_pred = best_gbr.predict(X_test)

In [58]:
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("‚úÖ Final Gradient Boosting Regressor Performance:")
print("Best Params:", grid_gbr.best_params_)
print(f"R¬≤ Score   : {r2:.4f}")
print(f"RMSE       : ‚Çπ{rmse:,.2f}")
print(f"MAE        : ‚Çπ{mae:,.2f}")

‚úÖ Final Gradient Boosting Regressor Performance:
Best Params: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__n_estimators': 200, 'model__subsample': 0.8}
R¬≤ Score   : 0.6208
RMSE       : ‚Çπ704,307.07
MAE        : ‚Çπ165,249.81


In [59]:
cv_scores = cross_val_score(best_gbr, X, y, cv=5, scoring='r2')

print("\nüìä Cross-Validation R¬≤ Scores:", cv_scores)
print(f"üìà Average R¬≤ Score          : {cv_scores.mean():.4f}")


üìä Cross-Validation R¬≤ Scores: [0.7803354  0.82567048 0.65176855 0.80282803 0.90991653]
üìà Average R¬≤ Score          : 0.7941


In [60]:
dump(best_gbr, "models/gbr_model.pkl")

['models/gbr_model.pkl']

## KNN Regressor

In [61]:
knn_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', KNeighborsRegressor())
])

In [62]:
param_grid = {
    'model__n_neighbors': [3, 5, 7, 9]
}

In [63]:
grid_knn = GridSearchCV(knn_pipe, param_grid, cv=5, scoring='r2')
grid_knn.fit(X_train, y_train)

In [64]:
best_knn = grid_knn.best_estimator_
print("Best KNN Params:", grid_knn.best_params_)

Best KNN Params: {'model__n_neighbors': 5}


In [65]:
y_pred = best_knn.predict(X_test)

cv_scores_knn = cross_val_score(best_knn, X, y, cv=5, scoring='r2')
print("üìä KNN Regressor Cross-Validation R¬≤ Scores:", cv_scores_knn)
print(f"üìà Average R¬≤ Score for KNN: {cv_scores_knn.mean():.4f}")
print(f"RMSE    : ‚Çπ{np.sqrt(mean_squared_error(y_test, y_pred)):,.2f}")
print(f"MAE     : ‚Çπ{mean_absolute_error(y_test, y_pred):,.2f}")

üìä KNN Regressor Cross-Validation R¬≤ Scores: [0.7381041  0.821957   0.68169556 0.82014482 0.89939472]
üìà Average R¬≤ Score for KNN: 0.7923
RMSE    : ‚Çπ676,006.73
MAE     : ‚Çπ149,499.68


In [66]:
dump(best_knn, "models/knn_model.pkl")

['models/knn_model.pkl']