In [38]:
import pandas as pd
import numpy as np
import os


from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [56]:
df= pd.read_csv("C:\\Users\\brohi\\Downloads\\cars(in).csv")
df.head()

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


# Data Cleaning

In [57]:
df.isnull().sum()

Unnamed: 0    0
mpg           0
cyl           0
disp          0
hp            0
drat          0
wt            0
qsec          0
vs            0
am            0
gear          0
carb          0
dtype: int64

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  32 non-null     object 
 1   mpg         32 non-null     float64
 2   cyl         32 non-null     int64  
 3   disp        32 non-null     float64
 4   hp          32 non-null     int64  
 5   drat        32 non-null     float64
 6   wt          32 non-null     float64
 7   qsec        32 non-null     float64
 8   vs          32 non-null     int64  
 9   am          32 non-null     int64  
 10  gear        32 non-null     int64  
 11  carb        32 non-null     int64  
dtypes: float64(5), int64(6), object(1)
memory usage: 3.1+ KB


In [None]:
df.rename(columns={'Unnamed: 0': 'car_model'}, inplace=True)


In [None]:

df.drop(columns=['car_model'], inplace=True)

In [60]:
df.value_counts().sum()

32

# Spliting the data into Training Testing


In [68]:
# Drop irrelevant columns (like car name)
X = df.drop(columns=['mpg'])
y = df['mpg']

In [70]:
# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Use StandardScaler

In [71]:
# Define pipeline with scaler + model
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor())
])


In [73]:
pipeline

In [72]:
pipeline.set_params(knn__n_neighbors=5)
# Fit the model
pipeline.fit(X_train, y_train)
# Make predictions
y_pred = pipeline.predict(X_test)
# Evaluate the model
r2 = r2_score(y_test, y_pred)
print(f"R^2 score: {r2:.2f}")


R^2 score: 0.78


In [74]:

# Set parameter grid for tuning
param_grid = {'knn__n_neighbors': [3, 5, 7, 9]}

In [76]:
# Use KFold cross-validation inside GridSearchCV
kf = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(pipeline, param_grid, cv=kf, scoring='r2')
grid_search.fit(X_train, y_train)

In [80]:

# Best hyperparameter
print("Best n_neighbors:", grid_search.best_params_['knn__n_neighbors'])

Best n_neighbors: 9


In [81]:
# Evaluate on test set
y_pred = grid_search.predict(X_test)
print("Test R2:", r2_score(y_test, y_pred))
print("Test MAE:", mean_absolute_error(y_test, y_pred))

Test R2: 0.7330405864465337
Test MAE: 2.4650793650793665


In [79]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression

# Sample dataset
X, y = make_regression(n_samples=100, n_features=3, noise=10)

# Model
model = LinearRegression()

# Define 5-fold cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate model with K-Fold
scores = cross_val_score(model, X, y, cv=kf, scoring='r2')  # or 'neg_mean_squared_error'

print("R² scores:", scores)
print("Average R²:", scores.mean())


# this  model is performing extremely well across all the folds in K-Fold Cross Validation.



R² scores: [0.99521153 0.99174746 0.9907682  0.99651109 0.99024687]
Average R²: 0.9928970302105353


# Hyperparameter Tunning 

In [82]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import pandas as pd

# Optional: handle categorical features if needed
# X = pd.get_dummies(X, drop_first=True)


In [83]:
# Create pipeline (scaling + model)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', Ridge())
])

# Define parameter grid
param_grid = {
    'model__alpha': [0.01, 0.1, 1, 10, 100]
}

# K-Fold setup
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV setup
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=cv,
    n_jobs=-1
)

In [84]:
# Fit GridSearchCV
grid_search.fit(X, y)

# Print best results
print("Best parameters:", grid_search.best_params_)
print("Best R² score:", grid_search.best_score_)

Best parameters: {'model__alpha': 0.1}
Best R² score: 0.992905183294378


alpha = 1 gives the best balance between fitting the data well and keeping the model simple (not overfitting).

aplha = 0.1
 means data might be such that allowing more flexibility (less penalty on coefficients) helps the model generalize better.



# Testing with alplha 

In [85]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

best_alpha = grid_search.best_params_['model__alpha']

pipeline_final = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=best_alpha))
])

pipeline_final.fit(X, y)  # Train on full data


In [86]:
ridge_model = pipeline_final.named_steps['ridge']
print("Coefficients:", ridge_model.coef_)


Coefficients: [ 96.08865366   5.22073797 101.36491996]


# KNN

In [87]:
# KNN Regression
model = make_pipeline(StandardScaler(), KNeighborsRegressor())
model.fit(X_train, y_train)

print("R² score:", r2_score(y_test, model.predict(X_test)))


R² score: 0.778530435581507


# linear regresssion with scaling 

In [88]:

model = make_pipeline(StandardScaler(), LinearRegression())
model.fit(X_train, y_train)

print("R² score:", r2_score(y_test, model.predict(X_test)))


R² score: 0.7466453084791007


In [89]:
## KNN model is more accurate than Linear Regression model in this case.

In [55]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# y_test: actual values
# predictions: predicted values from your model

print("R² score:", r2_score(y_test, predictions))
print("MAE:", mean_absolute_error(y_test, predictions))
print("MSE:", mean_squared_error(y_test, predictions))

R² score: 0.8235775907615523
MAE: 2835.934570056498
MSE: 13777572.347452844


In [43]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())

params = {'kneighborsregressor__n_neighbors': [3,5,7]}

grid = GridSearchCV(pipe, params, cv=5)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)


{'kneighborsregressor__n_neighbors': 7}
0.822789757894542
