In [None]:
import pandas as pd
import os
import numpy as np
import plotly.express as px
import seaborn as sns
import math
from sklearn.preprocessing import StandardScaler,OneHotEncoder,PowerTransformer,MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV,LassoCV,LinearRegression,ElasticNetCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
import warnings 

In [None]:
df=pd.read_csv('car_price.csv')
df.head()

In [None]:
df.isna().sum()

In [None]:
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows; rebinding `df` gives the same frame as the
# in-place variant without mutating the object an earlier cell displayed.
df = df.drop_duplicates()

In [None]:
df.shape

In [None]:
df.head(2)

In [None]:
df.isna().sum()

In [None]:
df.shape

In [None]:
# Keep the first number found in each `max_power` string; strings that
# contain no digits (and missing values) become NaN.
df['max_power'] = (
    df['max_power']
    .str.extract(r'(\d+\.?\d*)', expand=False)
    .astype(float)
)

In [None]:
def number_extracter(df, cols=None):
    """Convert unit-suffixed text columns (e.g. ``"23.4 kmpl"``) to floats.

    For every column in ``cols`` the first whitespace-separated token of each
    value is parsed as a number and written back into ``df`` (the frame is
    modified in place and also returned).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to convert.
    cols : iterable of str, optional
        Columns to convert. Defaults to ``['mileage', 'engine']``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the selected columns as ``float``.
    """
    if cols is None:
        cols = ['mileage', 'engine']
    for col in cols:
        if pd.api.types.is_numeric_dtype(df[col]):
            # Already numeric (e.g. the cell was re-run): the `.str` accessor
            # would raise on a float column, so leave it untouched.
            continue
        first_token = df[col].str.split().str[0]
        # Tokens that are not numbers become NaN instead of aborting the
        # whole conversion.
        df[col] = pd.to_numeric(first_token, errors='coerce').astype(float)
    return df
df=number_extracter(df)

In [None]:
df.head(3)

In [None]:
df.dtypes

In [None]:
df['name'].nunique()

In [None]:
df['torque'].str.split(' ',expand=True)

In [None]:
# Take the text before the first space of each `torque` entry and keep the
# first number found in it; entries without digits become NaN.
# NOTE(review): the unit suffix is discarded here — if the column mixes
# units, the resulting numbers are not directly comparable; verify.
df['torque'] = (
    df['torque']
    .str.split(' ', expand=True)[0]
    .str.extract(r'(\d+\.?\d*)', expand=False)
    .astype(float)
)
df.head(2)

In [None]:
df.head(2)

In [None]:
df.dtypes

In [None]:
df.isna().sum()

In [None]:
# Fill missing values in the numeric columns with a KNNImputer using its
# default settings.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split performed later in this notebook, so held-out rows
# influence the imputed values. Consider moving imputation into the
# modelling pipeline.
imputer=KNNImputer()
# `cols` is the DataFrame slice being imputed (not a list of column names).
cols=df[['mileage','engine','max_power','torque','seats']]
imputed =imputer.fit_transform(cols)
# Show the first imputed column (mileage) as a quick sanity check.
imputed[0:,0]

In [None]:
imputed.shape

In [None]:
def imputations(imputed, df, cols=None):
    """Write imputed values back into ``df`` column by column.

    Parameters
    ----------
    imputed : array-like, shape (len(df), len(cols))
        Imputed values; column ``i`` is written to ``df[cols[i]]``.
    df : pandas.DataFrame
        Frame to update. It is modified in place and also returned.
    cols : sequence of str, optional
        Target column names, in the same order as the columns of ``imputed``.
        Defaults to ``['mileage', 'engine', 'max_power', 'torque', 'seats']``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the listed columns replaced.

    Raises
    ------
    ValueError
        If ``imputed`` is not 2-D or its shape does not match
        ``(len(df), len(cols))``.
    """
    if cols is None:
        cols = ['mileage', 'engine', 'max_power', 'torque', 'seats']
    cols = list(cols)
    imputed = np.asarray(imputed)
    expected = (len(df), len(cols))
    # Validate before touching `df`: with a mismatched array the positional
    # mapping would put values into the wrong columns or fail half-way,
    # leaving the frame partially overwritten.
    if imputed.ndim != 2 or imputed.shape != expected:
        raise ValueError(
            f"imputed has shape {imputed.shape}, expected {expected} for columns {cols}"
        )
    for position, col in enumerate(cols):
        df[col] = imputed[:, position]
    return df
df=imputations(imputed,df)
df.head(2)

In [None]:
df.isna().sum()

In [None]:
# Derive the car's age (in years) from its model year.
# NOTE(review): the reference year is hard-coded; update it if the data is
# refreshed.
REFERENCE_YEAR = 2025
df['age'] = REFERENCE_YEAR - df['year']
df = df.drop(columns=['year'])
# Reorder the columns so the target (`selling_price`) comes last.
df = df[['name','age','km_driven','fuel','seller_type','transmission','owner','mileage','engine','max_power','torque','seats','selling_price']]
df.head(2)

In [None]:
df.corr(numeric_only=True)

In [None]:
df.head(2)

In [None]:
X=df.drop(columns={'selling_price'})

In [None]:
import matplotlib.pyplot as plt
# Now Check Distribution of our data 
def distributions(X):
    """Plot a density curve for every numeric column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose numeric columns are plotted, one subplot per column,
        two subplots per row.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    num_cols=X.select_dtypes(include='number')
    n_plots = num_cols.shape[1]
    if n_plots == 0:
        # Nothing numeric to draw.
        return
    # Keep the 4x2 grid for up to eight columns and grow it for wider frames,
    # where a fixed (4, 2) layout would be too small for the subplots.
    n_rows = max(4, math.ceil(n_plots / 2))
    num_cols.plot(
        kind='density',
        subplots=True,
        layout=(n_rows,2),
        figsize=(20,7.5*n_rows),
        sharex=False,
        sharey=False
    )
    # Adjust spacing before rendering: once show() has run, the figure has
    # already been drawn and a later tight_layout() cannot affect it.
    plt.tight_layout()
    plt.show()
distributions(X)

In [None]:
X=df.drop(columns={'selling_price'})

In [None]:
def outliers(X):
    """Draw a box plot for every numeric column of ``X`` to reveal outliers.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose numeric columns are plotted, one subplot per column,
        two subplots per row.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    num_cols=X.select_dtypes(include='number')
    n_plots = num_cols.shape[1]
    if n_plots == 0:
        # Nothing numeric to draw.
        return
    # Keep the 4x2 grid for up to eight columns and grow it for wider frames,
    # where a fixed (4, 2) layout would be too small for the subplots.
    n_rows = max(4, math.ceil(n_plots / 2))
    num_cols.plot(
        kind='box',
        subplots=True,
        layout=(n_rows,2),
        figsize=(20,7.5*n_rows),
        sharex=False,
        sharey=False
    )
    # Adjust spacing before rendering: once show() has run, the figure has
    # already been drawn and a later tight_layout() cannot affect it.
    plt.tight_layout()
    plt.show()
outliers(X)

In [None]:
def transformations(df):
    """Apply a Yeo-Johnson power transform to the numeric feature columns.

    The target column ``selling_price`` is excluded; the transformer is fitted
    on the frame passed in.

    Returns
    -------
    numpy.ndarray
        Transformed values, one column per numeric feature column of ``df``.
    """
    features = df.drop(columns=['selling_price'])
    numeric_features = features.select_dtypes(include='number')
    return PowerTransformer(method='yeo-johnson').fit_transform(numeric_features)
transform_arr=transformations(df)
transform_arr[0:,0]

In [None]:
def after_transformation(transform_arr,df):
    """Wrap a transformed array in a DataFrame labelled like ``df``'s numeric features.

    Column ``i`` of ``transform_arr`` becomes the ``i``-th numeric column of
    ``df`` (with ``selling_price`` excluded); the row index of ``df`` is kept.
    ``df`` itself is not modified.
    """
    numeric = df.drop(columns=['selling_price']).select_dtypes(include='number')
    relabelled = {
        label: transform_arr[:, position]
        for position, label in enumerate(numeric.columns)
    }
    return pd.DataFrame(relabelled, index=numeric.index)
trans_df=after_transformation(transform_arr,df)
trans_df.head()

In [None]:
outliers(trans_df)

In [None]:
distributions(trans_df)

In [None]:
# Let's scale our data using StandardScaler
def scaling(trans_df):
    """Standardise the numeric columns of ``trans_df`` with ``StandardScaler``.

    Returns
    -------
    numpy.ndarray
        Scaled values, one column per numeric column of ``trans_df``.
    """
    numeric_part = trans_df.select_dtypes(include='number')
    return StandardScaler().fit_transform(numeric_part)
scaled_arr=scaling(trans_df)
scaled_arr.shape

In [None]:
def after_scaling(scaled_arr,trans_df):
    """Wrap a scaled array in a DataFrame labelled like ``trans_df``'s numeric columns.

    Column ``i`` of ``scaled_arr`` becomes the ``i``-th numeric column of
    ``trans_df``; the row index is kept and ``trans_df`` is not modified.
    """
    numeric = trans_df.select_dtypes(include='number')
    relabelled = {
        label: scaled_arr[:, position]
        for position, label in enumerate(numeric.columns)
    }
    return pd.DataFrame(relabelled, index=numeric.index)
scaled_df=after_scaling(scaled_arr,trans_df)
scaled_df.head()

In [None]:
distributions(scaled_df)

In [None]:
outliers(scaled_df)

In [None]:
df.head(2)

In [None]:
# Model inputs: every column except the car `name` and the target `selling_price`.
X = df.drop(columns=['name', 'selling_price'])
X.head()

In [None]:
y=df['selling_price']
y.head()

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
X_train.head()

In [None]:
# Creating a Categorical handling Pipeline
# Names of the object-typed (categorical) feature columns.
cat_cols = list(X_train.select_dtypes(include='object').columns)
# Categories not seen during fit are encoded as all-zero rows instead of raising.
encoding = OneHotEncoder(handle_unknown='ignore')
encoding

In [None]:
# Names of the numeric feature columns handled by the numeric pipeline.
num_cols = list(X_train.select_dtypes(include='number').columns)
num_cols

In [None]:
# Creating a Numerical Values Handling Pipeline 

# Numeric branch: Yeo-Johnson power transform followed by standardisation.
# Both steps are fitted as part of the pipeline, i.e. only on the data
# passed to fit().
# NOTE(review): PowerTransformer standardises its output by default, so the
# extra StandardScaler step may be redundant — confirm.
num_handling=Pipeline(
    [
        ('transformer',PowerTransformer(method='yeo-johnson')),
        ('scaling',StandardScaler())
    ]
)
num_handling

In [None]:
# Route each column group to its preprocessing branch:
#   'cats' -> one-hot encoding of the columns listed in `cat_cols`
#   'nums' -> the `num_handling` pipeline for the columns listed in `num_cols`
# Columns in neither list are dropped (ColumnTransformer's default `remainder`).
transformation=ColumnTransformer(
    [
        ('cats',encoding,cat_cols),
        ('nums',num_handling,num_cols)
    ]
)
transformation

In [None]:
# End-to-end model: preprocessing -> polynomial feature expansion -> regressor.
# The step names ('pre', 'poly', 'reg') are referenced by the keys of
# `param_grid` (e.g. 'poly__degree', 'reg__alpha'), so keep them in sync.
full_pipeline = Pipeline([
    ('pre', transformation),                    # the ColumnTransformer built in the previous cell
    ('poly', PolynomialFeatures(include_bias=False, interaction_only=False)),
    ('reg', LinearRegression())                 # placeholder estimator; GridSearch will try alternatives
])

In [None]:
# ElasticNet is imported here because the top import cell only brings in ElasticNetCV.
from sklearn.linear_model import ElasticNet
# Search space for GridSearchCV. Each dict is an independent sub-grid; the
# 'reg' key swaps the final pipeline step, and 'poly__*' / 'reg__*' keys set
# parameters on the 'poly' and 'reg' steps of `full_pipeline`.
# Candidate count: 2 (LinearRegression) + 3*4 (Ridge) + 3*4 (ElasticNet) = 26.
param_grid = [
    # Try simple polynomial + LinearRegression
    {
        'poly__degree': [1, 2],                # degree 1 = linear, 2 = quadratic
        'reg': [LinearRegression()],
        'reg__fit_intercept': [True]           # valid for LinearRegression
    },
    # Try polynomial + Ridge (regularized) for degrees that tend to blow up features
    {
        'poly__degree': [1, 2, 3],
        'reg': [Ridge()],
        'reg__alpha': [0.01, 0.1, 1.0, 10.0], # regularization strengths
        'reg__fit_intercept': [True]
    },
    # Try polynomial + ElasticNet; only alpha is tuned, every other
    # ElasticNet parameter is left at its default.
    {
        'poly__degree': [1, 2, 3],
        'reg': [ElasticNet()],
        'reg__alpha': [0.01, 0.1, 1.0, 10.0], # regularization strengths
        'reg__fit_intercept': [True]
    }
]

In [None]:
# 5) GridSearchCV setup
# Every candidate in `param_grid` is evaluated with 5-fold cross-validation
# on the data passed to fit(). Because preprocessing lives inside
# `full_pipeline`, it is re-fitted on the training folds of each split.
grid = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',  # tune for MSE (negated, so higher is better)
    n_jobs=-1,                         # use all available cores
    verbose=2
)

In [None]:
# 6) Fit grid search
grid.fit(X_train, y_train)

# 7) Best found parameters
print("Best params:\n", grid.best_params_)
# best_score_ is the mean cross-validated *negative* MSE (closer to 0 is better).
print("Best CV (neg MSE):", grid.best_score_)

# 8) Evaluate on test set
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_train = best_model.predict(X_train)

mse_test = mean_squared_error(y_test, y_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred)
r2_train = r2_score(y_train, y_pred_train)

# Train vs. test R^2 side by side makes over-fitting visible.
print(f"Train R^2 : {r2_train:.4f}")
print(f"Test R^2 : {r2_test:.4f}")
# RMSE is in the same units as `selling_price`.
print(f"Test RMSE : {rmse_test:.4f}")

# 9) If you want to inspect coefficients (for LinearRegression or Ridge)
# Note: After preprocessing + poly, feature names are not trivial. You can still get coef_
reg_step = best_model.named_steps['reg']
coef = getattr(reg_step, "coef_", None)
if coef is None:
    # Avoid calling `.shape` on None when the chosen estimator has no coef_.
    print("Number of learned coefficients: n/a (estimator exposes no coef_)")
else:
    print("Number of learned coefficients:", coef.shape)