In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("C:\\Users\\user\\Downloads\\CarPrice_Assignment.csv")
df

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


# Preprocessing

In [3]:
# Drop 'car_ID' as it is not useful
df.drop(['car_ID'], axis=1, inplace=True)

In [4]:
df['CarBrand'] = df['CarName'].apply(lambda x: x.split(" ")[0])
df.drop(['CarName'], axis=1, inplace=True)

In [5]:
categorical_cols = ['fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'enginetype', 'cylindernumber', 'fuelsystem', 'CarBrand']
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
numeric_cols.remove('price')  # Remove the target column 'price'

In [6]:
# Preprocessing: Handling numeric and categorical separately
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


In [7]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)])


#  Split data into train and test sets

In [8]:
X = df.drop('price', axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#  Define models

In [9]:
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'Support Vector Regressor': SVR()
}

# Train and evaluate models

In [13]:

model_performance = {} 

for model_name, model in models.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', model)])
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    model_performance[model_name] = {
        'R-squared': r2,
        'MSE': mse,
        'MAE': mae
    }

performance_df = pd.DataFrame(model_performance).T
print(performance_df)


                                R-squared           MSE           MAE
Linear Regression           -7.422801e+21  5.859857e+29  1.657065e+14
Decision Tree Regressor      8.962240e-01  8.192494e+06  1.923679e+03
Random Forest Regressor      9.576212e-01  3.345553e+06  1.302010e+03
Gradient Boosting Regressor  9.267908e-01  5.779426e+06  1.695765e+03
Support Vector Regressor    -9.978948e-02  8.682180e+07  5.695278e+03


# Feature Importance  analysis

In [14]:
best_model = RandomForestRegressor(random_state=42)
pipeline_rf = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', best_model)])
pipeline_rf.fit(X_train, y_train)

In [15]:
importances = pipeline_rf.named_steps['regressor'].feature_importances_
feature_names = numeric_cols + list(pipeline_rf.named_steps['preprocessor'].transformers_[1][1].named_steps['onehot'].get_feature_names_out())

In [16]:
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)
print(feature_importance_df.head(10))


         Feature  Importance
6     enginesize    0.547857
5     curbweight    0.295958
13    highwaympg    0.045167
10    horsepower    0.032016
3       carwidth    0.013500
54  CarBrand_bmw    0.007762
2      carlength    0.007194
1      wheelbase    0.006675
12       citympg    0.005862
11       peakrpm    0.005572


# Hyperparameter Tuning

In [17]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(pipeline_rf, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)


In [18]:
print("Best parameters:", grid_search.best_params_)
y_pred_best = grid_search.best_estimator_.predict(X_test)
print("Best model R2:", r2_score(y_test, y_pred_best))

Best parameters: {'regressor__max_depth': 10, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 200}
Best model R2: 0.959525249866283
