In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
# Read the car dataset from the CSV under ../data and preview its first rows.
DATA_PATH = '../data/cleaned_car_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Car Brand,Model,Year,Mileage,Fuel Type,Engine Size,Transmission,Body Type,Color,Owner History,Price,Age
0,nissan,model d,2006,244586,diesel,1.1,automatic,coupe,red,third owner,6501.73,17
1,honda,model d,2006,89556,electric,4.4,automatic,coupe,gray,second owner,15860.51,17
2,ford,model e,2007,258273,hybrid,1.4,manual,suv,black,first owner,8136.09,16
3,kia,model e,2023,217592,electric,3.7,automatic,coupe,red,second owner,18556.4,0
4,chevrolet,model c,2002,256919,hybrid,2.8,manual,suv,blue,third owner,7486.0,21


In [3]:
# Count missing values per column (isna is the same method as isnull).
df.isna().sum()

Car Brand        0
Model            0
Year             0
Mileage          0
Fuel Type        0
Engine Size      0
Transmission     0
Body Type        0
Color            0
Owner History    0
Price            0
Age              0
dtype: int64

In [4]:
# Show each column's dtype; this is what separates numeric columns from
# the object-typed (string) ones when the feature lists are built.
df.dtypes

Car Brand         object
Model             object
Year               int64
Mileage            int64
Fuel Type         object
Engine Size      float64
Transmission      object
Body Type         object
Color             object
Owner History     object
Price            float64
Age                int64
dtype: object

In [5]:
# Separate the regression target from the predictor columns.
target_col = 'Price'
y = df[target_col]
X = df.drop(columns=[target_col])

In [7]:
# Categorical predictors, listed explicitly; these are the columns that get
# one-hot encoded.
cat_features = [
    'Car Brand',
    'Model',
    'Fuel Type',
    'Transmission',
    'Body Type',
    'Color',
    'Owner History',
]

# Numeric predictors: every column of X whose dtype is not `object`.
num_features = X.select_dtypes(exclude=['object']).columns

In [8]:
# Preprocessing: one-hot encode the categorical columns and standardize the
# numeric ones, producing the matrix `X_preprocessed` for all rows of X.
#
# The encoder categories and the scaler mean/std are learned from the training
# rows ONLY. Fitting on every row (as before) lets statistics of the held-out
# rows leak into the features and makes the test score optimistic.
#
# train_test_split without stratification picks rows purely from the number of
# samples and the seed, so calling it here on X with the same test_size and
# random_state as the later split of (X_preprocessed, y) selects exactly the
# same training rows. Keep these two values in sync with that split.
X_fit_rows, _ = train_test_split(X, test_size=0.2, random_state=42)

preprocesser = ColumnTransformer([
    # handle_unknown='ignore': a category that appears only in held-out rows
    # is encoded as all zeros instead of raising during transform.
    ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'), cat_features),
    # NOTE: this step is labelled 'MinMaxScaler' but it applies a
    # StandardScaler; the label is kept so lookups by step name still work.
    ('MinMaxScaler', StandardScaler(), num_features)
])
preprocesser.fit(X_fit_rows)

# Transform every row with the train-fitted preprocessor.
X_preprocessed = preprocesser.transform(X)
X_preprocessed.shape

(2000, 39)

In [11]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed,
    y,
    test_size=0.2,
    random_state=42,
)

# Accumulates one metrics record per fitted model.
scores = list()

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

# Baseline model: ordinary least squares on the preprocessed training split.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predict on both splits so train and test fit can be compared.
y_pred_train = linear_model.predict(X_train)
y_pred_test = linear_model.predict(X_test)

# Test-set error (MSE and its square root) and R^2 on both splits.
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

# 5-fold cross-validated R^2, computed on the training split only.
cv = cross_val_score(linear_model, X_train, y_train, cv=5, scoring='r2')
cv_mean = cv.mean()

print("RMSE:", rmse)
print("r2_score_train:", r2_train)
print("r2_score_test:", r2_test)
print("Cross validation score:", cv_mean)

# Remove any record left by an earlier run of this cell so that re-running it
# does not append a duplicate 'Linear Regression' entry. The list is updated
# in place so other references to `scores` stay valid.
scores[:] = [entry for entry in scores if 'Linear Regression' not in entry]
scores.append({'Linear Regression': {'R2_Score_Train': r2_train, 'R2_Score_Test': r2_test, 'MSE': mse, 'Cross validation score': cv_mean}})

RMSE: 1394.8697752480844
r2_score_train: 0.9203248984204583
r2_socre_test: 0.8934441910972903
Cross validation score: 0.9152470700075016
