In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

In [4]:
df = pd.read_csv("C:/Users/murar/OneDrive/Desktop/ml_data/car data.csv")
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [6]:
df.describe(include="O")

Unnamed: 0,Car_Name,Fuel_Type,Selling_type,Transmission
count,301,301,301,301
unique,98,3,2,2
top,city,Petrol,Dealer,Manual
freq,26,239,195,261


In [8]:
t = df["Car_Name"].value_counts().reset_index()
cnt = t[t["count"] > 5].count()
cnt

Car_Name    15
count       15
dtype: int64

In [10]:
df.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Driven_kms,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.642584,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [12]:
X = df.drop("Selling_Price", axis=1)
y = df["Selling_Price"]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)
print(X_train.shape, X_test.shape)

(240, 8) (61, 8)


In [14]:
cat_cols = X_train.select_dtypes(include = 'O').columns.to_list()
num_cols = X_train.select_dtypes(exclude = "O").columns.to_list()
print(cat_cols)
print(num_cols)

['Car_Name', 'Fuel_Type', 'Selling_type', 'Transmission']
['Year', 'Present_Price', 'Driven_kms', 'Owner']


In [16]:
enc=OneHotEncoder(max_categories =5, handle_unknown = "ignore",sparse_output=False)
x_train_enc = enc.fit_transform(X_train[cat_cols])
x_train_enc = pd.DataFrame(x_train_enc, columns = enc.get_feature_names_out(cat_cols))
x_train_ = x_train_enc.join(df[num_cols])
x_train_

Unnamed: 0,Car_Name_brio,Car_Name_city,Car_Name_corolla altis,Car_Name_verna,Car_Name_infrequent_sklearn,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Selling_type_Dealer,Selling_type_Individual,Transmission_Automatic,Transmission_Manual,Year,Present_Price,Driven_kms,Owner
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,2014,5.59,27000,0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,2013,9.54,43000,0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,2017,9.85,6900,0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,2011,4.15,5200,0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2014,6.87,42450,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,2017,9.40,15141,0
236,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2016,4.43,11849,0
237,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,2015,13.60,68000,0
238,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,2013,9.40,60241,0


In [18]:
scores = cross_val_score(LinearRegression(),x_train_, y_train,cv=3, scoring='r2')
print(f"Scores without applying any scaling technique on numeric columns: {scores}")

Scores without applying any scaling technique on numeric columns: [0.54451137 0.55566226 0.42213555]


In [20]:
preprocessor = ColumnTransformer(transformers=[
                                ("cat", OneHotEncoder(max_categories=5, handle_unknown = "ignore"),cat_cols),
                                ("num", StandardScaler(), num_cols)
])

reg_pipeline = Pipeline([
                       ('preprocessor', preprocessor),
                       ('classiifer', LinearRegression())
])

In [22]:
scores = cross_val_score(reg_pipeline,X_train, y_train, cv=3, scoring='r2')

print(scores)

[0.84644364 0.85244192 0.82253585]


In [24]:
preprocessor_r = ColumnTransformer(transformers=[
                                ("cat_", OneHotEncoder(max_categories=5, handle_unknown = "ignore"),cat_cols),
                                ("num_r", RobustScaler(), num_cols)
])

reg_pipeline_r = Pipeline([
                       ('preprocessor', preprocessor_r),
                       ('classiifer', LinearRegression())
])
reg_pipeline_r

In [26]:
scores_r = cross_val_score(reg_pipeline_r,X_train, y_train, cv=3, scoring='r2')

print(scores_r)

[0.84644364 0.85244192 0.82253585]


In [28]:
preprocessor_m = ColumnTransformer(transformers=[
                                ("cat_m", OneHotEncoder(max_categories=5, handle_unknown = "ignore"),cat_cols),
                                ("num_m", MinMaxScaler(feature_range=(0,1)), num_cols)
])

reg_pipeline_m = Pipeline([
                       ('preprocessor', preprocessor_m),
                       ('classiifer', LinearRegression())
])
reg_pipeline_m

In [30]:
scores1 = cross_val_score(reg_pipeline_m,X_train, y_train, cv=3, scoring='r2')

print(scores1)

[0.84644364 0.85244192 0.82253585]
