# 1. Importing Libraries

In [61]:
import numpy as np

import pandas as pd

from sklearn.model_selection import (
	train_test_split,
	cross_val_score
)

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import (
	StandardScaler,
	OneHotEncoder
)

from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import (
	r2_score,
	make_scorer
)

import joblib

# 2. Getting the Data

In [4]:
df = pd.read_csv(
	"car-details.csv",
	usecols=[
		"company",
		"year",
		"owner",
		"fuel",
		"km_driven",
		"mileage_mpg",
		"engine_cc",
		"seats",
		"selling_price"
	]
)

df

Unnamed: 0,company,year,owner,fuel,km_driven,mileage_mpg,engine_cc,seats,selling_price
0,Maruti,2014,First,Diesel,145500,55.00,1248.0,5.0,450000
1,Skoda,2014,Second,Diesel,120000,49.70,1498.0,5.0,370000
2,Honda,2006,Third,Petrol,140000,41.60,1497.0,5.0,158000
3,Hyundai,2010,First,Diesel,127000,54.06,1396.0,5.0,225000
4,Maruti,2007,First,Petrol,120000,37.84,1298.0,5.0,130000
...,...,...,...,...,...,...,...,...,...
6921,Maruti,2013,Second,Petrol,50000,44.40,998.0,5.0,260000
6922,Hyundai,2014,Second,Diesel,80000,52.97,1396.0,5.0,475000
6923,Hyundai,2013,First,Petrol,110000,43.47,1197.0,5.0,320000
6924,Hyundai,2007,Fourth & Above,Diesel,119000,39.47,1493.0,5.0,135000


# 3. Analyzing the Data

In [5]:
df.dtypes

company           object
year               int64
owner             object
fuel              object
km_driven          int64
mileage_mpg      float64
engine_cc        float64
seats            float64
selling_price      int64
dtype: object

In [10]:
def missing_info(df_):
	na_cols = [col for col in df_.columns if df_[col].isna().any()]
	na_counts = [df_[col].isna().sum() for col in na_cols]
	na_pcts = [df_[col].isna().mean() for col in na_cols]
	
	return (
		pd
		.DataFrame(data={
			"column": na_cols,
			"count": na_counts,
			"pct": na_pcts
		})
		.sort_values(by="count", ascending=False)
		.set_index("column")
	)

In [9]:
missing_info(df)

Unnamed: 0_level_0,count,pct
column,Unnamed: 1_level_1,Unnamed: 2_level_1
mileage_mpg,208,0.030032
engine_cc,208,0.030032
seats,208,0.030032


# 4. Splitting the Data

In [12]:
X = df.drop(columns="selling_price")
y = df.selling_price.copy()

X_train, X_test, y_train, y_test = train_test_split(
	X,
	y,
	test_size=0.2,
	shuffle=True,
	random_state=42
)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(5540, 8) (5540,)
(1386, 8) (1386,)


In [15]:
missing_info(X_train)

Unnamed: 0_level_0,count,pct
column,Unnamed: 1_level_1,Unnamed: 2_level_1
mileage_mpg,172,0.031047
engine_cc,172,0.031047
seats,172,0.031047


# 5. Preprocessing the Data

In [16]:
X_train.head()

Unnamed: 0,company,year,owner,fuel,km_driven,mileage_mpg,engine_cc,seats
4855,Ford,2018,First,Diesel,25000,54.06,1498.0,5.0
3143,Maruti,2010,Second,Petrol,170000,,,
4275,Hyundai,2011,Second,Diesel,75500,54.06,1396.0,5.0
2288,Maruti,2009,Second,Diesel,138000,45.34,1248.0,5.0
3914,Hyundai,2003,Second,Diesel,200000,30.78,1493.0,5.0


In [24]:
num_cols = X_train.select_dtypes(exclude="O").columns.to_list()
cat_cols = np.setdiff1d(X_train.columns, num_cols).tolist()

In [30]:
num_pipe = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="median")),
	("scaler", StandardScaler())
])

cat_pipe = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="most_frequent")),
	("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
	("num", num_pipe, num_cols),
	("cat", cat_pipe, cat_cols)
])

# 6. Training the Model

In [31]:
reg = RandomForestRegressor(random_state=42)

model = Pipeline(steps=[
	("pre", preprocessor),
	("reg", reg)
])

model.fit(X_train, y_train)

# 7. Evaluate the Model

In [33]:
y_test_pred = model.predict(X_test)

r2_score(y_test, y_test_pred)

0.8220883761156546

### 7.1 Adjusted R2

In [57]:
def adj_r2(y_test, y_pred, **kwargs):
	n, p = kwargs["data"].shape
	r2 = r2_score(y_test, y_pred)
	return 1 - ((1 - r2) * (n - 1) / (n - p - 1))

In [58]:
adj_r2(y_test, y_test_pred, data=X_train)

0.8218310459780529

In [59]:
adj_r2_score = make_scorer(
	adj_r2,
	response_method="predict",
	greater_is_better=True,
	needs_proba=False,
	data=X_train
)

### 7.2 Cross-validation Score

In [60]:
cv_scores = cross_val_score(
	model,
	X_train,
	y_train,
	scoring=adj_r2_score,
	cv=3,
	n_jobs=-1
)

np.mean(cv_scores)

0.847264077129562

# 8. Save the Model

In [62]:
joblib.dump(model, "model.joblib")

['model.joblib']