## 1. Import Libraries

In [1]:
import sys
import pandas as pd
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

## 2. Confirm Python Executable

In [2]:
# Show the path of the interpreter backing this kernel, to confirm which
# environment the notebook is running in.
sys.executable

'C:\\Users\\Misbah\\Anaconda3\\envs\\demo\\python.exe'

## 3. Connect to MongoDB

In [3]:
# MongoClient connects lazily: constructing it never raises ConnectionFailure,
# even when no server is listening. Send an explicit `ping` so an unreachable
# server is detected in this cell instead of surfacing on the first query later.
try:
	# A short server-selection timeout keeps a failed connection from hanging
	# for the driver's default of 30 seconds.
	client = MongoClient("mongodb://localhost:27017/", serverSelectionTimeoutMS=5000)
	client.admin.command("ping")
except ConnectionFailure as e:
	print(f"> Couldn't connect to MongoDB: {e}")
	# Re-raise: every later cell needs `db` / `dataset`, so continuing would
	# only fail further down with a less helpful NameError.
	raise
else:
	print("> Successfully connected to MongoDB!")
	db = client["cars"]
	dataset = db["car_data"]

> Successfully connected to MongoDB!


## 4. Read the Data from MongoDB

In [4]:
# Fetch every document in the collection and build a DataFrame from them;
# each document becomes one row.
df = pd.DataFrame(list(dataset.find()))
df.head()

Unnamed: 0,_id,name,company,model,edition,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,66bc52f30ea15fbe31116802,Maruti Swift Dzire VDI,Maruti,Swift,Dzire VDI,2014,First,Diesel,Individual,Manual,145500,55.0,1248.0,74.0,190.0,5.0,450000
1,66bc52f30ea15fbe31116803,Skoda Rapid 1.5 TDI Ambition,Skoda,Rapid,1.5 TDI Ambition,2014,Second,Diesel,Individual,Manual,120000,49.7,1498.0,103.52,250.0,5.0,370000
2,66bc52f30ea15fbe31116804,Honda City 2017-2020 EXi,Honda,City,2017-2020 EXi,2006,Third,Petrol,Individual,Manual,140000,41.6,1497.0,78.0,124.544455,5.0,158000
3,66bc52f30ea15fbe31116805,Hyundai i20 Sportz Diesel,Hyundai,i20,Sportz Diesel,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.0,219.66896,5.0,225000
4,66bc52f30ea15fbe31116806,Maruti Swift VXI BSIII,Maruti,Swift,VXI BSIII,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.2,112.776475,5.0,130000


## 5. Summarize the Data

In [5]:
# Dimensions of the loaded frame as (rows, columns).

df.shape

(6926, 17)

In [6]:
# dtype pandas inferred for each column of the loaded frame

df.dtypes

_id               object
name              object
company           object
model             object
edition           object
year               int64
owner             object
fuel              object
seller_type       object
transmission      object
km_driven          int64
mileage_mpg      float64
engine_cc        float64
max_power_bhp    float64
torque_nm        float64
seats            float64
selling_price      int64
dtype: object

In [7]:
# Per-column overview: non-null counts, dtypes, and overall memory usage.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6926 entries, 0 to 6925
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   _id            6926 non-null   object 
 1   name           6926 non-null   object 
 2   company        6926 non-null   object 
 3   model          6926 non-null   object 
 4   edition        6926 non-null   object 
 5   year           6926 non-null   int64  
 6   owner          6926 non-null   object 
 7   fuel           6926 non-null   object 
 8   seller_type    6926 non-null   object 
 9   transmission   6926 non-null   object 
 10  km_driven      6926 non-null   int64  
 11  mileage_mpg    6718 non-null   float64
 12  engine_cc      6718 non-null   float64
 13  max_power_bhp  6717 non-null   float64
 14  torque_nm      6717 non-null   float64
 15  seats          6718 non-null   float64
 16  selling_price  6926 non-null   int64  
dtypes: float64(5), int64(3), object(9)
memory usage: 920

In [8]:
# Missing values per column, as an absolute count and as a percentage of all
# rows. Only columns with at least one missing value are kept, worst first.

na_counts = df.isna().sum()
na_pct = na_counts.div(df.shape[0]).mul(100).round(2)

na_df = pd.concat([na_counts, na_pct], axis=1)
na_df = na_df.set_axis(["count", "pct"], axis=1)
na_df = na_df.query("count > 0")
na_df = na_df.sort_values(by="count", ascending=False)

na_df

Unnamed: 0,count,pct
max_power_bhp,209,3.02
torque_nm,209,3.02
mileage_mpg,208,3.0
engine_cc,208,3.0
seats,208,3.0


In [9]:
# duplicate values
# `_id` is MongoDB's unique per-document key, so comparing whole rows while it
# is present can never report a duplicate. Compare on the remaining columns.
# `errors="ignore"` keeps the cell re-runnable after `_id` has been dropped.

df.drop(columns="_id", errors="ignore").duplicated().sum()

0

In [10]:
# drop irrelevant columns
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.

cols = ["_id", "name", "edition"]
df = df.drop(columns=cols, errors="ignore")

df.head()

Unnamed: 0,company,model,year,owner,fuel,seller_type,transmission,km_driven,mileage_mpg,engine_cc,max_power_bhp,torque_nm,seats,selling_price
0,Maruti,Swift,2014,First,Diesel,Individual,Manual,145500,55.0,1248.0,74.0,190.0,5.0,450000
1,Skoda,Rapid,2014,Second,Diesel,Individual,Manual,120000,49.7,1498.0,103.52,250.0,5.0,370000
2,Honda,City,2006,Third,Petrol,Individual,Manual,140000,41.6,1497.0,78.0,124.544455,5.0,158000
3,Hyundai,i20,2010,First,Diesel,Individual,Manual,127000,54.06,1396.0,90.0,219.66896,5.0,225000
4,Maruti,Swift,2007,First,Petrol,Individual,Manual,120000,37.84,1298.0,88.2,112.776475,5.0,130000


## 6. Split the Data

In [13]:
# Separate the predictors from the target (`selling_price`).
# `model` is cast to str so every value in that column is a string.
X = df.drop(columns="selling_price")
X["model"] = df["model"].astype(str)

# Independent copy of the target so later changes to `df` cannot affect it.
y = df["selling_price"].copy()

print(X.shape, y.shape)

(6926, 13) (6926,)


In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
	X,
	y,
	test_size=0.2,
	random_state=42,
)

# Report the train shapes first, then the test shapes.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(5540, 13) (5540,)
(1386, 13) (1386,)


## 7. Preprocessor

In [15]:
# Numeric features: fill gaps with the column median, then standardise.
num_cols = X_train.select_dtypes(include="number").columns

num_pipe = Pipeline(
	steps=[
		("imputer", SimpleImputer(strategy="median")),
		("scaler", StandardScaler()),
	]
)

In [16]:
# Categorical features are all non-numeric columns, kept in their original
# order: fill gaps with the most frequent value, then one-hot encode.
# Categories unseen during fit are encoded as all zeros rather than raising.
cat_cols = X_train.select_dtypes(exclude="number").columns.tolist()

cat_pipe = Pipeline(
	steps=[
		("imputer", SimpleImputer(strategy="most_frequent")),
		("encoder", OneHotEncoder(handle_unknown="ignore")),
	]
)

In [17]:
# Route each column group through its own pipeline. Columns not listed in
# either group are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
	transformers=[
		("num", num_pipe, num_cols),
		("cat", cat_pipe, cat_cols),
	]
)

preprocessor

## 8. Train Models

In [18]:
# Candidate regressors as (display name, estimator) pairs.
# The tree ensembles get a fixed seed so their scores are reproducible on
# Restart & Run All; without it the random forest's RMSE changes every run.
algos = (
	("Linear Regression", LinearRegression()),
	("Ridge", Ridge()),
	("Lasso", Lasso()),
	("SVM", SVR()),
	("Random Forest", RandomForestRegressor(n_estimators=20, max_depth=5, random_state=42)),
	("XGBOOST", XGBRegressor(n_estimators=20, max_depth=5, random_state=42))
)

In [19]:
# Collection that receives one document per trained model. Looked up once,
# since it does not change between iterations.
model_results = db["model_results"]

for alg, reg in algos:
	# setup the regressor with preprocessor
	model = Pipeline([
		("pre", preprocessor),
		("alg", reg)
	])

	# train the model
	model.fit(X_train, y_train)
	print(f"> Trained {alg}")

	# evaluate the model
	# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so
	# take the square root of the MSE explicitly; this works on every version.
	# The float() cast stores a plain Python float rather than a NumPy scalar.
	y_pred = model.predict(X_test)
	rmse = float(mean_squared_error(y_test, y_pred) ** 0.5)
	print(f"> RMSE: {rmse:,.2f}")

	# insert data into MongoDB
	# NOTE: insert_one appends, so re-running this cell adds another set of
	# result documents rather than replacing the earlier ones.
	result = {"model": alg, "RMSE": rmse}
	model_results.insert_one(result)
	print("> Inserted data into MongoDB")

	print()

> Trained Linear Regression
> RMSE: 228,102.70
> Inserted data into MongoDB

> Trained Ridge
> RMSE: 223,726.20
> Inserted data into MongoDB



  model = cd_fast.sparse_enet_coordinate_descent(


> Trained Lasso
> RMSE: 240,498.66
> Inserted data into MongoDB

> Trained SVM
> RMSE: 478,788.42
> Inserted data into MongoDB

> Trained Random Forest
> RMSE: 171,400.43
> Inserted data into MongoDB

> Trained XGBOOST
> RMSE: 142,107.63
> Inserted data into MongoDB

