In [2]:
import pandas as pd
import  numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
# Load the raw car price dataset from disk.
file_path = "source/car_price_prediction.csv"
try:
    car_df = pd.read_csv(file_path)
except FileNotFoundError:
    # Every later cell needs car_df, so report the bad path and stop here
    # instead of letting the notebook continue into a confusing NameError.
    print(f"Error retrieving the file check :{file_path}")
    raise

In [None]:
# Show the first five rows to sanity-check the columns and raw value formats.
car_df.head(n=5)

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


In [None]:
# The ID column is treated as not important for the model, so remove it.
car_df = car_df.drop("ID", axis="columns")

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
car_df = car_df.drop_duplicates(keep="first")

In [None]:
# Report the frame's dimensions, then the per-column non-null counts and dtypes.
n_rows, n_cols = car_df.shape
print(f"rows :{n_rows}, columns :{n_cols}")
car_df.info()

rows :15725, columns :17
<class 'pandas.core.frame.DataFrame'>
Index: 15725 entries, 0 to 19236
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             15725 non-null  int64  
 1   Levy              15725 non-null  object 
 2   Manufacturer      15725 non-null  object 
 3   Model             15725 non-null  object 
 4   Prod. year        15725 non-null  int64  
 5   Category          15725 non-null  object 
 6   Leather interior  15725 non-null  object 
 7   Fuel type         15725 non-null  object 
 8   Engine volume     15725 non-null  object 
 9   Mileage           15725 non-null  object 
 10  Cylinders         15725 non-null  float64
 11  Gear box type     15725 non-null  object 
 12  Drive wheels      15725 non-null  object 
 13  Doors             15725 non-null  object 
 14  Wheel             15725 non-null  object 
 15  Color             15725 non-null  object 
 16  Airbags           15

In [None]:
# Clean the Levy column: blank out the "-" placeholder, then parse as numbers.
# errors="coerce" turns anything that is not a valid number into NaN.
levy_text = car_df["Levy"].replace({"-": ""})
car_df["Levy"] = pd.to_numeric(levy_text, errors="coerce")

In [None]:
# Strip the "km" unit from Mileage and store the column as 64-bit integers.
mileage_text = car_df["Mileage"].str.replace("km", "")
car_df["Mileage"] = mileage_text.astype("int64")

In [None]:
# Drop every row that contains at least one null value
# (this includes the Levy entries coerced to NaN during cleaning).
car_df = car_df.dropna(axis=0, how="any")

In [None]:
# One-hot encode the categorical columns as 0/1 integers.
# drop_first=True drops the first level of each encoded column.
# NOTE(review): "Engine volume" is still object dtype in the info() output,
# so it is dummy-encoded here rather than used as a number - confirm intended.
car_df = pd.get_dummies(
    data=car_df,
    drop_first=True,
    dtype=int,
)

In [None]:
# Build the feature matrix (every column except the target) and the target,
# then hold out 20% of the rows for testing with a fixed seed.
features = [column for column in car_df.columns if column != "Price"]
x = car_df[features]
y = car_df["Price"]

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a decision tree regressor on the training split and score it on the
# held-out test split with mean absolute error and R^2.
dec_tree = DecisionTreeRegressor(random_state=42).fit(x_train, y_train)
dec_tree_pred = dec_tree.predict(x_test)

dec_tree_mae = mean_absolute_error(y_true=y_test, y_pred=dec_tree_pred)
dec_tree_score = r2_score(y_true=y_test, y_pred=dec_tree_pred)

print(f"decision tree mae:{dec_tree_mae}")
print(f"decision tree score :{dec_tree_score:0.2f}")

decision tree mae:6304.695868591339
decision tree score :0.51


In [None]:
# Fit an ordinary linear regression on the training split and score it on the
# held-out test split with mean absolute error and R^2.
linear_r = LinearRegression().fit(x_train, y_train)
linear_r_pred = linear_r.predict(x_test)

linear_r_mae = mean_absolute_error(y_true=y_test, y_pred=linear_r_pred)
linear_r_score = r2_score(y_true=y_test, y_pred=linear_r_pred)

print(f"mae for linear regressor :{linear_r_mae}")
print(f"score for linear regressor :{linear_r_score:0.2f}")

mae for linear regressor :8119.099446164946
score for linear regressor :0.50


In [None]:
# Fit a random forest regressor on the training split and score it on the
# held-out test split with mean absolute error and R^2.
rand_f = RandomForestRegressor(random_state=42).fit(x_train, y_train)
rand_f_pred = rand_f.predict(x_test)

rand_f_mae = mean_absolute_error(y_true=y_test, y_pred=rand_f_pred)
rand_f_score = r2_score(y_true=y_test, y_pred=rand_f_pred)

print(f"mae for Random forest {rand_f_mae}")
print(f"score for Random forest {rand_f_score:0.2f}")

mae for Random forest 4765.736129727734
score for Random forest 0.73
