<a href="https://colab.research.google.com/github/Galih188/DataMining/blob/main/Prediction_Car_Price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import Library**

In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [25]:
data_set = '/content/car_price.csv'
df = pd.read_csv(data_set)

In [26]:
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
0,0,Jeep Compass 2.0 Longitude Option BSIV,10.03 Lakh,"86,226 kms",Diesel,Manual,1st Owner,2017,1956 cc,5 Seats
1,1,Renault Duster RXZ Turbo CVT,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats
2,2,Toyota Camry 2.5 G,16.40 Lakh,"60,343 kms",Petrol,Automatic,1st Owner,2016,2494 cc,5 Seats
3,3,Honda Jazz VX CVT,7.77 Lakh,"26,696 kms",Petrol,Automatic,1st Owner,2018,1199 cc,5 Seats
4,4,Volkswagen Polo 1.2 MPI Highline,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats


In [27]:
df[df.duplicated()].count()

Unnamed: 0,0
Unnamed: 0,0
car_name,0
car_prices_in_rupee,0
kms_driven,0
fuel_type,0
transmission,0
ownership,0
manufacture,0
engine,0
Seats,0


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5512 entries, 0 to 5511
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           5512 non-null   int64 
 1   car_name             5512 non-null   object
 2   car_prices_in_rupee  5512 non-null   object
 3   kms_driven           5512 non-null   object
 4   fuel_type            5512 non-null   object
 5   transmission         5512 non-null   object
 6   ownership            5512 non-null   object
 7   manufacture          5512 non-null   int64 
 8   engine               5512 non-null   object
 9   Seats                5512 non-null   object
dtypes: int64(2), object(8)
memory usage: 430.8+ KB


In [29]:
df.describe()

Unnamed: 0.1,Unnamed: 0,manufacture
count,5512.0,5512.0
mean,2755.5,2015.455552
std,1591.321673,3.927974
min,0.0,1995.0
25%,1377.75,2013.0
50%,2755.5,2016.0
75%,4133.25,2018.0
max,5511.0,2022.0


In [30]:
df[df.isna()].count()

Unnamed: 0,0
Unnamed: 0,0
car_name,0
car_prices_in_rupee,0
kms_driven,0
fuel_type,0
transmission,0
ownership,0
manufacture,0
engine,0
Seats,0


# **Preprocessing Data**

In [31]:
df = df.drop(columns='Unnamed: 0', axis=1)

In [32]:
la_enc = LabelEncoder()
df['car_name'] = la_enc.fit_transform(df['car_name'])

In [33]:
df = pd.get_dummies(df, columns=['fuel_type', 'transmission'], drop_first = True)

In [34]:
df.head()

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,ownership,manufacture,engine,Seats,fuel_type_Diesel,fuel_type_Electric,fuel_type_Lpg,fuel_type_Petrol,transmission_Manual
0,743,10.03 Lakh,"86,226 kms",1st Owner,2017,1956 cc,5 Seats,True,False,False,False,True
1,1444,12.83 Lakh,"13,248 kms",1st Owner,2021,1330 cc,5 Seats,False,False,False,True,False
2,1706,16.40 Lakh,"60,343 kms",1st Owner,2016,2494 cc,5 Seats,False,False,False,True,False
3,385,7.77 Lakh,"26,696 kms",1st Owner,2018,1199 cc,5 Seats,False,False,False,True,False
4,1831,5.15 Lakh,"69,414 kms",1st Owner,2016,1199 cc,5 Seats,False,False,False,True,True


Mengubah nama kolom

In [35]:
df['price'] = df['car_prices_in_rupee']
del df['car_prices_in_rupee']
df['kms'] = df['kms_driven']
del df['kms_driven']

Menghapus Lakh dan Crore dari kolomnya, membuat kolom baru Currency

In [36]:
df.loc[df['price'].str.contains('Lakh'), 'Currency'] = 'Lakh'
df.loc[df['price'].str.contains('Crore'), 'Currency'] = 'Crore'
df = df.dropna()

In [37]:
df.head()

Unnamed: 0,car_name,ownership,manufacture,engine,Seats,fuel_type_Diesel,fuel_type_Electric,fuel_type_Lpg,fuel_type_Petrol,transmission_Manual,price,kms,Currency
0,743,1st Owner,2017,1956 cc,5 Seats,True,False,False,False,True,10.03 Lakh,"86,226 kms",Lakh
1,1444,1st Owner,2021,1330 cc,5 Seats,False,False,False,True,False,12.83 Lakh,"13,248 kms",Lakh
2,1706,1st Owner,2016,2494 cc,5 Seats,False,False,False,True,False,16.40 Lakh,"60,343 kms",Lakh
3,385,1st Owner,2018,1199 cc,5 Seats,False,False,False,True,False,7.77 Lakh,"26,696 kms",Lakh
4,1831,1st Owner,2016,1199 cc,5 Seats,False,False,False,True,True,5.15 Lakh,"69,414 kms",Lakh


Menghapus teks pada nilai numerik

In [None]:
df['ownership'] = df['ownership'].str.replace('st Owner', '')
df['ownership'] = df['ownership'].str.replace('nd Owner', '')
df['ownership'] = df['ownership'].str.replace('rd Owner', '')
df['ownership'] = df['ownership'].str.replace('th Owner', '')
df['engine'] = df['engine'].str.replace(' cc', '')
df['Seats'] = df['Seats'].str.replace(' Seats', '')
df['price'] = df['price'].str.replace(' Lakh', '')
df['price'] = df['price'].str.replace(' Crore', '')
df['kms'] = df['kms'].str.replace(' kms', '')

In [39]:
df.head()

Unnamed: 0,car_name,ownership,manufacture,engine,Seats,fuel_type_Diesel,fuel_type_Electric,fuel_type_Lpg,fuel_type_Petrol,transmission_Manual,price,kms,Currency
0,743,1,2017,1956,5,True,False,False,False,True,10.03,86226,Lakh
1,1444,1,2021,1330,5,False,False,False,True,False,12.83,13248,Lakh
2,1706,1,2016,2494,5,False,False,False,True,False,16.4,60343,Lakh
3,385,1,2018,1199,5,False,False,False,True,False,7.77,26696,Lakh
4,1831,1,2016,1199,5,False,False,False,True,True,5.15,69414,Lakh


Mengkonversi kategorikal ke numrik

In [40]:
df['ownership'] = pd.to_numeric(df['ownership'])
df['engine'] = pd.to_numeric(df['engine'])
df['Seats'] = pd.to_numeric(df['Seats'])
df['kms'] = df['kms'].str.replace(',', '')
df['kms'] = pd.to_numeric(df['kms'])
df['price'] = df['price'].astype('float')

In [41]:
df.head()

Unnamed: 0,car_name,ownership,manufacture,engine,Seats,fuel_type_Diesel,fuel_type_Electric,fuel_type_Lpg,fuel_type_Petrol,transmission_Manual,price,kms,Currency
0,743,1,2017,1956,5,True,False,False,False,True,10.03,86226,Lakh
1,1444,1,2021,1330,5,False,False,False,True,False,12.83,13248,Lakh
2,1706,1,2016,2494,5,False,False,False,True,False,16.4,60343,Lakh
3,385,1,2018,1199,5,False,False,False,True,False,7.77,26696,Lakh
4,1831,1,2016,1199,5,False,False,False,True,True,5.15,69414,Lakh


In [42]:
df = df.drop('Currency', axis = 1)

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5389 entries, 0 to 5511
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   car_name             5389 non-null   int64  
 1   ownership            5389 non-null   int64  
 2   manufacture          5389 non-null   int64  
 3   engine               5389 non-null   int64  
 4   Seats                5389 non-null   int64  
 5   fuel_type_Diesel     5389 non-null   bool   
 6   fuel_type_Electric   5389 non-null   bool   
 7   fuel_type_Lpg        5389 non-null   bool   
 8   fuel_type_Petrol     5389 non-null   bool   
 9   transmission_Manual  5389 non-null   bool   
 10  price                5389 non-null   float64
 11  kms                  5389 non-null   int64  
dtypes: bool(5), float64(1), int64(6)
memory usage: 363.1 KB


**Model Machine Learning**

*   X = feature
*   Y = predicting value



In [44]:
y = df['price']
x = df.drop('price', axis = 1)

Cross validation untuk menemukan model terbaik dari Linear Regression, Xgbboost Regressor, Random Forest Regressor

In [50]:
models = [LinearRegression(), xgb.XGBRegressor(), RandomForestRegressor(n_estimators=1000, max_depth=50, random_state=35)]
for model in models:
  score = cross_val_score(model, x, y, cv=5, scoring = 'r2')
  print(f"skor {model} : {score}")

skor LinearRegression() : [0.43460144 0.42342745 0.40277693 0.44912042 0.47706178]
skor XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...) : [0.79917505 0.78350938 0.80684729 0.90308989 0.89501059]
skor RandomForestRegressor(max_depth=50, n_estimators=1000, random_state=35) : [0.80182513 0.77211016 0.82

Hasil dari Cross validation yang terbaik adalah Random Forest Regressor dengan hasil 0.90, jadi saya melanjutkan dengan model tsb

Melakukan optimasi hyperparameter dengan GridCV untuk memperoleh parameter terbaik

In [51]:
grid_param = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 8, 12],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

model = RandomForestRegressor()

grid_search = GridSearchCV(model, grid_param, cv=5)
grid_search.fit(x, y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Parameter terbaik: ", best_params)
print("Skor terbaik: ", best_score)

  _data = np.array(data, dtype=dtype, copy=copy,


Parameter terbaik:  {'max_depth': 12, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Skor terbaik:  0.8385909826486329


In [52]:
randFor =  RandomForestRegressor(n_estimators=100, max_depth=12, min_samples_leaf=1, min_samples_split=2)
randFor.fit(x, y)
score = randFor.score(x, y)
print(f"skor: {score}")

skor: 0.9723049037889256


In [53]:
df.head()

Unnamed: 0,car_name,ownership,manufacture,engine,Seats,fuel_type_Diesel,fuel_type_Electric,fuel_type_Lpg,fuel_type_Petrol,transmission_Manual,price,kms
0,743,1,2017,1956,5,True,False,False,False,True,10.03,86226
1,1444,1,2021,1330,5,False,False,False,True,False,12.83,13248
2,1706,1,2016,2494,5,False,False,False,True,False,16.4,60343
3,385,1,2018,1199,5,False,False,False,True,False,7.77,26696
4,1831,1,2016,1199,5,False,False,False,True,True,5.15,69414


Sample prediksi dari element pertama

In [54]:
prediksi = [[743, 1, 2017, 1956, 5, 1, 0, 0, 0, 1, 86226]]
randFor.predict(prediksi)



array([9.94631168])

Melakukan evaluasi terhadap kinerja model Random Forest, dengan membandingkan nilai aktual dan nilai prediksi, serta menghitung metrik kesalahan seperti MSE dan MAPE

In [55]:
df_error = pd.DataFrame()
df_error['Values'] = y
df_error['Predictions'] = randFor.predict(x)
df_error['MSE'] = mean_squared_error(y, randFor.predict(x))
df_error['Percentages'] = mean_absolute_percentage_error(y, randFor.predict(x))
df_error.head()

Unnamed: 0,Values,Predictions,MSE,Percentages
0,10.03,9.946312,5.132014,0.183001
1,12.83,10.249456,5.132014,0.183001
2,16.4,14.727515,5.132014,0.183001
3,7.77,9.223392,5.132014,0.183001
4,5.15,5.31066,5.132014,0.183001
