## 1- importing libraries & Data understanding

In [353]:
import numpy as np
import pandas as pd 
import plotly.express as px
import pickle

In [354]:
df = pd.read_csv('Cars.csv')

In [355]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,Brand,Model,Body,Color,Year,Fuel,Kilometers,Engine,Transmission,Price,Gov
0,5337,Hyundai,Accent,Sedan,Black,2007,Benzine,140000 to 159999,1600 CC,Automatic,140.0,Giza
1,5338,Hyundai,Accent,Sedan,Silver,2005,Benzine,180000 to 199999,1000 - 1300 CC,Manual,78.0,Qena
2,5339,Hyundai,Accent,Sedan,Gray,1999,Benzine,140000 to 159999,1400 - 1500 CC,Manual,70.0,Giza


##### - The data describes used-car prices in Egypt. It has 10 features (categorical & numerical) that affect the target, the continuous numerical value "Price", so this is a regression problem.
##### - We will start by cleaning and preparing the data for use in a model.

In [356]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14741 entries, 0 to 14740
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    14741 non-null  int64  
 1   Brand         14741 non-null  object 
 2   Model         14741 non-null  object 
 3   Body          14741 non-null  object 
 4   Color         14741 non-null  object 
 5   Year          14741 non-null  int64  
 6   Fuel          14741 non-null  object 
 7   Kilometers    14741 non-null  object 
 8   Engine        14741 non-null  object 
 9   Transmission  14741 non-null  object 
 10  Price         14741 non-null  float64
 11  Gov           14741 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 1.3+ MB


##### - From the cell above, I recommend dropping the "Unnamed: 0" column and transforming "Kilometers" and "Engine" to a numeric type.
##### - I also think it is better to express "Year" in another form. Let's start ...

## 2- Data Cleaning & Preprocessing

In [357]:
df.Model.value_counts() # see value counts helps in deciding the type of encoding method you will use (one hot, binary)

128        2425
Verna      1903
Elantra    1529
Lanos      1342
Accent     1272
Optra      1252
Shahin     1142
Aveo        994
131         572
Cruze       428
Uno         350
Avante      282
Tipo        274
Punto       270
Matrix      268
Tucson      182
I10         166
Excel        90
Name: Model, dtype: int64

In [358]:
df.Engine.value_counts() # see type of entries to provide in mapping

1600 CC           6762
1400 - 1500 CC    4356
1000 - 1300 CC    3623
Name: Engine, dtype: int64

#### 2.1 Drop unnecessary columns

In [359]:
# "Unnamed: 0" carries no useful information (it is a leftover row counter).
# Rebinding instead of mutating in place, together with errors='ignore',
# keeps this cell safe to re-run: a second execution is a no-op instead of a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [360]:
df.head(2)

Unnamed: 0,Brand,Model,Body,Color,Year,Fuel,Kilometers,Engine,Transmission,Price,Gov
0,Hyundai,Accent,Sedan,Black,2007,Benzine,140000 to 159999,1600 CC,Automatic,140.0,Giza
1,Hyundai,Accent,Sedan,Silver,2005,Benzine,180000 to 199999,1000 - 1300 CC,Manual,78.0,Qena


#### 2.2 Check Duplicates & Null values

In [361]:
df.duplicated().sum() # one fully duplicated row is reported below; it is left in the data (it may be a genuine repeat listing rather than an error)

1

In [362]:
df.isnull().sum() # No Null values

Brand           0
Model           0
Body            0
Color           0
Year            0
Fuel            0
Kilometers      0
Engine          0
Transmission    0
Price           0
Gov             0
dtype: int64

#### 2.3 Type transformation for some Columns

In [363]:
df.info() # I think Kilometers & Engine needed to be transformed 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14741 entries, 0 to 14740
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Brand         14741 non-null  object 
 1   Model         14741 non-null  object 
 2   Body          14741 non-null  object 
 3   Color         14741 non-null  object 
 4   Year          14741 non-null  int64  
 5   Fuel          14741 non-null  object 
 6   Kilometers    14741 non-null  object 
 7   Engine        14741 non-null  object 
 8   Transmission  14741 non-null  object 
 9   Price         14741 non-null  float64
 10  Gov           14741 non-null  object 
dtypes: float64(1), int64(1), object(9)
memory usage: 1.2+ MB


In [364]:
df.Kilometers.value_counts() # we know when kilometers increases, the value of the car goes down so we will transform it into ordinal numbers

More than 200000    2505
10000 to 19999      1666
180000 to 199999    1349
100000 to 119999    1192
0 to 9999           1088
140000 to 159999    1064
120000 to 139999    1005
90000 to 99999       996
160000 to 179999     760
20000 to 29999       612
80000 to 89999       560
50000 to 59999       436
60000 to 69999       402
40000 to 49999       372
30000 to 39999       370
70000 to 79999       364
Name: Kilometers, dtype: int64

In [365]:
# Ordinal code for every mileage bucket, ordered from lowest to highest mileage.
# The original literal listed '100000 to 119999' twice (as 3 and as 12); Python
# keeps only the last occurrence, so the stale "3" entry is removed here.
# The remaining codes are left exactly as they were effectively used (code 3 is
# simply unused) so the encoding stays compatible with already-saved artifacts.
kilometer_map = {
    '0 to 9999': 1,
    '10000 to 19999': 2,
    '20000 to 29999': 4,
    '30000 to 39999': 5,
    '40000 to 49999': 6,
    '50000 to 59999': 7,
    '60000 to 69999': 8,
    '70000 to 79999': 9,
    '80000 to 89999': 10,
    '90000 to 99999': 11,
    '100000 to 119999': 12,
    '120000 to 139999': 13,
    '140000 to 159999': 14,
    '160000 to 179999': 15,
    '180000 to 199999': 16,
    'More than 200000': 17,
}

In [366]:
df.Kilometers = df.Kilometers.map(kilometer_map)

In [367]:
engine_map = {'1600 CC': 1, '1400 - 1500 CC': 2, '1000 - 1300 CC': 3} # NOTE: the codes run opposite to displacement - the largest engine (1600 CC) gets 1 and the smallest range (1000 - 1300 CC) gets 3.

In [368]:
df.Engine = df.Engine.map(engine_map)

In [369]:
df['Price'] = df['Price'] * 1000

In [370]:
df.Price.dtype

dtype('float64')

In [371]:
# Correlate the numeric columns only: the frame still holds text columns, and
# DataFrame.corr() raises on those in pandas >= 2.0.
# Reading the signs: Price falls as Kilometers rises. Engine codes are inverse to
# displacement (1 = 1600 CC), so the negative Price/Engine value means larger
# engines are priced higher.
df.select_dtypes(include='number').corr()

Unnamed: 0,Year,Kilometers,Engine,Price
Year,1.0,-0.206538,-0.58339,0.774288
Kilometers,-0.206538,1.0,0.191459,-0.18787
Engine,-0.58339,0.191459,1.0,-0.6053
Price,0.774288,-0.18787,-0.6053,1.0


In [372]:
# df['Price'] = df['Price'].apply(lambda x: "{:,.0f}".format)

In [373]:
# df.Price = df.Price.map('{:,.0f}'.format) # to make price more readable

In [374]:
# df.Price.dtype

#### 2.4 Feature Engineering (Using Age instead of Year for ML Algorithm) 

In [375]:
from datetime import datetime

In [376]:
# Reference year used to turn "Year" into "Age".
# It is pinned instead of read from the clock: the outputs in this notebook
# (a 2007 car has Age 16) were produced with 2023, and the saved transformer and
# model loaded in the inference section were fitted on ages relative to that
# year. Reading today's year would silently shift every Age on a later run.
current_year = 2023

In [377]:
df['Age'] = current_year - df['Year'] # creat a new column has the same information of "Year" but in better way

In [378]:
df.head(2)

Unnamed: 0,Brand,Model,Body,Color,Year,Fuel,Kilometers,Engine,Transmission,Price,Gov,Age
0,Hyundai,Accent,Sedan,Black,2007,Benzine,14,1,Automatic,140000.0,Giza,16
1,Hyundai,Accent,Sedan,Silver,2005,Benzine,16,3,Manual,78000.0,Qena,18


In [379]:
# Numeric columns only (DataFrame.corr() raises on text columns in pandas >= 2.0).
# As expected, Price goes down as Age goes up: the Age/Price correlation is -0.77,
# i.e. a strong negative relationship (the mirror image of Year/Price at +0.77).
df.select_dtypes(include='number').corr()

Unnamed: 0,Year,Kilometers,Engine,Price,Age
Year,1.0,-0.206538,-0.58339,0.774288,-1.0
Kilometers,-0.206538,1.0,0.191459,-0.18787,0.206538
Engine,-0.58339,0.191459,1.0,-0.6053,0.58339
Price,0.774288,-0.18787,-0.6053,1.0,-0.774288
Age,-1.0,0.206538,0.58339,-0.774288,1.0


In [380]:
# Quick exploration: average price for each model, cheapest first
bar_data = (
    df.groupby('Model', as_index=False)['Price']
      .mean()
      .sort_values(by='Price')
)
fig_bar = px.bar(bar_data, y='Model', x='Price')

In [381]:
fig_bar.show()

In [382]:
# Pick any model / year pair to inspect its average price
df.loc[(df['Model'] == 'Aveo') & (df['Year'] == 2011), 'Price'].mean()

123344.0

## 3- Preprocessing

#### 3.1 Drop "Year" column (use "Age" instead) 

In [383]:
df.drop(columns= ['Year'], inplace= True) # we don't need it any more

In [384]:
# df.to_csv('Cars_cleaned.csv')

#### 3.2 Providing categorical columns

In [385]:
# Names of the object-dtype (categorical) columns, kept as a plain list
cat_cols = df.select_dtypes(include='O').columns.tolist()
cat_cols

['Brand', 'Model', 'Body', 'Color', 'Fuel', 'Transmission', 'Gov']

#### 3.3 Providing numerical columns

In [386]:
# Names of the numeric columns, kept as a plain list
num_cols = df.select_dtypes(include='number').columns.tolist()
# 'Price' is the target, so it is excluded from the features to be scaled
num_cols.remove('Price')
num_cols

['Kilometers', 'Engine', 'Age']

In [387]:
# df = pd.get_dummies(df, columns=cat_cols, drop_first=True) # one hot encoder using pandas

#### 3.4 Creating the transformer

##### - The transformer is a black box that contains all the functions applied to the inputs, so that the data comes out in a form the model can work with.
##### - In other words, it is a group of functions called inside one main function, which returns the transformer (an object whose methods we then use).

In [388]:
from sklearn.preprocessing import OneHotEncoder # library to encode categorical columns
from sklearn.preprocessing import StandardScaler # library to scale numerical columns 
from sklearn.compose import make_column_transformer # library to save our transformer

##### - Here we have just 2 functions, which are created and called inside the main transformer function.
##### - Note: you can also create the functions outside the main function and then call them from it (this is the better strategy when there are more than 2 functions or the functions are more complex).

In [389]:
# The main transformer function
def preprocess(X_train, cat_cols, num_cols):
    '''
    Builds and fits the transformer that puts the inputs in a form a model can deal with.
    Scales the numerical values and one-hot encodes the categorical ones.

    Args:
        X_train (pandas DataFrame): data the transformer learns from; must contain
            every column named in cat_cols and num_cols.
        cat_cols (list): names of the categorical features.
        num_cols (list): names of the numerical features.

    Returns:
        transformer (ColumnTransformer): transformer already fitted on
            X_train[cat_cols + num_cols]; its output is a dense array with the
            encoded categorical columns first, followed by the scaled numerical ones.

    '''
    sc = StandardScaler() # 1st function: scales the numerical columns

    # 2nd function: dense one-hot encoding, dropping the first level of each feature.
    # The `sparse` argument was renamed to `sparse_output` in scikit-learn 1.2 and
    # removed in 1.4, so try the new name first and fall back for older versions.
    try:
        ohe = OneHotEncoder(sparse_output= False, drop= 'first')
    except TypeError:
        ohe = OneHotEncoder(sparse= False, drop= 'first')

    # Combine both steps, each applied to its own group of columns
    transformer = make_column_transformer(
        (ohe, cat_cols),
        (sc, num_cols),
        remainder= 'passthrough',
        verbose_feature_names_out= False,
    )
    transformer.fit(X_train[cat_cols+num_cols]) # fitting the transformer
    return transformer

#### 3.5 Splitting the data

In [390]:
from sklearn.model_selection import train_test_split

In [391]:
X = df.drop('Price', axis= 1)
y = df.Price

In [392]:
X_train, X_test, y_train, y_test = train_test_split(X ,y ,test_size= 0.2, random_state= 10)
print('X_train Size: ', X_train.shape)
print('y_train Size: ', y_train.shape)
print('X_test Size: ', X_test.shape)
print('y_test Size: ', y_test.shape)

X_train Size:  (11792, 10)
y_train Size:  (11792,)
X_test Size:  (2949, 10)
y_test Size:  (2949,)


In [393]:
X_train.head(1)

Unnamed: 0,Brand,Model,Body,Color,Fuel,Kilometers,Engine,Transmission,Gov,Age
143,Hyundai,Accent,Sedan,Blue- Navy Blue,Benzine,15,1,Automatic,Fayoum,11


In [394]:
X_test.iloc[10] # a single row selected by position; it is displayed below as a pandas Series (dtype: object)

Brand             Hyundai
Model               Verna
Body                Sedan
Color                Gray
Fuel              Benzine
Kilometers              1
Engine                  1
Transmission    Automatic
Gov                  Giza
Age                     6
Name: 2198, dtype: object

In [395]:
# One-row DataFrame kept aside to test the saved transformer/model later.
# Selecting with a list (iloc[[200]]) returns a DataFrame directly and keeps each
# column's dtype; building it from a transposed Series would turn every column,
# including the numeric ones, into object dtype.
tst_smpl = X_test.iloc[[200]]
tst_smpl

Unnamed: 0,Brand,Model,Body,Color,Fuel,Kilometers,Engine,Transmission,Gov,Age
11364,Fiat,Shahin,Sedan,Gray,Benzine,10,2,Manual,Sohag,16


In [396]:
tst_smpl_y = y_test.iloc[200]
print(tst_smpl_y)
print(tst_smpl_y.dtype)

60000.0
float64


In [397]:
# One-row DataFrame kept aside to test the transformer.
# Selecting with a list (iloc[[10]]) returns a DataFrame directly and keeps each
# column's dtype; building it from a transposed Series would turn every column,
# including the numeric ones, into object dtype.
test_sample = X_test.iloc[[10]]
test_sample

Unnamed: 0,Brand,Model,Body,Color,Fuel,Kilometers,Engine,Transmission,Gov,Age
2198,Hyundai,Verna,Sedan,Gray,Benzine,1,1,Automatic,Giza,6


In [398]:
y_test.iloc[10]

170000.0

In [399]:
test_sample_y = pd.DataFrame({'Price': y_test.iloc[10]}, index= [10]) # saving sample to use it in testing transformer
test_sample_y

Unnamed: 0,Price
10,170000.0


In [400]:
transformer = preprocess(X_train, cat_cols, num_cols) # here we save the transformer fitted from X_train and ready to use
print(transformer)

ColumnTransformer(remainder='passthrough',
                  transformers=[('onehotencoder',
                                 OneHotEncoder(drop='first', sparse=False),
                                 ['Brand', 'Model', 'Body', 'Color', 'Fuel',
                                  'Transmission', 'Gov']),
                                ('standardscaler', StandardScaler(),
                                 ['Kilometers', 'Engine', 'Age'])],
                  verbose_feature_names_out=False)


In [401]:
X_train = transformer.transform(X_train[cat_cols+num_cols]) # Apply transformation to training data
X_test = transformer.transform(X_test[cat_cols+num_cols]) # Apply transformation to test data

In [402]:
test_sample = transformer.transform(test_sample[cat_cols+num_cols]) # test the transformer in the sample data 
test_sample # we see here the transformer takes a dataframe and returns an array 

array([[ 0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        , -1.71507056, -0.96827602, -0.9069594 ]])

In [403]:
# Examine the shape of data after transformation
print('X_train Size: ', X_train.shape)
print('y_train Size: ', y_train.shape)
print('X_test Size: ', X_test.shape)
print('y_test Size: ', y_test.shape)

X_train Size:  (11792, 64)
y_train Size:  (11792,)
X_test Size:  (2949, 64)
y_test Size:  (2949,)


## 4- Modeling & Validation

#### The next step after preparing the data is modeling & validation.
#### For validation, we will use cross-validation.
#### We will also compare several models and choose the best one based on its R² value.
#### Let's start ...

In [404]:
from sklearn.model_selection import KFold, cross_val_score # for validation

from sklearn.linear_model import LinearRegression # for modeling
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from sklearn.metrics import r2_score # for accuracy measuring

In [405]:
k_folds = KFold(n_splits = 5) # Providing the number of folds

In [406]:
# Saving models
# Candidate models to compare.
# The estimators that involve randomness (forest, tree, boosting) get a fixed
# seed - the same value used for the train/test split and the final model - so
# the comparison table is reproducible from run to run.
linear_regression = LinearRegression()
random_forest = RandomForestRegressor(random_state= 10)
knn = KNeighborsRegressor()
decision_tree = DecisionTreeRegressor(random_state= 10)
xgboost = XGBRegressor(random_state= 10)
models = {
          'LinearRegression': linear_regression,
          'RandomForestRegressor': random_forest,
          'knn': knn,
          'DecisionTreeRegressor': decision_tree,
          'XGBRegressor': xgboost
          }

#### Below we will build a function to:
#### 1- Store the cross-validation scores in a dataframe (to check whether the scores are stable across folds).
#### 2- Store the accuracy readings in a dataframe (to compare the models and choose the best one).
#### Note: the only reason for storing each result in a dataframe is to make it easy to read.

In [407]:
# Testing different models
def models_compar(X_train, y_train, models, k_folds, X_eval=None, y_eval=None):
    '''
    Computes cross-validation scores and train / hold-out scores for different models.

    Every model is cross-validated on the training data, then fitted on the whole
    training set (the estimator objects in `models` are fitted in place) and
    scored on both the training data and a hold-out set.

    Args:
        X_train (array-like): Training data features.
        y_train (array-like): Training data target.
        models (dictionary): model name -> estimator; the models we compare among.
        k_folds (class sklearn.model_selection.KFold(n_splits= ..)) : folds used in cross validation.
        X_eval (array-like, optional): Hold-out features used for the test scores.
        y_eval (array-like, optional): Hold-out target used for the test scores.
            When either of the two is omitted, the notebook-level X_test / y_test
            are used, which is what this function always read before these
            parameters existed.

    Returns:
        scores_cv_df (pd.DataFrame) : Cross validation scores, one column per model, one row per fold.
        models_df (pd.DataFrame) : One column per model with the rows
            'scores_avg_cv', 'train_acc', 'test_acc' and 'r2'.

    '''
    if X_eval is None or y_eval is None:
        # Backward-compatible fallback to the globals defined by the split cell
        X_eval, y_eval = X_test, y_test

    scores_cv = {}
    summary = {}
    for model_name, model in models.items():
        # cross_val_score works on clones, so `model` itself is still unfitted here
        scores = cross_val_score(model, X_train, y_train, cv = k_folds)
        scores_cv[model_name] = scores

        model.fit(X_train, y_train)
        y_pred = model.predict(X_eval)
        summary[model_name] = {
            'scores_avg_cv': scores.mean(),
            'train_acc': model.score(X_train, y_train),
            # for scikit-learn style regressors .score() is R², so the next two
            # rows are expected to coincide; both are kept for compatibility
            'test_acc': model.score(X_eval, y_eval),
            'r2': r2_score(y_eval, y_pred),
        }

    scores_cv_df = pd.DataFrame(scores_cv)
    models_df = pd.DataFrame(summary, index= ['scores_avg_cv','train_acc','test_acc','r2'], columns= list(models.keys()))

    return  scores_cv_df, models_df


In [408]:
scores_cv_df, models_df = models_compar(X_train, y_train, models, k_folds) 

In [409]:
scores_cv_df

Unnamed: 0,LinearRegression,RandomForestRegressor,knn,DecisionTreeRegressor,XGBRegressor
0,0.841143,0.905761,0.860272,0.856714,0.91381
1,0.856072,0.921308,0.878006,0.888789,0.931114
2,0.852506,0.909129,0.865626,0.854819,0.920892
3,0.849793,0.908071,0.881457,0.871219,0.926377
4,0.846449,0.911153,0.861691,0.855568,0.919099


#### Above, we see that the 5 validation scores of each model are close to one another, so each model performs consistently across the folds.
#### Validation was done successfully ...

In [410]:
models_df

Unnamed: 0,LinearRegression,RandomForestRegressor,knn,DecisionTreeRegressor,XGBRegressor
scores_avg_cv,0.849193,0.911085,0.869411,0.865422,0.922258
train_acc,0.851441,0.979769,0.9221,0.987791,0.960688
test_acc,0.839226,0.908931,0.860142,0.879067,0.911683
r2,0.839226,0.908931,0.860142,0.879067,0.911683


#### Both XGBRegressor and RandomForestRegressor gave almost the same R² value on the test set (about 0.91).
#### I'll choose "RandomForestRegressor".

## 5- Tuning the Model

#### Before creating the final model, we need to find good values for its hyperparameters.
#### In a random forest, 2 important hyperparameters are n_estimators and max_depth.
#### Below, we use grid search to find the best values.

In [411]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the two hyperparameters being tuned.
# NOTE(review): the best values reported below sit at the upper edge of both
# ranges (max_depth=12, n_estimators=200), so widening the grid may be worthwhile.
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [7,8,9,10,11,12]}

# Search with a fresh, seeded estimator (same seed as the final model) instead of
# the unseeded forest already fitted during the model comparison, so the selected
# parameters are reproducible.
grid = GridSearchCV(RandomForestRegressor(random_state= 10), param_grid, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': [7, 8, 9, 10, 11, 12],
                         'n_estimators': [50, 100, 200]},
             scoring='r2')

In [412]:
print(grid.best_score_)
print(grid.best_params_)

0.9145485712818984
{'max_depth': 12, 'n_estimators': 200}


In [413]:
# Final model: the hyperparameter values selected by the grid search (n_estimators and max_depth), with a fixed seed
random_forest_best = RandomForestRegressor(random_state= 10, **grid.best_params_)

In [414]:
random_forest_best.fit(X_train,y_train) # fitting the model

RandomForestRegressor(max_depth=12, n_estimators=200, random_state=10)

In [415]:
print('Training Accuracy: ', random_forest_best.score(X_train, y_train))
print('Test Accuracy: ', random_forest_best.score(X_test, y_test))

Training Accuracy:  0.954269983973256
Test Accuracy:  0.9087887034795815


In [416]:
print('Prediction Test: ', random_forest_best.predict(test_sample)[0]) # sample for test
print('Actual Value: ', y_test.iloc[10])

Prediction Test:  178339.9290053449
Actual Value:  170000.0


In [417]:
y_pred = random_forest_best.predict(X_test)

In [418]:
print(r2_score(y_test,y_pred))

0.9087887034795815


### Congratulations !! ... Now you have a model with an R² of about 0.91 on the test set.

## 6- Transformer saving

In [419]:
# pickle.dump(transformer, open('transf.pkl', 'wb'))

## 7- Model saving

In [420]:
# pickle.dump(random_forest_best, open('RF.pkl', 'wb'))

## 8- Inference

In [421]:
# Load the saved transformer and model. Both files must already exist on disk:
# the cells that write them are commented out in this notebook.
# SECURITY: pickle.load can execute arbitrary code, so only load files you created yourself.
# The `with` blocks make sure each file handle is closed after loading.
with open('transf.pkl', 'rb') as transformer_file:
    t = pickle.load(transformer_file)
with open('RF.pkl', 'rb') as model_file:
    m = pickle.load(model_file)

In [422]:
tst_smpl = t.transform(tst_smpl[cat_cols+num_cols])

In [423]:
print('Prediction Test: ', m.predict(tst_smpl)[0])
print('Actual Value: ', tst_smpl_y)

Prediction Test:  54486.19741976683
Actual Value:  60000.0


### Project Completed.