# Build Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


## Prepare data

### Read data and create a column `Year` to help increase a feature for training model.

add a column `Year` - distance from First Registration's Year of the car to now.

In [13]:

data = pd.read_csv("data\cleaned_data_edited.csv",index_col="ID")

data['Year'] = data['First registration'].str[-4:].astype(int)
data['Year'] = 2024 - data['Year']
data.columns


Index(['CARNAME', 'Make', 'Model', 'Body color', 'Interior color',
       'Interior material', 'Body', 'Doors', 'Seats', 'Fuel', 'Transmission',
       'Drive type', 'Power(kW)', 'CO2 emissions(g/km)', 'Emission class',
       'Mileage(km)', 'First registration', 'Condition',
       'Consumption(l/100km or kWh/100km)', 'Price(EUR)', 'Tags',
       'Engine capacity(ccm)', 'Previous owners', 'Year'],
      dtype='object')

### Find object columns and low cardinality columns

Due to our data have many categorical columns, we have to do one hot encoding for training model. Before handling one-hot, we have to:
- Find categorical (object) columns.
- Find columns that have low cardinality.

Why have to find columns that have low cardinality?
- For large datasets with many rows, one-hot encoding can greatly expand the size of the dataset.  For this reason, we typically will only one-hot encode columns with relatively low cardinality.  Then, high cardinality columns can either be dropped from the dataset, or we can use ordinal encoding.

In [3]:
s = (data.dtypes == 'object')
object_cols = list(s[s].index)
print("Categorical variables:")
print(object_cols)

low_cardinality_cols = [col for col in object_cols if data[col].nunique() < 10]
# low_cardinality_cols.append("Make")
print("Low cardinality col:")
print (low_cardinality_cols)

Categorical variables:
['CARNAME', 'Make', 'Model', 'Body color', 'Interior color', 'Interior material', 'Body', 'Doors', 'Fuel', 'Transmission', 'Drive type', 'Emission class', 'First registration', 'Condition', 'Tags']
Low cardinality col:
['Interior color', 'Interior material', 'Doors', 'Fuel', 'Transmission', 'Drive type', 'Emission class', 'Condition']


### Find tags


- Due to the column `Tags` is a multiple value column, we have to choose which tag (special function) to do one hot encoding and put to the model 

In [4]:

one_hot_df =  data[["Price(EUR)","Tags"]].copy()

# Get each tag in each multiple value row
tags = one_hot_df['Tags'].str.split('; ', expand=True)

# Stack to make the DataFrame long, then get_dummies and group by index before summing
get_dummy = pd.get_dummies(tags.stack()).groupby(level=0).sum()

# Join the one-hot encoded DataFrame back to the original DataFrame
one_hot_df = one_hot_df.join(get_dummy)

# Drop the `Tags` because we dont need it anymore
one_hot_df = one_hot_df.drop("Tags",axis=1)

# Calculate correlation with 'Price(EUR)'
correlation = one_hot_df.corr()['Price(EUR)']

# Get the top 10 tags with highest correlation with 'Price(EUR)'
top_10_corr_tags = correlation.nlargest(9).index

# Select only the top 10 tags with highest correlation with 'Price(EUR)'
one_hot_df = one_hot_df[top_10_corr_tags].drop(["Price(EUR)"],axis=1)

one_hot_df.columns


Index(['Air suspension', 'Ventilated front seats',
       'Electric adjustable front seats', 'Digital cockpit', 'Burmester audio',
       'Heated rear seats', 'Laser headlights', 'Adaptive cruise control'],
      dtype='object')

### Find top 8  Car Manufacturer

In [16]:
one_hot = pd.get_dummies(data['Make'])
data_encoded = data[['Price(EUR)']].join(one_hot) 
correlation = data_encoded.corr()['Price(EUR)']
top_10_Make = correlation.nlargest(11).index
top_10_Make


Index(['Price(EUR)', 'Ferrari', 'Rolls-Royce', 'Lamborghini', 'Porsche',
       'Mercedes-Benz', 'BMW', 'Aston Martin', 'Audi', 'Land Rover',
       'Bentley'],
      dtype='object')

## Split data

- To train and test a model:
    - First we will split our data into X_train, X_valid, y_train, y_valid dataset.
    - Second, we have to handle categorical features - one hot encoding.

In [5]:
numerical_cols = [cname for cname in data.columns if 
                data[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numerical_cols

X = data[my_cols].copy().reset_index(drop=True)
one_hot_df = one_hot_df.reset_index(drop=True)
X = pd.concat([X, one_hot_df], axis=1)

y = X["Price(EUR)"].copy()
X.drop(["Price(EUR)"], axis=1, inplace=True)

X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [21]:

X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2289 entries, 0 to 2288
Data columns (total 23 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Interior color                     2289 non-null   object 
 1   Interior material                  2289 non-null   object 
 2   Doors                              2289 non-null   object 
 3   Fuel                               2289 non-null   object 
 4   Transmission                       2289 non-null   object 
 5   Drive type                         2289 non-null   object 
 6   Emission class                     2289 non-null   object 
 7   Condition                          2284 non-null   object 
 8   Seats                              2289 non-null   float64
 9   Power(kW)                          2289 non-null   int64  
 10  CO2 emissions(g/km)                2289 non-null   int64  
 11  Mileage(km)                        2289 non-null   int64

## Create a PipeLine for Cross Validation

### First, we have to define transformer for the Pipeline

- Because numerical col in our data were preprocess already, so we just have to preprocess for categorical.

In [6]:
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, low_cardinality_cols)
    ])

### Second: Fit model and Predict

In [7]:


my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', RandomForestRegressor(n_estimators=50,
                                                              random_state=0))
                             ])

my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

MAE: 12014.201364980201


## Cross Validation


In [8]:
from sklearn.model_selection import cross_val_score

# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(my_pipeline, X, y,
                              cv=5,
                              scoring='neg_mean_absolute_error')

print("MAE scores:\n", scores)

MAE scores:
 [14958.37390096 11697.98522738 10136.10593482 10785.85164104
 11300.86574205]


## Train model, Test and make Prediction from User Input

In [22]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_absolute_error

# # function for comparing different approaches
# def score_dataset(X_train, X_valid, y_train, y_valid):
#     model = RandomForestRegressor(n_estimators=100, random_state=0)
#     model.fit(X_train, y_train)
#     preds = model.predict(X_valid)
#     return mean_absolute_error(y_valid, preds)

# print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))
def input_car_info():
    # Initialize an empty dictionary to hold user input
    car_info = {}

    # List all the features
    features = ['Interior color', 'Interior material', 'Doors', 'Fuel', 'Transmission', 
                'Drive type', 'Emission class', 'Condition', 'Seats', 'Power(kW)', 
                'CO2 emissions(g/km)', 'Mileage(km)', 'Consumption(l/100km or kWh/100km)', 
                'Engine capacity(ccm)', 'Previous owners', 'Air suspension', 
                'Ventilated front seats', 'Electric adjustable front seats', 
                'Digital cockpit', 'Burmester audio', 'Heated rear seats', 
                'Laser headlights', 'Adaptive cruise control']

    # Ask the user to input values for each feature
    for feature in features:
        value = input(f"Please enter the {feature} of the car: ")
        car_info[feature] = [value]  # Use a list here because pd.DataFrame expects a list

    # Convert the dictionary to a DataFrame
    X_input = pd.DataFrame(car_info)

    return X_input

# Call the function to get user input and create X_input
X_input = input_car_info()
X_input


Unnamed: 0,Interior color,Interior material,Doors,Fuel,Transmission,Drive type,Emission class,Condition,Seats,Power(kW),...,Engine capacity(ccm),Previous owners,Air suspension,Ventilated front seats,Electric adjustable front seats,Digital cockpit,Burmester audio,Heated rear seats,Laser headlights,Adaptive cruise control
0,Red,Leather,4,Petrol,Manual,4x4,Euro 6d-TEMP,Used,4,200,...,2000,1,1,0,2,1,0,1,0,1


In [26]:
preds = my_pipeline.predict(X_input)
preds


array([39287.96666667])