# Build Model

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


## Prepare data

### Read the data and create a `Year` column to use as an additional feature for training the model.

In [22]:

# Forward slashes resolve on every OS; the previous "data\cleaned_..." literal
# relied on the invalid "\c" escape sequence and only worked on Windows.
data = pd.read_csv("data/cleaned_data_edited.csv", index_col="ID")
# NOTE(review): the recorded cell output appears to show a warning pointing at
# this line; consider passing an explicit format= to pd.to_datetime once the
# date format of 'First registration' is confirmed.
data['Year'] = pd.to_datetime(data['First registration']).dt.year.astype(int)

# Work on a copy so `data` keeps the raw registration column and calendar year.
# In `df`, `Year` is replaced by the number of years between the first
# registration and 2023, and the raw date column is removed.
df = data.copy()
df['Year'] = 2023 - df['Year']
df = df.drop(columns=['First registration'])



  data['Year'] = pd.to_datetime(data['First registration']).dt.year.astype(int)


### Find object columns and low cardinality columns

Because our data has many categorical columns, we have to one-hot encode them before training the model. Before doing the one-hot encoding, we have to:
- Find categorical (object) columns.
- Find columns that have low cardinality.

Why do we have to find the columns that have low cardinality?
- For large datasets with many rows, one-hot encoding can greatly expand the size of the dataset.  For this reason, we typically will only one-hot encode columns with relatively low cardinality.  Then, high cardinality columns can either be dropped from the dataset, or we can use ordinal encoding.

In [23]:
# Boolean mask indexed by column name: True where the column has object dtype.
s = data.dtypes == 'object'
object_cols = [name for name, is_object in s.items() if is_object]
print("Categorical variables:")
print(object_cols)

# Of those, keep the columns with fewer than 10 distinct values.
low_cardinality_cols = []
for name in object_cols:
    if data[name].nunique() < 10:
        low_cardinality_cols.append(name)
# low_cardinality_cols.append("Make")
print("Low cardinality col:")
print (low_cardinality_cols)

Categorical variables:
['CARNAME', 'Make', 'Model', 'Body color', 'Interior color', 'Interior material', 'Body', 'Doors', 'Fuel', 'Transmission', 'Drive type', 'Emission class', 'First registration', 'Condition', 'Tags']
Low cardinality col:
['Interior color', 'Interior material', 'Doors', 'Fuel', 'Transmission', 'Drive type', 'Emission class', 'Condition']


### Find tags


- Because the `Tags` column holds multiple values per row, we have to choose which tags (special features) to one-hot encode and feed to the model.

In [24]:
from scipy.stats import pointbiserialr
import pandas as pd

def correlation_ratio(df, dummies, target):
    """Rank dummy columns by their point-biserial correlation with the target.

    Args:
        df: DataFrame containing the `target` column.
        dummies: DataFrame of indicator columns, row-aligned with `df`.
        target: Name of the column in `df` to correlate against.

    Returns:
        A list of `(dummy_name, correlation)` tuples for the dummies whose
        p-value is below 0.05, ordered by decreasing absolute correlation.
        The same list is also printed.
    """
    # Compute every correlation first, then filter on significance.
    results = []
    for name in dummies.columns:
        coef, p_value = pointbiserialr(df[target], dummies[name])
        results.append((name, coef, p_value))

    significant = [(name, coef) for name, coef, p_value in results if p_value < 0.05]
    # list.sort is stable, so ties keep the original column order.
    significant.sort(key=lambda pair: abs(pair[1]), reverse=True)
    print(significant)
    return significant

def one_hot(df, col, target, multi=False):
    """Replace column `col` of `df` with indicator (dummy) columns.

    Args:
        df: Input DataFrame; it is not modified.
        col: Name of the categorical column to encode.
        target: Name of the target column, used only to rank dummies when
            more than 10 of them are produced.
        multi: If True, each cell is treated as a '; '-separated list of
            values and the dummies are unprefixed; otherwise one prefixed
            dummy is created per distinct value.

    Returns:
        A new DataFrame with the dummies appended and `col` dropped. When
        more than 10 dummies are produced, only those with a significant
        correlation whose absolute value exceeds 0.25 are kept (at most 10).
    """
    if multi:
        dummies = df[col].str.get_dummies(sep='; ')
    else:
        dummies = pd.get_dummies(df[col], prefix=col)

    if dummies.shape[1] > 10:
        # Too many indicators: keep only the ones most correlated with the target.
        ranked = correlation_ratio(df, dummies, target)
        strong = [name for name, coef in ranked if abs(coef) > 0.25]
        dummies = dummies[strong[:10]]

    combined = pd.concat([df, dummies], axis=1)
    return combined.drop(col, axis=1)

# Single-valued categorical columns, one-hot encoded one after another against
# the 'Price(EUR)' target.
# NOTE: this cell is not re-runnable on its own; one_hot drops each source
# column, so a second run fails to find them in `df`.
cols = ['Make', 'Body color', 'Interior color', 'Interior material', 'Body', 'Doors', 'Fuel', 'Transmission', 'Drive type', 'Emission class', 'Condition']
for col in cols:
    # one_hot returns a new frame, so `df` is rebound on every iteration.
    df = one_hot(df, col, 'Price(EUR)')

# `Tags` is encoded with multi=True, i.e. split on '; ' into unprefixed dummies.
df = one_hot(df, 'Tags', 'Price(EUR)', multi=True)


[('Make_Ferrari', 0.4354419438629574), ('Make_Rolls-Royce', 0.4024320949147529), ('Make_Lamborghini', 0.27881078822231), ('Make_Porsche', 0.19303754491641367), ('Make_Mercedes-Benz', 0.17069470520853333), ('Make_BMW', 0.1519367849233362), ('Make_Aston Martin', 0.12180189228198182), ('Make_Audi', 0.10613530836413072), ('Make_Land Rover', 0.10510441252087759), ('Make_Bentley', 0.10503455557759832), ('Make_Citroen', -0.10210411606065636), ('Make_Peugeot', -0.10103701599768346), ('Make_Renault', -0.09138514183609876), ('Make_Opel', -0.09032855708785929), ('Make_Dacia', -0.08108570191159956), ('Make_Fiat', -0.0715703720142489), ('Make_Seat', -0.061670643965112786), ('Make_Skoda', -0.05375512387369492), ('Make_Toyota', -0.05212669978922351), ('Make_Nissan', -0.05026710332160921), ('Make_Volkswagen', -0.04699721883959448), ('Make_Maserati', 0.046887457455933104), ('Make_Subaru', 0.042616055538523344)]
[('Body color_Green', 0.12243359557438337), ('Body color_Black', 0.11577537380802037), ('Bod

In [25]:


# # Draw heatmap
# plt.figure(figsize=(10, 8)) # Set the figure size
# sns.heatmap(corre.astype(float), annot=True, fmt=".2f") # Draw the heatmap
# plt.show()

## Split data

- To train and test a model:
    - First, we split our data into the `X_train`, `X_valid`, `y_train` and `y_valid` datasets.
    - Second, we have to handle the categorical features with one-hot encoding.

In [26]:
# Remove the two remaining free-text columns, then display what is left.
df = df.drop(columns=['CARNAME', 'Model'])
df.columns

Index(['Seats', 'Power(kW)', 'CO2 emissions(g/km)', 'Mileage(km)',
       'Consumption(l/100km or kWh/100km)', 'Price(EUR)',
       'Engine capacity(ccm)', 'Previous owners', 'Year', 'Make_Ferrari',
       'Make_Rolls-Royce', 'Make_Lamborghini', 'Interior color_Beige interior',
       'Interior color_Black interior', 'Interior color_Brown interior',
       'Interior color_Grey interior', 'Interior color_Other interior color',
       'Interior material_Alcantara interior',
       'Interior material_Cloth interior',
       'Interior material_Full leather interior',
       'Interior material_Other interior material',
       'Interior material_Part leather interior',
       'Interior material_Velour interior', 'Body_Cabriolet', 'Body_Cargo VAN',
       'Body_Coupe', 'Body_Hatchback', 'Body_MPV', 'Body_MPV/VAN',
       'Body_Pick-up', 'Body_SUV / offroad', 'Body_Sedans / saloons',
       'Body_Station Wagon', 'Doors_2/3 doors', 'Doors_4/5 doors',
       'Doors_6/7 doors', 'Fuel_CNG', 'Fuel_

In [27]:
# Separate the regression target from the feature matrix.
y = df["Price(EUR)"].copy()
X = df.drop(columns=["Price(EUR)"])

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

In [28]:
# from sklearn.preprocessing import OneHotEncoder

# OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
# OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))

# OH_cols_train.index = X_train.index
# OH_cols_valid.index = X_valid.index

# num_X_train = X_train.drop(object_cols, axis=1)
# num_X_valid = X_valid.drop(object_cols, axis=1)

# OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
# OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
# OH_X_train.columns = OH_X_train.columns.astype(str)
# OH_X_valid.columns = OH_X_valid.columns.astype(str)

## Train model, Test and make Prediction

In [29]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training split and score it on validation.

    Args:
        X_train: Training features.
        X_valid: Validation features.
        y_train: Training target values.
        y_valid: Validation target values.

    Returns:
        The mean absolute error of the predictions on `X_valid`.
    """
    forest = RandomForestRegressor(n_estimators=850, random_state=42)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

# Report the validation mean absolute error; the target is the 'Price(EUR)' column.
print(score_dataset(X_train, X_valid, y_train, y_valid))

5412.665882870948
