In [55]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [56]:
file_data = r'C:\Users\TP\Desktop\CarPricePredictorInTunisia\Data\CleanedData2.csv'

df = pd.read_csv(file_data)

In [57]:
print(df)

           Brand         Model  Price  Boite vitesse  Cylindrée  \
0     Alfa_Romeo        Giulia  198.0           True      1.995   
1     Alfa_Romeo       Stelvio  265.0           True      1.995   
2           Audi  A3_Sportback  145.0           True      1.395   
3           Audi    A3_Berline  149.0           True      1.395   
4           Audi            A4  198.0           True      1.984   
...          ...           ...    ...            ...        ...   
4849   Ssangyong         Kyron   38.0           True      3.000   
4850      Autres        Autres   50.0          False      1.600   
4851     Peugeot           206   18.0          False      2.400   
4852     Citroen            C4   35.0          False      1.600   
4853  Mitsubishi        Canter   40.0          False      1.200   

      Puissance fiscale  Kilométrage   Age  Essence  Diesel  Electric  
0                    11          0.0   0.0     True   False     False  
1                    17          0.0   0.0     True

In [58]:
brand_model_counts = df.groupby("Brand")["Model"].nunique()

brands_to_keep = brand_model_counts[brand_model_counts >= 9].index

df = df[df["Brand"].isin(brands_to_keep)]

In [59]:
# Step 1: Encode the 'Brand' column with unique integers (same as before)
brand_mapping = {brand: idx + 1 for idx, brand in enumerate(df['Brand'].unique())}
df['Brand_encoded'] = df['Brand'].map(brand_mapping)
print(brand_mapping)
print("-------")

# Step 2: Encode the 'Model' column with a hierarchical format (e.g., 1.1, 1.2, etc.)
model_mapping = {}
for brand in df['Brand'].unique():
    unique_models = df[df['Brand'] == brand]['Model'].unique()
    model_mapping_for_brand = {model: f"{brand_mapping[brand]}.{idx + 1}" for idx, model in enumerate(unique_models)}
    model_mapping[brand] = model_mapping_for_brand
print(model_mapping)
print("-------")

# Apply the model mapping to the dataframe
df['Model_encoded'] = df.apply(lambda row: model_mapping[row['Brand']][row['Model']], axis=1)

df2=df
df = df.drop('Brand', axis=1)
df = df.drop('Model', axis=1)

# Display the DataFrame with encoded columns
print(df)

{'Audi': 1, 'Bmw': 2, 'Chery': 3, 'Chevrolet': 4, 'Citroen': 5, 'Dfsk': 6, 'Fiat': 7, 'Ford': 8, 'Geely': 9, 'Honda': 10, 'Hyundai': 11, 'Jaguar': 12, 'Kia': 13, 'Land_Rover': 14, 'Mercedes_Benz': 15, 'Mg': 16, 'Mitsubishi': 17, 'Nissan': 18, 'Opel': 19, 'Peugeot': 20, 'Renault': 21, 'Seat': 22, 'Skoda': 23, 'Ssangyong': 24, 'Suzuki': 25, 'Toyota': 26, 'Volkswagen': 27, 'Citroën': 28, 'Mazda': 29}
-------
{'Audi': {'A3_Sportback': '1.1', 'A3_Berline': '1.2', 'A4': '1.3', 'A6': '1.4', 'Q2': '1.5', 'Q3': '1.6', 'Q3_Sportback': '1.7', 'Q7': '1.8', 'A5_Sportback': '1.9', 'A5_Coupé': '1.10', 'A7_Sportback': '1.11', 'Q5': '1.12', 'Q4_E_Tron': '1.13', 'A5_Cabriolet': '1.14', 'Q5_Sportback': '1.15', 'Q8': '1.16', 'A3': '1.17', 'Rs5': '1.18', 'A1_Sportback': '1.19', 'A5': '1.20', 'A1': '1.21'}, 'Bmw': {'Serie_1_5p': '2.1', 'Serie_2_Gran_Coupe': '2.2', 'Serie_3': '2.3', 'Serie_4_Gran_Coupe': '2.4', 'Serie_4_Coupe': '2.5', 'Serie_5': '2.6', 'Serie_7': '2.7', 'X1': '2.8', 'X1_Hybride': '2.9', 'X2'

In [60]:


# Re-define X and Y after cleaning the data
X = df.drop(columns=['Price'])
Y = df['Price']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

In [61]:
! pip install xgboost



In [62]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

scaler = StandardScaler()

# Scale the features
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize XGBoost regressor
model = XGBRegressor(
    n_estimators=100,  # Number of trees
    max_depth=3,       # Depth of trees
    learning_rate=0.1, # Step size for updates
    subsample=0.8,     # Fraction of samples used for training
    colsample_bytree=0.8,  # Fraction of features used for each tree
    random_state=42    # Reproducibility
)

# Train the model
model.fit(X_train_scaled, Y_train)

# Predict on the test set
Y_pred = model.predict(X_test_scaled)

# Calculate metrics
mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, Y_pred)

print(f"XGBoost Model")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.2f}")
print("-----------")


XGBoost Model
Mean Absolute Error (MAE): 17.69
Mean Squared Error (MSE): 2259.19
R-squared (R²): 0.67
-----------


In [63]:
# Reverse the brand and model mappings for decoding
brand_reverse_mapping = {v: k for k, v in brand_mapping.items()}
model_reverse_mapping = {f"{brand_mapping[brand]}.{idx + 1}": model for brand in df2['Brand'].unique() for idx, model in enumerate(df2[df2['Brand'] == brand]['Model'].unique())}

# Before scaling and transforming, save the indices of the original X_test
X_test_indices = X_test.index

# Use the transformed data for predictions
for i in range(len(Y_pred)):
    diff = abs(Y_test.iloc[i] - Y_pred[i])

    # Only print when the difference is greater than 10
    if diff > 10:
        # Decode the Brand and Model from the encoded values
        brand_name = brand_reverse_mapping[X_test.loc[X_test_indices[i], 'Brand_encoded']]
        model_name = model_reverse_mapping[str(X_test.loc[X_test_indices[i], 'Model_encoded'])]

        car_features = X_test.loc[X_test_indices[i]].drop(['Brand_encoded', 'Model_encoded'])
        # Flatten the car features
        car_info = ", ".join([f"{col}: {value}" for col, value in car_features.items()])

        print(f"The car is: Brand - {brand_name}, Model - {model_name}")  # Display decoded brand and model
        print(f"Features: {car_info}")
        print(f"Predicted price: {Y_pred[i]} | Real price: {Y_test.iloc[i]} | diff: {diff}")
        print("-------")


The car is: Brand - Peugeot, Model - 508
Features: Boite vitesse: True, Cylindrée: 2.2, Puissance fiscale: 12, Kilométrage: 180.0, Age: 12.0, Essence: False, Diesel: True, Electric: False
Predicted price: 70.16060638427734 | Real price: 42.0 | diff: 28.160606384277344
-------
The car is: Brand - Toyota, Model - Yaris_Cross
Features: Boite vitesse: True, Cylindrée: 1.49, Puissance fiscale: 5, Kilométrage: 0.0, Age: 0.0, Essence: True, Diesel: False, Electric: True
Predicted price: 100.1814193725586 | Real price: 119.9 | diff: 19.718580627441412
-------
The car is: Brand - Toyota, Model - Yaris_Verso
Features: Boite vitesse: False, Cylindrée: 1.0, Puissance fiscale: 4, Kilométrage: 200.0, Age: 6.0, Essence: True, Diesel: False, Electric: False
Predicted price: 43.03805923461914 | Real price: 31.5 | diff: 11.53805923461914
-------
The car is: Brand - Land_Rover, Model - Range_Rover_Sport
Features: Boite vitesse: True, Cylindrée: 3.0, Puissance fiscale: 24, Kilométrage: 96.0, Age: 8.0, Ess