# Car Model | Final Project
## Bachelor's degree in Economics

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Path of the Excel workbook read below, relative to the working directory.
car_model_fipe_xlsx = './car_model_fipe.xlsx'

# Load the workbook into a DataFrame using the openpyxl engine.
df_car_model_fipe = pd.read_excel(car_model_fipe_xlsx, engine='openpyxl')
# Last expression of the cell: shows the first rows as the cell output.
df_car_model_fipe.head()

In [None]:
def calculate_average_mileage(df):
    """Return the per-row rounded mean of the five mileage columns.

    Reads ``mileage_1`` .. ``mileage_5`` from ``df``. Entries equal to
    'No information', and anything else that cannot be parsed as a number,
    are ignored.

    Returns a Series aligned with ``df``'s index whose values are:
      * the rounded mean of the row's numeric readings,
      * 1 when that rounded mean is 0,
      * the string 'No information' when the row has no numeric reading.
    """
    mileage_columns = [f'mileage_{position}' for position in range(1, 6)]

    def _row_average(readings):
        valid = pd.to_numeric(readings, errors='coerce').dropna()
        if valid.empty:
            return 'No information'
        # A rounded mean of 0 is reported as 1.
        return round(valid.mean()) or 1

    cleaned = df[mileage_columns].replace('No information', np.nan)
    return cleaned.apply(_row_average, axis=1)

def add_average_mileage_to_df(df_car_model_fipe):
    """Return a new frame where the five mileage columns become 'average_mileage'.

    The input frame is left untouched: the average is computed from it, the
    ``mileage_1`` .. ``mileage_5`` columns are dropped into a new frame, and
    the ``average_mileage`` column is added to that new frame only.

    Parameters:
        df_car_model_fipe: DataFrame containing ``mileage_1`` .. ``mileage_5``.

    Returns:
        A new DataFrame without the five mileage columns and with an
        ``average_mileage`` column (see ``calculate_average_mileage``).

    Raises:
        KeyError: if any of the five mileage columns is missing.
    """
    mileage_columns = ['mileage_1', 'mileage_2', 'mileage_3', 'mileage_4', 'mileage_5']
    average_mileage = calculate_average_mileage(df_car_model_fipe)
    # drop() returns a new DataFrame, so the assignment below does not
    # modify the frame the caller passed in.
    result = df_car_model_fipe.drop(columns=mileage_columns)
    result['average_mileage'] = average_mileage
    return result

# Replace the five mileage columns with a single 'average_mileage' column.
df_car_model_fipe = add_average_mileage_to_df(df_car_model_fipe)

In [None]:
from sklearn.model_selection import train_test_split

# Features: every column except the target; target: 'average_fipe_price'.
X = df_car_model_fipe.drop(columns=['average_fipe_price'])
y = df_car_model_fipe['average_fipe_price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# DataFrame.info() writes its summary itself and returns None, so it is
# called directly; wrapping it in print() emitted an extra "None" line.
X_train.info()

In [None]:
# Columns handed to CatBoost as categorical features.
# NOTE(review): 'average_mileage' is derived from numeric readings but is
# treated as categorical here — confirm this is intended.
cat_features = ['model', 'year_model', 'capacity_model', 'power_model', 'doors_model', 'average_mileage']

# Cast all categorical columns in one step; astype() returns a new frame, so
# nothing is assigned column-by-column into the frames produced by the split.
category_dtypes = {col: 'category' for col in cat_features}
X_train = X_train.astype(category_dtypes)
X_test = X_test.astype(category_dtypes)

# DataFrame.info() prints its summary itself and returns None; calling it
# directly avoids the stray "None" that print(X_train.info()) produced.
X_train.info()

In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: 2 x 2 x 2 = 8 combinations.
param_grid = dict(
    iterations=[100, 200],
    learning_rate=[0.05, 0.1],
    depth=[4, 6],
)

# Base regressor; cat_features comes from the preceding cell.
catboost = CatBoostRegressor(cat_features=cat_features, verbose=0, random_state=42)

# 5-fold search scored by negative MAE; error_score='raise' makes a failing
# fit raise instead of being recorded as a score.
grid_search = GridSearchCV(
    estimator=catboost,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    error_score='raise',
    verbose=2,
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(f"Better parameters: {grid_search.best_params_}")

# Evaluate the selected model on the held-out split.
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE of CatBoost: {mae}")

In [None]:
# Same categorical feature list as used for the train/test frames.
cat_features = ['model', 'year_model', 'capacity_model', 'power_model', 'doors_model', 'average_mileage']

# Cast the categorical columns of the full data set in a single astype() call.
df_car_model_fipe = df_car_model_fipe.astype({col: 'category' for col in cat_features})

# DataFrame.info() prints its summary itself and returns None; calling it
# directly avoids the stray "None" that print(df.info()) produced.
df_car_model_fipe.info()

In [None]:
# Feature columns the model is asked to price.
cols = ['model', 'year_model', 'capacity_model', 'power_model', 'doors_model', 'average_mileage']
df_car_model_fipe_predict = df_car_model_fipe[cols].copy()

# Predict while the frame still holds only the feature columns, so the target
# ('average_fipe_price') is never part of the data handed to the model.
expected_price_ai = best_model.predict(df_car_model_fipe_predict)

# Attach the reference price and the prediction; the resulting column order
# is: features, average_fipe_price, expected_price_ai.
df_car_model_fipe_predict['average_fipe_price'] = df_car_model_fipe['average_fipe_price']
df_car_model_fipe_predict['expected_price_ai'] = expected_price_ai

print("Simulated ads with price suggested by AI:")
print(df_car_model_fipe_predict)

In [None]:
def rate_fipe_price(expected_price_ai, average_fipe_price):
    """Classify how far the AI price is from the FIPE price.

    The distance is the absolute difference expressed as a percentage of
    ``average_fipe_price``.

    Returns:
        "Fair" for a distance of at most 10%, "Moderate" for at most 20%,
        and "Expensive" otherwise.
    """
    gap_pct = abs(expected_price_ai - average_fipe_price) / average_fipe_price * 100
    if gap_pct <= 10:
        return "Fair"  # 🟢
    if gap_pct <= 20:
        return "Moderate"  # 🟡
    return "Expensive"  # 🔴

# Rate every row by the distance between the AI price and the FIPE price.
df_car_model_fipe_predict['lighthouse_fipe'] = df_car_model_fipe_predict.apply(
    lambda row: rate_fipe_price(row['expected_price_ai'], row['average_fipe_price']),
    axis=1,
)

# Simulated anchor price: the AI price marked up by 20%.
df_car_model_fipe_predict['anchor_price_ai'] = df_car_model_fipe_predict['expected_price_ai'].mul(1.20)

def rate_fipe_anchor_price(anchor_price_ai, expected_price_ai):
    """Classify how far the anchor price is from the AI price.

    The distance is the absolute difference expressed as a percentage of
    ``expected_price_ai``.

    Returns:
        "Fair" for a distance of at most 10%, "Moderate" for at most 20%,
        and "Expensive" otherwise.
    """
    markup_pct = abs(anchor_price_ai - expected_price_ai) / expected_price_ai * 100
    # Upper bound (inclusive) of each band, checked in ascending order:
    # Fair 🟢, Moderate 🟡; anything above the last bound is Expensive 🔴.
    for ceiling, label in ((10, "Fair"), (20, "Moderate")):
        if markup_pct <= ceiling:
            return label
    return "Expensive"

# Rate every row by the distance between the anchor price and the AI price.
# NOTE(review): 'anchor_price_ai' is built in this notebook as
# expected_price_ai * 1.20, so the distance is ~20% for every row and sits on
# the Moderate/Expensive boundary; floating-point rounding may decide the
# label — confirm this is the intended comparison.
df_car_model_fipe_predict['lighthouse_anchor_ai'] = df_car_model_fipe_predict.apply(
    lambda row: rate_fipe_anchor_price(row['anchor_price_ai'], row['expected_price_ai']),
    axis=1,
)

print("\nTraffic light ads:")
print(df_car_model_fipe_predict)

In [None]:
# df_statistics is another name for df_car_model_fipe_predict (no copy), so
# the detour columns added below also appear in df_car_model_fipe_predict.
df_statistics = df_car_model_fipe_predict

# Absolute distance of the AI price from the FIPE price and from the anchor
# price. The anchor column created in this notebook is 'anchor_price_ai';
# the previous name 'anchor_price_fipe' does not exist and raised KeyError.
df_statistics['detour_fipe'] = abs(df_statistics['expected_price_ai'] - df_statistics['average_fipe_price'])
df_statistics['detour_anchor'] = abs(df_statistics['expected_price_ai'] - df_statistics['anchor_price_ai'])

# The same distances as a percentage of the respective reference price.
# NOTE(review): with anchor_price_ai defined as expected_price_ai * 1.20, the
# anchor percentage is roughly 16.67% for every row — confirm the anchor
# definition is what the analysis intends.
df_statistics['detour_percent_fipe'] = (df_statistics['detour_fipe'] / df_statistics['average_fipe_price']) * 100
df_statistics['detour_percent_anchor'] = (df_statistics['detour_anchor'] / df_statistics['anchor_price_ai']) * 100

average_detour_fipe = df_statistics['detour_percent_fipe'].mean()
average_detour_anchor = df_statistics['detour_percent_anchor'].mean()

# Report both averages and which reference the AI price is closer to.
if average_detour_fipe < average_detour_anchor:
    print(f"Average to detour percent (IA, FIPE): {average_detour_fipe:.2f}%")
    print(f"Average to detour percent (IA, Anchor): {average_detour_anchor:.2f}%")
    print()
    print("The detour from FIPE is smaller than the detour from the anchor price.")
    print("AI is less influenced by price anchoring bias and provides a price estimate closer to the 'real' market value (FIPE).")
else:
    print(f"Average to detour percent (IA, FIPE): {average_detour_fipe:.2f}%")
    print(f"Average to detour percent (IA, Âncora): {average_detour_anchor:.2f}%")
    print()
    print("The detour from FIPE is no less than the detour from the anchor price.")
    print("AI may be more influenced by price anchoring bias.")

# Side-by-side distribution of the two percentage detours.
sns.boxplot(data=df_statistics[['detour_percent_fipe', 'detour_percent_anchor']])
plt.show()

In [None]:
print("Importance of the Model characteristics for consumer research")
print()

# Raw importances from the fitted model, paired with the training columns.
feature_importances = best_model.get_feature_importance()
feature_names = X_train.columns

# Normalise to percentages of the total and rank from most to least important.
importance_share = feature_importances / feature_importances.sum() * 100
importance_df = pd.DataFrame(
    {'Features': feature_names, 'Importance (%)': importance_share}
).sort_values(by='Importance (%)', ascending=False)

print(importance_df)

# Horizontal bar chart; the y axis is inverted so the top-ranked feature is
# drawn first from the top.
fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(importance_df['Features'], importance_df['Importance (%)'], color='skyblue')
ax.set_xlabel('Importance (%)')
ax.set_ylabel('Features')
ax.set_title('Importance of the Features in the Model')
ax.invert_yaxis()
plt.show()

In [None]:
# Write the prediction frame, without its index, to an Excel workbook in the
# working directory using the openpyxl engine.
df_car_model_fipe_predict.to_excel("car_model_AI_XAI.xlsx", index=False, engine='openpyxl')

print("Extraction completed. File saved.")