In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call in the download loop.
CHUNK_SIZE = 40960
# NOTE(review): the mapping below embeds a pre-signed storage URL (X-Goog-Date=20240606,
# X-Goog-Expires=259200 seconds); the download is expected to fail once that window has passed.
DATA_SOURCE_MAPPING = 'dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5157360%2F8616644%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240606%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240606T111922Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1ca4d42356572fbefeb15d2cc37d1e95d8201d749098f4517ed9b14ede24f8154fd580937db2fc126103fc2dce2e0e35176b65c97424f97af8b6fe04ad6b94091a7cb414d6ec7f6e5595821307be106aa935a36553a6e0108553e3172e073499c94421037257071a8b38e075edff457920d4f5aed0c40f5e9df9e746d296e532e0cf8ff03d70d77f0a550caf365a50e6d910609f959aa4f41173e1dd75da93de70302487c6001bbce6756d1b1b0d00ad6955ffb2b33768f1ced92b4a1883f4dfa8f2931e69eba5e21697fbde50f30795503606c0f45dd2eb45421a0c7076bdf1bea21c32110d75b873c29ce0450417f801bd5e5b216626b1194fb978bdb3c19b'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry and unpack it under KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split only once: everything after the first ':' belongs to the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The header may be missing; in that case the bar stays empty instead of
            # crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name, so buffered bytes must be on disk first.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Do not bind the archive to the name `tarfile`: that would replace the
                # module and break the next non-zip data source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import numpy as np # linear algebra
import pandas as pd


import os
# Print the full path of every file found below the input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the dataset CSV from the Kaggle input directory into `df`.
df = pd.read_csv("/kaggle/input/dataset/clean_fused_dataset (2).csv")
# 'mark', 'model', 'price', 'isofix', 'led', 'cruise_control', 'bluetooth'
# Preview the first rows.
df.head()

In [None]:
# Drop the leftover CSV index column and the "isautoscout" column in one step.
# errors="ignore" keeps the cell re-runnable: a second execution no longer raises
# KeyError because the columns are already gone.
df = df.drop(columns=["Unnamed: 0", "isautoscout"], errors="ignore")

In [None]:
# Show the column labels currently in `df`.
df.columns

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


# Integer-encode every non-numeric column on a copy of df, then separate the target.
label_encoder = LabelEncoder()
categorical_features = list(df.select_dtypes(exclude=['number']).columns)

X = df.copy()
for column_name in categorical_features:
    # The single encoder is refitted per column; only the encoded values are kept.
    X[column_name] = label_encoder.fit_transform(X[column_name])

# "price" is the regression target; every other column (description included) stays a feature.
y = X["price"]
X = X.drop(columns=["price"])
X.head()

In [None]:
# Number of distinct description values (missing values counted as a value, as unique() does).
df["description"].nunique(dropna=False)

In [None]:
# Print the summary pandas produces for `df`.
df.info()

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
# rf_regressor.fit(X_train, y_train)
# feature_importances = rf_regressor.feature_importances_
# feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
# feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
# print(feature_importance_df)
# y_pred = rf_regressor.predict(X_test)
# mse = mean_absolute_error(y_test, y_pred)
# print(f"Mean Squared Error: {mse}")

# My features: mark, model, price, fuel, isofix, led, cruise_control, bluetooth.

## mark

In [None]:
import matplotlib.pyplot as plt
# Share of rows per brand.
# value_counts() orders brands by frequency, so the wedge labels must come from its own
# index; df['mark'].unique() is in first-appearance order and mislabels the wedges.
category_counts = df['mark'].value_counts()
plt.figure(figsize=(8, 8))  # Optional: Adjust the figure size
plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', startangle=140)
plt.xlabel('Brands')
plt.ylabel('Number of Cars')
plt.title('Number of Rows for Each Brand')
plt.show()

## Getting the brands under 1% of the dataset

In [None]:
# Percentage of the dataset held by each brand, restricted to brands below 1%.
category_counts = df['mark'].value_counts()
rare_brand_mask = (category_counts / df.shape[0] * 100) < 1
category_counts[rare_brand_mask] / df.shape[0] * 100

In [None]:
# Names of the brands that account for less than 1% of the rows.
minority_brands = category_counts.index[(category_counts / df.shape[0] * 100) < 1].tolist()

## Grouping the minority brands into a category called 'OTHERS'

In [None]:
# Relabel every minority brand as "OTHERS" with a single boolean-mask assignment.
df.loc[df["mark"].isin(minority_brands), "mark"] = "OTHERS"

In [None]:
import matplotlib.pyplot as plt
# Share of rows per brand after grouping the minority brands.
# Labels are taken from category_counts.index so each wedge carries the brand it
# actually represents (df['mark'].unique() is ordered differently from value_counts()).
category_counts = df['mark'].value_counts()
plt.figure(figsize=(8, 8))  # Optional: Adjust the figure size
plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', startangle=140)
plt.xlabel('Brands')
plt.ylabel('Number of Cars')
plt.title('Number of Rows for Each Brand')
plt.show()

## Now we try to undersample the three brands "RENAULT", "CITROEN" and "PEUGEOT"

In [None]:
# Randomly remove half of the rows of each over-represented brand.
# One loop replaces three copy-pasted cells whose `renault_*` variables also held the
# CITROEN and PEUGEOT indices; rebinding df (instead of inplace=True) leaves any other
# reference to the previous frame untouched.
for brand in ("RENAULT", "CITROEN", "PEUGEOT"):
    brand_indexes = df.index[df["mark"] == brand]
    dropped_indexes = pd.Series(brand_indexes).sample(frac=0.5, random_state=42)
    df = df.drop(dropped_indexes)

## We use oversampling for the brands under 5%

In [None]:
# Recompute the brand counts on the current df: the previous category_counts was taken
# before undersampling, so dividing it by the new row count gave wrong percentages.
category_counts = df['mark'].value_counts()
brand_share_pct = category_counts / df.shape[0] * 100

# Brands below 3%, and brands in [3%, 5%). ">= 3" keeps a share of exactly 3 from
# falling between the two groups.
minority_brands_under_3 = brand_share_pct[brand_share_pct < 3].index.tolist()
minority_brands_under_5 = brand_share_pct[(brand_share_pct >= 3) & (brand_share_pct < 5)].index.tolist()

In [None]:
# Triple every brand below 3% by appending two extra copies of its rows.
# The copies are collected first and appended with one concat (same resulting row order
# as appending brand by brand) instead of rebuilding df on every iteration.
extra_frames = []
for b in minority_brands_under_3:
    brand_df = df[df["mark"] == b]
    extra_frames += [brand_df, brand_df]
if extra_frames:
    df = pd.concat([df, *extra_frames], axis=0, ignore_index=True)

In [None]:
# Double every brand in the 3-5% group by appending one extra copy of its rows.
# A single concat keeps the row order of the per-brand loop without rebuilding df each time.
extra_frames = [df[df["mark"] == b] for b in minority_brands_under_5]
if extra_frames:
    df = pd.concat([df, *extra_frames], axis=0, ignore_index=True)

## We see that the brands are far more balanced now

In [None]:
import matplotlib.pyplot as plt
# Share of rows per brand after resampling.
# Wedge labels come from category_counts.index; df['mark'].unique() follows
# first-appearance order and would pair the percentages with the wrong brands.
category_counts = df['mark'].value_counts()
plt.figure(figsize=(8, 8))
plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', startangle=140)
plt.xlabel('Brands')
plt.ylabel('Number of Cars')
plt.title('Number of Rows for Each Brand')
plt.show()

## model:

### This is the number of models for each brand

In [None]:
# Count the distinct "model" values within each "mark" group and print the result.
model_counts_by_brand = df.groupby('mark')['model'].nunique()
print(model_counts_by_brand)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart: number of distinct models per brand.
model_counts_by_brand = df.groupby('mark')['model'].nunique().reset_index()
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=model_counts_by_brand, x='mark', y='model', palette='viridis', ax=ax)
ax.tick_params(axis='x', labelrotation=90)  # vertical brand names stay readable
ax.set_xlabel('Brand')
ax.set_ylabel('Number of Models')
ax.set_title('Number of Models for Each Brand')
fig.tight_layout()
plt.show()


## We are interested in "DACIA": as we can see, it has the second-highest number of samples but the fewest unique model values. This indicates that each of its models is repeated many times. Let's find out.


In [None]:
# Percentage of DACIA rows held by each DACIA model.
dacia_models = df.loc[df["mark"] == "DACIA", "model"]
dacia_models.value_counts() / len(dacia_models) * 100


In [None]:
# Pie chart of the model distribution inside the DACIA brand (percent of DACIA rows).
dacia_models = df.loc[df["mark"] == "DACIA", "model"]
dacia_model_distribution = dacia_models.value_counts() / len(dacia_models) * 100
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(dacia_model_distribution, labels=dacia_model_distribution.index, autopct='%1.1f%%', startangle=140)
ax.set_title('Model Distribution for DACIA')
ax.axis('equal')  # keep the pie circular
plt.show()

## We see that the model "Sandero" and "Duster" are overrepresented
## We can use undersampling technique

In [None]:
# Randomly remove 80% of the DACIA "Sandero" rows.
# The brand condition keeps the step limited to DACIA, which is what this section
# analyses; filtering on the model name alone would also hit any other brand that
# uses the same model name.
sandero_indices = df[(df["mark"] == "DACIA") & (df["model"] == "Sandero")].index.tolist()
random_sandero_indices = pd.Series(sandero_indices).sample(frac=0.8, random_state=42)
df = df.drop(random_sandero_indices)

In [None]:
# Randomly remove 80% of the DACIA "Duster" rows.
# The brand condition keeps the step limited to DACIA, which is what this section
# analyses; filtering on the model name alone would also hit any other brand that
# uses the same model name.
duster_indices = df[(df["mark"] == "DACIA") & (df["model"] == "Duster")].index.tolist()
random_duster_indices = pd.Series(duster_indices).sample(frac=0.8, random_state=42)
df = df.drop(random_duster_indices)

In [None]:
# Same DACIA model pie chart, redrawn on the current state of df.
dacia_mask = df["mark"] == "DACIA"
dacia_model_distribution = df[dacia_mask]["model"].value_counts() / len(df[dacia_mask]) * 100
dacia_figure, dacia_axes = plt.subplots(figsize=(8, 8))
dacia_axes.pie(
    dacia_model_distribution,
    labels=dacia_model_distribution.index,
    autopct='%1.1f%%',
    startangle=140,
)
dacia_axes.set_title('Model Distribution for DACIA')
dacia_axes.axis('equal')  # keep the pie circular
plt.show()


## We can oversample the models under 10%

In [None]:
# Duplicate the rows of every DACIA model whose share within DACIA is below 10%.
dacia_rows = df[df["mark"] == "DACIA"]
models_pourcentages = dacia_rows["model"].value_counts() / len(dacia_rows) * 100
under_10_models = models_pourcentages[models_pourcentages < 10].index.tolist()

# Copies are taken from the DACIA rows only (a bare model-name filter would also
# duplicate same-named models of other brands) and appended with one concat, in the
# same per-model order as the original loop.
extra_frames = [dacia_rows[dacia_rows["model"] == model] for model in under_10_models]
if extra_frames:
    df = pd.concat([df, *extra_frames], axis=0, ignore_index=True)

## We repeat the same oversampling step once more

In [None]:
# Second pass: recompute the DACIA model shares on the current df and duplicate the
# rows of every model that is still below 10%.
dacia_rows = df[df["mark"] == "DACIA"]
models_pourcentages = dacia_rows["model"].value_counts() / len(dacia_rows) * 100
under_10_models = models_pourcentages[models_pourcentages < 10].index.tolist()

# Copies are taken from the DACIA rows only (a bare model-name filter would also
# duplicate same-named models of other brands) and appended with one concat, in the
# same per-model order as the original loop.
extra_frames = [dacia_rows[dacia_rows["model"] == model] for model in under_10_models]
if extra_frames:
    df = pd.concat([df, *extra_frames], axis=0, ignore_index=True)

In [None]:
# DACIA model pie chart, redrawn on the current state of df.
is_dacia = df["mark"] == "DACIA"
dacia_model_distribution = df[is_dacia]["model"].value_counts() / len(df[is_dacia]) * 100
pie_figure, pie_axes = plt.subplots(figsize=(8, 8))
pie_axes.pie(
    dacia_model_distribution,
    labels=dacia_model_distribution.index,
    autopct='%1.1f%%',
    startangle=140,
)
pie_axes.set_title('Model Distribution for DACIA')
pie_axes.axis('equal')  # keep the pie circular
plt.show()


## The models are pretty balanced

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode all non-numeric columns of the rebalanced df on a copy.
label_encoder = LabelEncoder()
categorical_features = list(df.select_dtypes(exclude=['number']).columns)
X = df.copy()
# apply() hands each selected column to the shared encoder in turn, exactly like a
# column-by-column loop would.
X[categorical_features] = X[categorical_features].apply(label_encoder.fit_transform)

In [None]:
# Separate the target ("price") and discard the "description" column from the features.
y = X.pop("price")
X = X.drop(columns="description")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# 80/20 hold-out split with a fixed seed.
# NOTE(review): df was oversampled by duplicating rows before this split, so identical
# rows can end up in both the train and the test part; test scores are likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest (fit() returns the estimator itself).
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Rank the features by the importance the fitted forest assigns to them.
feature_importances = rf_regressor.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)

In [None]:
# Evaluate the forest on the hold-out set.
# The metric computed here is the mean ABSOLUTE error; the variable name and the
# printed label now say so (they previously claimed "Mean Squared Error").
y_pred = rf_regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Price

In [None]:
# Price of every row plotted against its index.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df.index, df['price'], color='blue', alpha=0.5)  # Scatter plot
ax.set_title('Scatter Plot of Price')
ax.set_xlabel('Index')
ax.set_ylabel('Price')
ax.grid(True)
plt.show()

## We can spot some outliers in the price

In [None]:
# Box plot of the price column.
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot(df['price'])
ax.set_title('Box Plot of Car Prices')
ax.set_ylabel('Price')
plt.show()


## The boxplot highlights a very high imbalance in the data

## Trying to filter the dataset from the outliers

In [None]:
# Keep only rows whose price lies within 1.5 * IQR of the quartiles (bounds inclusive).
Q1, Q3 = df['price'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
df_clean = df[df['price'].between(lower_bound, upper_bound)]

In [None]:
# Price against row index for the outlier-filtered frame.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df_clean.index, df_clean['price'], color='blue', alpha=0.5)
ax.set_title('Scatter Plot of Price')
ax.set_xlabel('Index')
ax.set_ylabel('Price')
ax.grid(True)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


# Encode the outlier-filtered data, fit a random forest and report importances and error.
label_encoder = LabelEncoder()
categorical_features = df_clean.select_dtypes(exclude=['number']).columns.tolist()
X = df_clean.copy()
for col in categorical_features:
    X[col] = label_encoder.fit_transform(X[col])
y = X["price"]
X = X.drop(columns=["price", "description"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Features ranked by the importance the fitted forest assigns to them.
feature_importances = rf_regressor.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

# The metric is the mean ABSOLUTE error; name and label now match what is computed
# (the label previously read "Mean Squared Error").
y_pred = rf_regressor.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Regression

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
from sklearn.metrics import r2_score

In [None]:
# Split X / y into train and test parts (20% test, seed 42) for the model comparison below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Column-oriented result tables: one list per column, one appended entry per evaluated model.
_RESULT_COLUMNS = ('Model', 'MSE Train', 'MAE Train', 'R2 Train', 'MSE Test', 'MAE Test', 'R2 Test')
# Each comprehension creates fresh, independent lists for every key.
results_without_scaler = {column: [] for column in _RESULT_COLUMNS}
results_with_scaler = {column: [] for column in _RESULT_COLUMNS}


In [None]:
def calculate_regression_metrics(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training data and score it on both splits.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test : feature sets used for fitting / evaluation.
    y_train, y_test : targets matching the feature sets.

    Returns
    -------
    tuple
        ``(mse_train, mae_train, r2_train, mse_test, mae_test, r2_test)``.
    """
    model.fit(X_train, y_train)

    scores = []
    # Same three metrics for each split, train first, then test.
    for features, target in ((X_train, y_train), (X_test, y_test)):
        predicted = model.predict(features)
        scores.append(mean_squared_error(target, predicted))
        scores.append(mean_absolute_error(target, predicted))
        scores.append(r2_score(target, predicted))
    return tuple(scores)

## Linear Regression :  

In [None]:
# Linear regression on the unscaled features.
linearRegression = Pipeline([('Linear Regression', LinearRegression())])
mse_train, mae_train, r2_train, mse_test, mae_test, r2_teste = calculate_regression_metrics(
    linearRegression, X_train, X_test, y_train, y_test
)

# Append one row to the unscaled results table.
linear_regression_row = {
    'Model': 'Linear Regression',
    'MSE Train': mse_train,
    'MAE Train': mae_train,
    'R2 Train': r2_train,
    'MSE Test': mse_test,
    'MAE Test': mae_test,
    'R2 Test': r2_teste,
}
for column_name, column_value in linear_regression_row.items():
    results_without_scaler[column_name].append(column_value)

In [None]:
# Linear regression preceded by feature standardisation.
linearRegression_withStandarisation = Pipeline([
    ('scaler', StandardScaler()),
    ('Linear Regression', LinearRegression()),
])
mse_train, mae_train, r2_train, mse_test, mae_test, r2_teste = calculate_regression_metrics(
    linearRegression_withStandarisation, X_train, X_test, y_train, y_test
)

# Append one row to the scaled results table.
scaled_linear_regression_row = {
    'Model': 'Linear Regression',
    'MSE Train': mse_train,
    'MAE Train': mae_train,
    'R2 Train': r2_train,
    'MSE Test': mse_test,
    'MAE Test': mae_test,
    'R2 Test': r2_teste,
}
for column_name, column_value in scaled_linear_regression_row.items():
    results_with_scaler[column_name].append(column_value)

## Decision Tree

In [None]:
# Decision tree on the unscaled features.
# random_state is fixed (42, as elsewhere in this notebook) so reruns give the same tree.
decisionTree = Pipeline([('Decision Tree', DecisionTreeRegressor(random_state=42))])
mse_train,mae_train,r2_train,mse_test,mae_test,r2_teste = calculate_regression_metrics(decisionTree, X_train, X_test, y_train, y_test)

# Append one row to the unscaled results table.
results_without_scaler['Model'].append('Decision Tree')
results_without_scaler['MSE Train'].append(mse_train)
results_without_scaler['MAE Train'].append(mae_train)
results_without_scaler['R2 Train'].append(r2_train)
results_without_scaler['MSE Test'].append(mse_test)
results_without_scaler['MAE Test'].append(mae_test)
results_without_scaler['R2 Test'].append(r2_teste)

In [None]:
# Decision tree preceded by feature standardisation.
# random_state is fixed (42, as elsewhere in this notebook) so reruns give the same tree.
decisionTree_with_scaler = Pipeline([('scaler', StandardScaler()),('Decision Tree', DecisionTreeRegressor(random_state=42))])
mse_train,mae_train,r2_train,mse_test,mae_test,r2_teste= calculate_regression_metrics(decisionTree_with_scaler, X_train, X_test, y_train, y_test)

# Append one row to the scaled results table.
results_with_scaler['Model'].append('Decision Tree')
results_with_scaler['MSE Train'].append(mse_train)
results_with_scaler['MAE Train'].append(mae_train)
results_with_scaler['R2 Train'].append(r2_train)
results_with_scaler['MSE Test'].append(mse_test)
results_with_scaler['MAE Test'].append(mae_test)
results_with_scaler['R2 Test'].append(r2_teste)


## Random Forest

In [None]:
# Random forest on the unscaled features (seeded for reproducible results).
randomForest = Pipeline([('Random Forest', RandomForestRegressor(random_state=42))])
# Evaluate the random-forest pipeline itself; the metrics were previously computed with
# decisionTree_with_scaler, so the "Random Forest" row held decision-tree scores.
mse_train,mae_train,r2_train,mse_test,mae_test,r2_teste= calculate_regression_metrics(randomForest, X_train, X_test, y_train, y_test)

# Append one row to the unscaled results table.
results_without_scaler['Model'].append('Random Forest')
results_without_scaler['MSE Train'].append(mse_train)
results_without_scaler['MAE Train'].append(mae_train)
results_without_scaler['R2 Train'].append(r2_train)
results_without_scaler['MSE Test'].append(mse_test)
results_without_scaler['MAE Test'].append(mae_test)
results_without_scaler['R2 Test'].append(r2_teste)

In [None]:
# Random forest preceded by feature standardisation (seeded for reproducible results).
randomForest_with_scaler = Pipeline([('scaler', StandardScaler()),('Random Forest', RandomForestRegressor(random_state=42))])
# Evaluate the random-forest pipeline itself; the metrics were previously computed with
# decisionTree_with_scaler, so the "Random Forest" row held decision-tree scores.
mse_train,mae_train,r2_train,mse_test,mae_test,r2_teste= calculate_regression_metrics(randomForest_with_scaler, X_train, X_test, y_train, y_test)

# Append one row to the scaled results table.
results_with_scaler['Model'].append('Random Forest')
results_with_scaler['MSE Train'].append(mse_train)
results_with_scaler['MAE Train'].append(mae_train)
results_with_scaler['R2 Train'].append(r2_train)
results_with_scaler['MSE Test'].append(mse_test)
results_with_scaler['MAE Test'].append(mae_test)
results_with_scaler['R2 Test'].append(r2_teste)

In [None]:
# models = [
#     ('Linear Regression', LinearRegression()),
#     ('Ridge Regression', Ridge()),
#     ('Lasso Regression', Lasso()),
#     ('Decision Tree', DecisionTreeRegressor()),
#     ('Random Forest', RandomForestRegressor()),
#     ('Gradient Boosting', GradientBoostingRegressor()),
#     ('Support Vector Regressor', SVR()),
#     ('K-Neighbors Regressor', KNeighborsRegressor()),
#     ('Extra Trees', ExtraTreesRegressor()),
#     # Uncomment the following line if you have xgboost installed
#     # ('XGBoost', XGBRegressor())
# ]



## Support Vector Regressor

In [None]:
# SVR is not imported anywhere else in the notebook; without this import the cell
# stops with a NameError.
from sklearn.svm import SVR

# Support vector regressor on the unscaled features.
supportVectorRegressor = Pipeline([('Support Vector Regressor', SVR())])
# Evaluate the SVR pipeline itself; the metrics were previously computed with
# decisionTree_with_scaler, so this row held decision-tree scores.
mse_train,mae_train,r2_train,mse_test,mae_test,r2_teste= calculate_regression_metrics(supportVectorRegressor, X_train, X_test, y_train, y_test)

# Append one row to the unscaled results table.
results_without_scaler['Model'].append('Support Vector Regressor')
results_without_scaler['MSE Train'].append(mse_train)
results_without_scaler['MAE Train'].append(mae_train)
results_without_scaler['R2 Train'].append(r2_train)
results_without_scaler['MSE Test'].append(mse_test)
results_without_scaler['MAE Test'].append(mae_test)
results_without_scaler['R2 Test'].append(r2_teste)

In [None]:
# SVR is not imported anywhere else in the notebook; without this import the cell
# stops with a NameError.
from sklearn.svm import SVR

# Support vector regressor preceded by feature standardisation.
supportVectorRegressor_with_scaler = Pipeline([('scaler', StandardScaler()),('Support Vector Regressor', SVR())])
# Evaluate the SVR pipeline itself; the metrics were previously computed with
# decisionTree_with_scaler, so this row held decision-tree scores.
mse_train,mae_train,r2_train,mse_test,mae_test,r2_teste= calculate_regression_metrics(supportVectorRegressor_with_scaler, X_train, X_test, y_train, y_test)

# Append one row to the scaled results table.
results_with_scaler['Model'].append('Support Vector Regressor')
results_with_scaler['MSE Train'].append(mse_train)
results_with_scaler['MAE Train'].append(mae_train)
results_with_scaler['R2 Train'].append(r2_train)
results_with_scaler['MSE Test'].append(mse_test)
results_with_scaler['MAE Test'].append(mae_test)
results_with_scaler['R2 Test'].append(r2_teste)

## lgbm_model

In [None]:
from lightgbm import LGBMRegressor

# LightGBM regressor on the unscaled features.
lgbm_model = LGBMRegressor()
pipeline_without_scaling = Pipeline([('model', lgbm_model)])
mse_train, mae_train, r2_train, mse_test, mae_test, r2_teste = calculate_regression_metrics(
    pipeline_without_scaling, X_train, X_test, y_train, y_test
)

# Append one row to the unscaled results table.
lgbm_row = {
    'Model': 'lgbm',
    'MSE Train': mse_train,
    'MAE Train': mae_train,
    'R2 Train': r2_train,
    'MSE Test': mse_test,
    'MAE Test': mae_test,
    'R2 Test': r2_teste,
}
for column_name, column_value in lgbm_row.items():
    results_without_scaler[column_name].append(column_value)

In [None]:
from lightgbm import LGBMRegressor

# LightGBM regressor preceded by feature standardisation.
# This cell used to repeat the unscaled run and append a second 'lgbm' row to
# results_without_scaler, leaving results_with_scaler one model short; it now runs the
# scaled pipeline and records it in the scaled table.
lgbm_model = LGBMRegressor()

pipeline_with_scaling = Pipeline([('scaler', StandardScaler()), ('model', lgbm_model)])
mse_train,mae_train,r2_train,mse_test,mae_test,r2_teste= calculate_regression_metrics(pipeline_with_scaling, X_train, X_test, y_train, y_test)

results_with_scaler['Model'].append('lgbm')
results_with_scaler['MSE Train'].append(mse_train)
results_with_scaler['MAE Train'].append(mae_train)
results_with_scaler['R2 Train'].append(r2_train)
results_with_scaler['MSE Test'].append(mse_test)
results_with_scaler['MAE Test'].append(mae_test)
results_with_scaler['R2 Test'].append(r2_teste)

## xgb_model (not trained in this notebook) — summary of the results

In [None]:
import pandas as pd

# Uses the dictionaries results_without_scaler and results_with_scaler filled by the model cells.

# Convert dictionaries to DataFrames
df_results_without_scaler = pd.DataFrame(results_without_scaler)
df_results_with_scaler = pd.DataFrame(results_with_scaler)

# Merge results for each model, keeping the same index
df_results = pd.concat([df_results_without_scaler, df_results_with_scaler], axis=1)

# Suffix every column with its origin. Both tables share the same column names, so
# without a suffix the merged frame contains each name twice and the scaled and
# unscaled values cannot be told apart.
columns_without_scaler = [f"{col} (no scaler)" for col in df_results_without_scaler.columns]
columns_with_scaler = [f"{col} (scaler)" for col in df_results_with_scaler.columns]
df_results.columns = columns_without_scaler + columns_with_scaler

# Display the combined DataFrame
df_results


In [None]:
import matplotlib.pyplot as plt

# Rename the metric columns so the split (Train/Test) comes first.
new_columns = {
    'MSE Train': 'Train MSE',
    'MAE Train': 'Train MAE',
    'R2 Train': 'Train R2',
    'MSE Test': 'Test MSE',
    'MAE Test': 'Test MAE',
    'R2 Test': 'Test R2'
}

df_results = df_results.rename(columns=new_columns)

# Tick labels come from the table itself (first non-missing model name of each row),
# so they always match the number and order of the plotted rows; a hard-coded list of
# four names did not match a table that also contains the lgbm rows.
model_names = df_results.filter(like='Model').bfill(axis=1).iloc[:, 0].tolist()

# One bar chart per metric family. The axes are created explicitly and handed to
# DataFrame.plot: calling plt.figure() first left an empty extra figure, because
# DataFrame.plot opens its own figure when no `ax` is given.
for metric, title in (
    ('MSE', 'Mean Squared Error (MSE)'),
    ('MAE', 'Mean Absolute Error (MAE)'),
    ('R2', 'R-squared (R2)'),
):
    fig, ax = plt.subplots(figsize=(12, 6))
    df_results.filter(like=metric).plot(kind='bar', width=0.8, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Model')
    ax.set_ylabel(metric)
    ax.set_xticks(range(len(model_names)))
    ax.set_xticklabels(model_names, rotation=45)
    ax.legend(title='Scaling')
    fig.tight_layout()
    plt.show()


In [None]:
# Uses the DataFrames df_results_without_scaler and df_results_with_scaler built earlier.

# Metric column names of both tables ("Model" is an identifier, not a metric).
columns_without_scaler = df_results_without_scaler.columns.tolist()
columns_with_scaler = df_results_with_scaler.columns.tolist()
columns_without_scaler.remove("Model")
columns_with_scaler.remove("Model")

# Print (with scaler - without scaler) for every metric of every model.
# Each model is handled once, and a model without a scaled run is reported and skipped
# instead of failing with IndexError on `.values[0]` of an empty selection.
for model_name in df_results_without_scaler["Model"].drop_duplicates():
    rows_with_scaler = df_results_with_scaler.loc[df_results_with_scaler["Model"] == model_name]
    rows_without_scaler = df_results_without_scaler.loc[df_results_without_scaler["Model"] == model_name]
    if rows_with_scaler.empty:
        print(f"Model: {model_name} (no run with scaler recorded, skipped)")
        print("\n")
        continue
    print(f"Model: {model_name}")
    for col_without_scaler, col_with_scaler in zip(columns_without_scaler, columns_with_scaler):
        difference = rows_with_scaler[col_with_scaler].values[0] - rows_without_scaler[col_without_scaler].values[0]
        print(f"{col_with_scaler} - {col_without_scaler}: {difference}")
    print("\n")


In [None]:
import matplotlib.pyplot as plt

# Metrics whose (with scaler - without scaler) difference is plotted.
metrics = ['MSE Train', 'MAE Train', 'R2 Train', 'MSE Test', 'MAE Test', 'R2 Test']

# Build the per-model differences here. The previous code read `difference`, which is
# only the last scalar left over from the printing loop and has no .keys()/.values().
scaled_scores = df_results_with_scaler.drop_duplicates('Model').set_index('Model')[metrics]
unscaled_scores = df_results_without_scaler.drop_duplicates('Model').set_index('Model')[metrics]
common_models = scaled_scores.index.intersection(unscaled_scores.index)
metric_differences = scaled_scores.loc[common_models] - unscaled_scores.loc[common_models]

# Plot: one panel per metric, one bar per model.
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Metrics standarisation - Metrics sans standarisation')

for ax, metric in zip(axs.flat, metrics):
    ax.bar(metric_differences.index, metric_differences[metric])
    ax.set_title(metric)
    ax.set_ylabel('Difference')
    ax.tick_params(axis='x', labelrotation=45)  # model names overlap when horizontal

plt.tight_layout()
plt.show()


In [None]:
# Sanity check: every column list of a results table should have the same length.
for header_prefix, table_name, table in (
    ("", "results_without_scaler", results_without_scaler),
    ("\n", "results_with_scaler", results_with_scaler),
):
    print(f"{header_prefix}Length of lists in {table_name}:")
    for key, value in table.items():
        print(f"{key}: {len(value)}")



## Linear Regression:

Performance : Ce modèle montre les MSE et MAE les plus élevés pour les ensembles d'entraînement et de test, indiquant une mauvaise performance.

Raison : La régression linéaire peut ne pas capturer la complexité des données en raison de sa simplicité et de sa linéarité.

## Polynomial Regression:

Performance : Amélioration par rapport à la régression linéaire avec des MSE et MAE plus bas, et un 𝑅2 plus élevé.

Raison : La régression polynomiale capture une partie de la non-linéarité mais peut encore être insuffisante pour les schémas de données très complexes.

## Decision Tree:

Performance : Excellente performance d'entraînement (presque zéro erreur)

Raison : Les arbres de décision ont tendance à surajuster les données d'entraînement, mais on ne remarque pas ce problème dans ce cas.

## Random Forest:

Performance : Performance de test légèrement meilleure comparée à un seul arbre de décision, avec des MSE, MAE et 𝑅2 équilibrés.

Raison : La forêt aléatoire réduit le surapprentissage en moyennant plusieurs arbres de décision, ce qui conduit à une meilleure généralisation

## Support Vector Regressor (SVR):

Performance : Comparable à la forêt aléatoire avec des métriques légèrement différentes, montrant une bonne généralisation.

Raison : Le SVR peut gérer efficacement les données de haute dimension et offre un bon équilibre entre biais et variance.


## Sélection du Meilleur Modèle:

Le Support Vector Regressor est le meilleur modèle parmi ceux évalués. Il offre un bon équilibre entre les performances d'entraînement et de test, comme l'indiquent le MSE et le MAE relativement bas sur le jeu de test et une valeur 𝑅2 élevée. La capacité du Support Vector Regressor à réduire le surapprentissage et à capturer des schémas complexes dans les données en fait un choix approprié.

## Conclusion:

Régression Linéaire et Régression Polynomiale : Pas adaptées en raison des taux d'erreur élevés.

Random Forest, Decision Tree : Également de bons choix, mais la forêt aléatoire est légèrement meilleure en termes de métriques globales.
