In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE, RFECV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Charger les données
df = pd.read_csv("train.csv")

# Supprimer les colonnes "name" et "new price"
df.drop(columns=["Name", "New_Price"], inplace=True)

In [3]:
# Separate the regression target ("Price") from the predictor columns.
y = df["Price"]
X = df.drop(columns=["Price"])

In [4]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Route each predictor column to the numeric or the categorical branch by dtype.
# "number" matches every numeric dtype (int32, float32, unsigned ints, ...)
# rather than only int64/float64, and "category" columns are accepted next to
# "object" ones. A column matched by neither selector would be left out of
# both lists and therefore never reach the model.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(include=["object", "category"]).columns

In [6]:
# Transformer pour les caractéristiques numériques
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Transformer pour les caractéristiques catégorielles
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

In [7]:
# Apply the numeric and categorical sub-pipelines to their respective column
# lists and concatenate the results into one feature matrix.
column_branches = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [8]:
# Ordinary least-squares regressor with scikit-learn's default settings.
model = LinearRegression()

In [9]:
# Recursive feature elimination ranked by the linear model's coefficients.
# n_features_to_select is left at its default, so RFE keeps half of the
# features it is given. (RFE is imported in the notebook's top import cell.)
selector = RFE(estimator=model)

In [12]:
# End-to-end estimator: preprocessing -> feature selection -> regression.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("selector", selector),
    ("model", model),
]
pipeline = Pipeline(steps=pipeline_steps)

In [13]:
# Fit every step of the pipeline on the training split only. The call is the
# cell's last expression, so the fitted pipeline is displayed as its output.
pipeline.fit(X=X_train, y=y_train)

In [15]:
# Evaluate the fitted pipeline on the held-out test split.
# (numpy and the sklearn.metrics helpers are imported in the notebook's top
# import cell.)
y_pred = pipeline.predict(X_test)

# Mean squared error (MSE), expressed in squared units of the target.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

# Root mean squared error (RMSE), back in the units of the target.
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

# Coefficient of determination (R²).
r2 = r2_score(y_test, y_pred)
print("R² Score:", r2)

Mean Squared Error (MSE): 35.90725015126161
Root Mean Squared Error (RMSE): 5.992265861196548
R² Score: 0.7082122671007363
