In [6]:
import pandas as pd

# Charger le fichier CSV
file_path = "../data/raw/train.csv"
df = pd.read_csv(file_path)

# Afficher les premi√®res lignes et les informations g√©n√©rales
df.head(), df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

(   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
 0   1          60       RL         65.0     8450   Pave   NaN      Reg   
 1   2          20       RL         80.0     9600   Pave   NaN      Reg   
 2   3          60       RL         68.0    11250   Pave   NaN      IR1   
 3   4          70       RL         60.0     9550   Pave   NaN      IR1   
 4   5          60       RL         84.0    14260   Pave   NaN      IR1   
 
   LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
 0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
 1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
 2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
 3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
 4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   
 
   YrSold  SaleType  SaleCondition  SalePrice  
 0   2

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

X = df.drop(columns=["SalePrice", "Id"])
y = df["SalePrice"]

# Identifier les colonnes num√©riques et cat√©goriques
num_features = X.select_dtypes(include=["int64", "float64"]).columns
cat_features = X.select_dtypes(include=["object", "category"]).columns

# Pipelines de transformation
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="None")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Transformer les colonnes
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features)
])

# Appliquer la transformation
X_processed = preprocessor.fit_transform(X)
X_processed = np.array(X_processed)  # Convertir en tableau numpy

# S√©parer en train/test
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

print("Pr√©traitement termin√©. X_train pr√™t pour l'entra√Ænement du mod√®le.")


Pr√©traitement termin√©. X_train pr√™t pour l'entra√Ænement du mod√®le.


In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Initialiser le mod√®le
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Entra√Æner le mod√®le
model.fit(X_train, y_train)

# Pr√©dire sur l'ensemble de test
y_pred = model.predict(X_test)

# √âvaluer la performance
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")


Mean Absolute Error: 17776.588732876713


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

# 1. Charger les datasets nettoy√©s
df_train = pd.read_csv("train_cleaned.csv")
df_test = pd.read_csv("test_cleaned.csv")

# 2. S√©parer les variables
X_train = df_train.drop(columns=["SalePrice"])
y_train = df_train["SalePrice"]
X_test = df_test.copy()

# 3. G√©rer les valeurs manquantes
X_train = X_train.fillna(X_train.median(numeric_only=True))
X_test = X_test.fillna(X_train.median(numeric_only=True))  # on utilise la m√©diane de train pour √©viter la fuite

# 4. Mod√®les
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "CatBoost": CatBoostRegressor(iterations=500, learning_rate=0.05, depth=6, verbose=100)
}

# 5. Entra√Ænement & Pr√©dictions
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # üîÅ Sauvegarder les pr√©dictions dans un fichier CSV
    filename = f"{name.lower().replace(' ', '_')}_submission_cleaned.csv"
    submission = pd.DataFrame({
        "Id": df_test["Id"],
        "SalePrice": y_pred
    })
    submission.to_csv(filename, index=False)
    print(f"‚úÖ R√©sultats enregistr√©s dans '{filename}'")

print("\nüìä Comparaison des mod√®les termin√©e !")



Training Linear Regression...
‚úÖ R√©sultats enregistr√©s dans 'linear_regression_submission_cleaned.csv'

Training Random Forest...
‚úÖ R√©sultats enregistr√©s dans 'random_forest_submission_cleaned.csv'

Training CatBoost...
0:	learn: 76741.2780640	total: 4.48ms	remaining: 2.23s
100:	learn: 20705.1145239	total: 399ms	remaining: 1.58s
200:	learn: 16080.8567385	total: 808ms	remaining: 1.2s
300:	learn: 13652.2245176	total: 1.31s	remaining: 866ms
400:	learn: 11576.2833709	total: 1.7s	remaining: 420ms
499:	learn: 9958.1660562	total: 2.09s	remaining: 0us
‚úÖ R√©sultats enregistr√©s dans 'catboost_submission_cleaned.csv'

üìä Comparaison des mod√®les termin√©e !
