In [None]:
pip

In [3]:
from load_data import load_dataframes
import os
import sys
from pathlib import Path
import zipfile
import pandas as pd
from typing import Optional, Tuple


In [4]:
train_path= '/Users/matthieu/Downloads/Tree-Model-Comparison-Regression-main/house_prices/data/train.csv'
test_path ='/Users/matthieu/Downloads/Tree-Model-Comparison-Regression-main/house_prices/data/test.csv'

df_test = pd.read_csv(test_path)
df_train = pd.read_csv(train_path)


In [None]:
######""
-les colonnes quantitatives avec moins de 130 valeur differentes, on applique le frequency encoder
-les colonnes quantitatives avec plus de 130 valeur differentes, on applique le MinMaxScalerEncoder
-les colonnes qualitatives avec moins de 5 modalités différentes : on applique le one hot encoder 

-les colonnes qualitativees avec plus de 5 modalités differentes on applique le frequency encoder
######""

In [45]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler

class MinMaxScalerTransformer(BaseEstimator, TransformerMixin):
    """Applique MinMaxScaler aux colonnes spécifiées."""
    
    def __init__(self, cols, feature_range=(0, 1)):
        self.cols = cols
        self.feature_range = feature_range
        self.scaler = MinMaxScaler(feature_range=feature_range)
    
    def fit(self, X, y=None):
        # Appliquer le fit uniquement sur les colonnes spécifiées
        self.scaler.fit(X[self.cols])
        return self
    
    def transform(self, X):
        # Appliquer la transformation uniquement sur les colonnes spécifiées
        X_transformed = X.copy()
        X_transformed[self.cols] = self.scaler.transform(X[self.cols])
        X_transformed[self.cols] = X_transformed[self.cols].fillna(0)

        return X_transformed


class FrequencyEncoderTransformer(BaseEstimator, TransformerMixin):
    """Applique un encodage basé sur la fréquence aux colonnes spécifiées."""

    def __init__(self, cols):
        self.cols = cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_transformed = X.copy()

        # Appliquer le Frequency Encoding sur les colonnes spécifiées
        for col in self.cols:
            freq_encoding = X[col].value_counts(normalize=True)
            X_transformed[col] = X_transformed[col].map(freq_encoding)
        
        X_transformed[self.cols] = X_transformed[self.cols].fillna(0)
        return X_transformed

class OneHotEncoderTransformer(BaseEstimator, TransformerMixin):
    """Applique un One-Hot Encoding classique aux colonnes spécifiées en utilisant pd.get_dummies."""
    
    def __init__(self, cols):
        self.cols = cols  # Les colonnes à transformer

    def fit(self, X, y=None):
        # Aucune opération nécessaire dans fit ici
        return self
    
    def transform(self, X):
        X_transformed = X.copy()
        
        for col in self.cols:
            # Utilisation de pd.get_dummies pour générer des colonnes one-hot
            dummies = pd.get_dummies(X_transformed[col], prefix=col, drop_first=False)
            
            # Remplacement des True/False par 1/0
            dummies = dummies.astype(int)
            
            # Concatenation des nouvelles colonnes à notre DataFrame d'origine
            X_transformed = pd.concat([X_transformed, dummies], axis=1)
            
            # Suppression de la colonne d'origine
            X_transformed.drop(columns=[col], inplace=True)
            X_transformed = X_transformed.fillna(0)

        return X_transformed



In [52]:
from sklearn.pipeline import Pipeline

def apply_transforms(df):
    colonnes_quantitatives = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    colonnes_qualitatives = df.select_dtypes(include=['object']).columns.tolist()

    colonnes_quantitatives_min = [col for col in colonnes_quantitatives if df[col].nunique() <= 130]
    colonnes_quantitatives_max = [col for col in colonnes_quantitatives if df[col].nunique() > 130]

    colonnes_qualitatives_min = [col for col in colonnes_qualitatives if df[col].nunique() <= 5]
    colonnes_qualitatives_max = [col for col in colonnes_qualitatives if df[col].nunique() > 5]

    transformers = [
        ('quantitative_max', MinMaxScalerTransformer(cols=colonnes_quantitatives_max)),
        
        ('quantitative_min', FrequencyEncoderTransformer(cols=colonnes_quantitatives_min)),
        
        ('qualitative_min', OneHotEncoderTransformer(cols=colonnes_qualitatives_min)),
        
        ('qualitative_max', FrequencyEncoderTransformer(cols=colonnes_qualitatives_max))
    ]
    
    column_transformer = Pipeline(
        steps=transformers    )
    
    return column_transformer.fit_transform(df)


class ApplyTransforms(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        # On n'a pas de fit spécifique ici car nous appliquons uniquement des transformations.
        return self

    def transform(self, X):
        # Appliquer toutes les transformations comme dans la fonction `apply_transforms`
        return apply_transforms(X)

In [50]:
# Appliquer les transformations
X_train = df_train.drop(columns=['SalePrice'])
print(X_train.shape)

# Créer la variable y avec la colonne SalePrice
y = df_train['SalePrice']

# Appliquer les transformations à X_train
X_train_transformed = apply_transforms(X_train)

# Sauvegarder le DataFrame transformé en CSV
transformed_df = pd.DataFrame(X_train_transformed)
transformed_df.to_csv('/Users/matthieu/Downloads/Tree-Model-Comparison-Regression-main/house_prices/X_train_transformed.csv', index=False)

print("DataFrame transformé sauvegardé avec succès.")
print(transformed_df.head())


(1460, 80)
DataFrame transformé sauvegardé avec succès.
         Id  MSSubClass  LotFrontage   LotArea  Neighborhood  Condition1  \
0  0.000000    0.204795     0.036636  0.033420      0.102740    0.863014   
1  0.000685    0.367123     0.057452  0.038795      0.007534    0.055479   
2  0.001371    0.204795     0.015820  0.046507      0.102740    0.863014   
3  0.002056    0.041096     0.119067  0.038561      0.034932    0.863014   
4  0.002742    0.204795     0.007494  0.060576      0.028082    0.863014   

   Condition2  HouseStyle  OverallQual  OverallCond  ...  Fence_0  \
0    0.989726    0.304795     0.218493     0.562329  ...        1   
1    0.989726    0.497260     0.256164     0.049315  ...        1   
2    0.989726    0.304795     0.218493     0.562329  ...        1   
3    0.989726    0.304795     0.218493     0.562329  ...        1   
4    0.989726    0.304795     0.115068     0.562329  ...        1   

   Fence_GdPrv  Fence_GdWo  Fence_MnPrv  Fence_MnWw  MiscFeature_0  \
0 

In [56]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score


# 1. Séparation des données
X = df_train.drop(columns=['SalePrice'])  # X contient toutes les colonnes sauf SalePrice
y = df_train['SalePrice']  # y contient la cible SalePrice

# 2. Diviser les données en ensembles d'entraînement et de test


# 3. Créer le pipeline avec la classe `ApplyTransforms` et le modèle `ExtraTreesRegressor`
pipeline = Pipeline(steps=[
    ('column_transformer', ApplyTransforms()),  # Utilisation de la classe ApplyTransforms
    ('model', ExtraTreesRegressor(random_state=42))  # Modèle ExtraTreesRegressor
])

X_encoded= ApplyTransforms().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
# 4. Cross-validation (en utilisant 5-fold)
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# 5. Afficher les résultats de la cross-validation
print(f"Cross-validation scores: {cv_scores}")
print(f"MSE moyen: {-np.mean(cv_scores)}")

# 6. Fit le modèle sur l'ensemble d'entraînement complet
pipeline.fit(X_train, y_train)

# 7. Faire des prédictions sur l'ensemble de test
y_pred = pipeline.predict(X_test)

# 8. Afficher les prédictions et la performance
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error sur l'ensemble de test: {mse}")

# 10. Calculer le R² score
r2 = r2_score(y_test, y_pred)
print(f"R² score sur l'ensemble de test: {r2}")


Cross-validation scores: [-2.21574440e+09 -2.18962414e+09 -2.67072846e+09 -2.67047721e+09
 -2.40180517e+09]
MSE moyen: 2429675875.3370223
Mean Squared Error sur l'ensemble de test: 2117796767.1827927
R² score sur l'ensemble de test: 0.7238974314810225


In [5]:
def find_transforms(df):
    colonnes_quantitatives = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    colonnes_qualitatives = df.select_dtypes(include=['object']).columns.tolist()

    colonnes_quantitatives_min = [col for col in colonnes_quantitatives if df[col].nunique() <= 130]
    colonnes_quantitatives_max = [col for col in colonnes_quantitatives if df[col].nunique() > 130]
    colonnes_qualitatives_min = [col for col in colonnes_qualitatives if df[col].nunique() <= 5]
    colonnes_qualitatives_max = [col for col in colonnes_qualitatives if df[col].nunique() > 5]

    transformation_rules = {
        "quantitatives_min": colonnes_quantitatives_min,
        "quantitatives_max": colonnes_quantitatives_max,
        "qualitatives_min": colonnes_qualitatives_min,
        "qualitatives_max": colonnes_qualitatives_max
    }
    return transformation_rules

find_transforms(df_train)

{'quantitatives_min': ['MSSubClass',
  'LotFrontage',
  'OverallQual',
  'OverallCond',
  'YearBuilt',
  'YearRemodAdd',
  'LowQualFinSF',
  'BsmtFullBath',
  'BsmtHalfBath',
  'FullBath',
  'HalfBath',
  'BedroomAbvGr',
  'KitchenAbvGr',
  'TotRmsAbvGrd',
  'Fireplaces',
  'GarageYrBlt',
  'GarageCars',
  'EnclosedPorch',
  '3SsnPorch',
  'ScreenPorch',
  'PoolArea',
  'MiscVal',
  'MoSold',
  'YrSold'],
 'quantitatives_max': ['Id',
  'LotArea',
  'MasVnrArea',
  'BsmtFinSF1',
  'BsmtFinSF2',
  'BsmtUnfSF',
  'TotalBsmtSF',
  '1stFlrSF',
  '2ndFlrSF',
  'GrLivArea',
  'GarageArea',
  'WoodDeckSF',
  'OpenPorchSF',
  'SalePrice'],
 'qualitatives_min': ['MSZoning',
  'Street',
  'Alley',
  'LotShape',
  'LandContour',
  'Utilities',
  'LotConfig',
  'LandSlope',
  'BldgType',
  'MasVnrType',
  'ExterQual',
  'ExterCond',
  'BsmtQual',
  'BsmtCond',
  'BsmtExposure',
  'HeatingQC',
  'CentralAir',
  'Electrical',
  'KitchenQual',
  'FireplaceQu',
  'GarageFinish',
  'GarageQual',
  'Gara