In [5]:
import os
os.listdir()




['.ipynb_checkpoints',
 '2015-building-energy-benchmarking.csv',
 '2015-building-energy-benchmarking.csv.zip',
 'Untitled.ipynb']

In [15]:


import pandas as pd

# Utilisez le chemin de votre fichier C:
chemin_du_fichier = "C:\\Users\\FX506\\Downloads\\Projet ML\\2015-building-energy-benchmarkinga.csv"

# Lecture du fichier CSV
try:
    df = pd.read_csv(chemin_du_fichier)
    print("Fichier chargé avec succès !")
    # Affichez les premières lignes pour vérifier
    print(df.head())
except FileNotFoundError:
    print(f"Erreur : Le fichier n'a pas été trouvé à l'emplacement : {chemin_du_fichier}")

# Colonnes inutiles à supprimer
cols_to_drop = [
    'OSEBuildingID',
    'TaxParcelIdentificationNumber',
    'PropertyName',
    'Location',
    'CouncilDistrictCode',
    'Neighborhood',
    'ListOfAllPropertyUseTypes',
    'City Council Districts',
    'SPD Beats',
    'Seattle Police Department Micro Community Policing Plan Areas',
    'Zip Codes'
]

df = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

# Suppression des colonnes presque vides (plus de 50 % de NaN)
threshold = 0.5
df = df[df.columns[df.isnull().mean() < threshold]]

# Séparation numérique / catégorielle
num_cols = df.select_dtypes(include=['float64','int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Remplacement des NaN
df[num_cols] = df[num_cols].fillna(df[num_cols].median())
df[cat_cols] = df[cat_cols].fillna("Unknown")

# Vérification
df.info()
df.head()


Fichier chargé avec succès !
   OSEBuildingID  DataYear    BuildingType PrimaryPropertyType  \
0              1      2015  NonResidential               Hotel   
1              2      2015  NonResidential               Hotel   
2              3      2015  NonResidential               Hotel   
3              5      2015  NonResidential               Hotel   
4              8      2015  NonResidential               Hotel   

            PropertyName TaxParcelIdentificationNumber  \
0   MAYFLOWER PARK HOTEL                     659000030   
1        PARAMOUNT HOTEL                     659000220   
2           WESTIN HOTEL                     659000475   
3              HOTEL MAX                     659000640   
4  WARWICK SEATTLE HOTEL                     659000970   

                                            Location  CouncilDistrictCode  \
0  {'latitude': '47.61219025', 'longitude': '-122...                    7   
1  {'latitude': '47.61310583', 'longitude': '-122...                   

Unnamed: 0,DataYear,BuildingType,PrimaryPropertyType,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),LargestPropertyUseType,...,SteamUse(kBtu),Electricity(kWh),Electricity(kBtu),NaturalGas(therms),NaturalGas(kBtu),OtherFuelUse(kBtu),GHGEmissions(MetricTonsCO2e),GHGEmissionsIntensity(kgCO2e/ft2),DefaultData,ComplianceStatus
0,2015,NonResidential,Hotel,1927,1,12.0,88434,0,88434,Hotel,...,2023032.0,1080307.0,3686160.0,12724.0,1272388.0,0.0,249.43,2.64,No,Compliant
1,2015,NonResidential,Hotel,1996,1,11.0,103566,15064,88502,Hotel,...,0.0,1144563.0,3905411.0,44490.0,4448985.0,0.0,263.51,2.38,No,Compliant
2,2015,NonResidential,Hotel,1969,1,41.0,961990,0,961990,Hotel,...,19660404.0,14583930.0,49762435.0,37099.0,3709900.0,0.0,2061.48,1.92,Yes,Compliant
3,2015,NonResidential,Hotel,1926,1,10.0,61320,0,61320,Hotel,...,23458518.0,811521.0,2769023.0,20019.0,2001894.0,0.0,1936.34,31.38,No,Compliant
4,2015,NonResidential,Hotel,1980,1,18.0,119890,12460,107430,Hotel,...,0.0,1777841.0,6066245.0,87631.0,8763105.0,0.0,507.7,4.02,No,Compliant


In [17]:
# Méthode 1 (Recommandée) : Utiliser .shape
nombre_de_lignes_shape = df.shape[0]

# Méthode 2 : Utiliser len()
nombre_de_lignes_len = len(df)

print(f"Le nombre de lignes (méthode .shape) est : {nombre_de_lignes_shape}")
print(f"Le nombre de lignes (méthode len()) est : {nombre_de_lignes_len}")

Le nombre de lignes (méthode .shape) est : 3340
Le nombre de lignes (méthode len()) est : 3340


In [19]:
# df est votre DataFrame qui contient les modifications
nom_du_nouveau_fichier = "DataSetClean.csv"

# Enregistre le DataFrame dans un nouveau fichier CSV
df.to_csv(nom_du_nouveau_fichier, index=False)

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# -------------------------------
# 1. Séparation X / y
# -------------------------------
target = "SiteEnergyUse(kBtu)"

X = df.drop(columns=[target])
y = df[target]

# -------------------------------
# 2. Identifier colonnes numériques / catégorielles
# -------------------------------
num_cols = X.select_dtypes(include=['int64','float64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

print("Colonnes numériques :", list(num_cols))
print("Colonnes catégorielles :", list(cat_cols))

# -------------------------------
# 3. Préprocessing : OneHot + Scaling
# -------------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ]
)

# -------------------------------
# 4. Modèle choisi → Random Forest (robuste pour début)
# -------------------------------
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42
)

# -------------------------------
# 5. Pipeline
# -------------------------------
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', model)
])

# -------------------------------
# 6. Train/Test split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# -------------------------------
# 7. Entraînement
# -------------------------------
pipeline.fit(X_train, y_train)

# -------------------------------
# 8. Score de base
# -------------------------------
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

train_score, test_score


Colonnes numériques : ['DataYear', 'YearBuilt', 'NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal', 'PropertyGFAParking', 'PropertyGFABuilding(s)', 'LargestPropertyUseTypeGFA', 'ENERGYSTARScore', 'SiteEUI(kBtu/sf)', 'SiteEUIWN(kBtu/sf)', 'SourceEUI(kBtu/sf)', 'SourceEUIWN(kBtu/sf)', 'SiteEnergyUseWN(kBtu)', 'SteamUse(kBtu)', 'Electricity(kWh)', 'Electricity(kBtu)', 'NaturalGas(therms)', 'NaturalGas(kBtu)', 'OtherFuelUse(kBtu)', 'GHGEmissions(MetricTonsCO2e)', 'GHGEmissionsIntensity(kgCO2e/ft2)']
Colonnes catégorielles : ['BuildingType', 'PrimaryPropertyType', 'LargestPropertyUseType', 'DefaultData', 'ComplianceStatus']


(0.9810893082515402, 0.8810712243259549)

In [23]:
import numpy as np

# Récupérer le modèle RandomForest entraîné
rf = pipeline.named_steps['model']

# Récupérer les noms de features après OneHotEncoding
ohe = pipeline.named_steps['preprocess'].named_transformers_['cat']
cat_feature_names = ohe.get_feature_names_out(cat_cols)

# Concaténer les colonnes numériques + colonnes encodées
final_feature_names = list(num_cols) + list(cat_feature_names)

# Importance
importances = rf.feature_importances_

# Trier par importance décroissante
indices = np.argsort(importances)[::-1]
sorted_features = [(final_feature_names[i], importances[i]) for i in indices]

# Afficher les 20 features les plus importantes
for feature, score in sorted_features[:20]:
    print(f"{feature}: {score:.4f}")


SiteEnergyUseWN(kBtu): 0.7714
Electricity(kWh): 0.0662
Electricity(kBtu): 0.0605
NaturalGas(therms): 0.0226
NaturalGas(kBtu): 0.0209
GHGEmissions(MetricTonsCO2e): 0.0207
SiteEUIWN(kBtu/sf): 0.0057
NumberofFloors: 0.0052
SourceEUI(kBtu/sf): 0.0036
LargestPropertyUseType_Data Center: 0.0031
SourceEUIWN(kBtu/sf): 0.0027
LargestPropertyUseTypeGFA: 0.0021
PropertyGFATotal: 0.0020
PropertyGFABuilding(s): 0.0018
GHGEmissionsIntensity(kgCO2e/ft2): 0.0015
SiteEUI(kBtu/sf): 0.0015
NumberofBuildings: 0.0013
BuildingType_NonResidential: 0.0011
PropertyGFAParking: 0.0011
BuildingType_Campus: 0.0010


In [25]:
from sklearn.model_selection import RandomizedSearchCV

params = {
    'model__n_estimators': [200, 400, 600],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

search = RandomizedSearchCV(
    pipeline,
    params,
    n_iter=15,
    scoring='r2',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

print("Best params:", search.best_params_)
print("Train score:", search.best_score_)
print("Test score:", search.score(X_test, y_test))



Fitting 3 folds for each of 15 candidates, totalling 45 fits
Best params: {'model__n_estimators': 600, 'model__min_samples_split': 5, 'model__min_samples_leaf': 2, 'model__max_depth': 30}
Train score: 0.8936958999810525
Test score: 0.8919062672764052
