In [44]:
import pandas as pd
df = pd.read_csv("df_clean.csv")
df.head()



Unnamed: 0,BuildingType,PrimaryPropertyType,ZipCode,CouncilDistrictCode,Neighborhood,Latitude,Longitude,YearBuilt,NumberofBuildings,NumberofFloors,...,LargestPropertyUseType,LargestPropertyUseTypeGFA,SecondLargestPropertyUseType,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,YearsENERGYSTARCertified,ENERGYSTARScore,SiteEUIWN(kBtu/sf),SteamUse(kBtu)
0,NonResidential,Hotel,-0.849564,1.179727,DOWNTOWN,-0.087917,-0.1914,-1.046321,-0.085712,1.386984,...,Hotel,0.048366,Unknown,0.0,Unknown,-1.286201e-16,Unknown,-0.136862,84.300003,0.638124
1,NonResidential,Hotel,-0.849564,1.179727,DOWNTOWN,-0.067236,-0.016761,1.055313,-0.085712,1.213168,...,Hotel,0.008998,Parking,-0.407434,Restaurant,-0.5479589,Unknown,-0.093561,97.900002,-0.127222
2,NonResidential,Hotel,-0.849564,1.179727,DOWNTOWN,-0.051033,-0.196131,0.232934,-0.085712,6.427641,...,Hotel,5.823632,Unknown,0.0,Unknown,-1.286201e-16,Unknown,-0.872981,97.699997,8.10973
3,NonResidential,Hotel,-0.849564,1.179727,DOWNTOWN,-0.046983,-0.13333,-1.07678,-0.085712,1.039352,...,Hotel,-0.18603,Unknown,0.0,Unknown,-1.286201e-16,Unknown,-0.310066,113.300003,0.718546
4,NonResidential,Hotel,0.23615,1.179727,DOWNTOWN,-0.054871,-0.298076,0.567977,-0.085712,2.429878,...,Hotel,0.351031,Parking,0.859104,Swimming Pool,-0.8747789,Unknown,0.512655,118.699997,-0.127222


### Features prévues


1. Âge du bâtiment

Formule : 2024 - YearBuilt

Pourquoi : L’année brute n’aide pas vraiment le modèle. L’âge donne une idée de la vétusté, donc de la qualité probable de l’isolation.

Info : Les bâtiments anciens consomment souvent plus (SiteEUI plus élevé).

2. Nombre d’usages du bâtiment

Formule : compter le nombre d’usages présents dans ListOfAllPropertyUseTypes.

Pourquoi : Un bâtiment multi-usage (ex : commerce + bureau) est plus compliqué énergétiquement qu’un bâtiment mono-usage.

Info : Les combinaisons d’usages changent les comportements de consommation.

3. Ratio parking / surface totale

Formule : PropertyGFAParking / PropertyGFATotal

Pourquoi : Le parking n’est presque pas chauffé/climatisé. S’il occupe une grande surface, ça influence la consommation rapportée au m².

Info : Deux grands bâtiments peuvent sembler différents juste à cause d’un parking énorme.

4. Part de l’usage principal

Formule : LargestPropertyUseTypeGFA / PropertyGFATotal

Pourquoi : Si un bâtiment est composé à 90 % de bureaux ou seulement à 40 %, ce n’est pas la même chose en termes d’énergie.

Info : La surface dominante influence beaucoup la consommation globale.

1. Age du batiment

In [45]:
df["Agedubatiment"] = 2025 - df["YearBuilt"]


In [46]:
df.columns


Index(['BuildingType', 'PrimaryPropertyType', 'ZipCode', 'CouncilDistrictCode',
       'Neighborhood', 'Latitude', 'Longitude', 'YearBuilt',
       'NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal',
       'PropertyGFAParking', 'PropertyGFABuilding(s)',
       'ListOfAllPropertyUseTypes', 'LargestPropertyUseType',
       'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseType',
       'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseType',
       'ThirdLargestPropertyUseTypeGFA', 'YearsENERGYSTARCertified',
       'ENERGYSTARScore', 'SiteEUIWN(kBtu/sf)', 'SteamUse(kBtu)',
       'Agedubatiment'],
      dtype='object')

2. Nombre d’usages du bâtiment

In [47]:
def count_usages(value):
    if pd.isna(value):
        return 0
    else:
        return len(value.split(','))

df["UsageCount"] = df["ListOfAllPropertyUseTypes"].apply(count_usages)

df["UsageCount"].value_counts()


UsageCount
1     693
2     485
3     197
4      79
5      40
6      18
7       4
9       2
11      1
13      1
8       1
Name: count, dtype: int64

3. Ratio parking / surface totale


In [48]:
df["ParkingRatio"] = df["PropertyGFAParking"] / df["PropertyGFATotal"]


df["ParkingRatio"].describe()


count    1521.000000
mean        0.610464
std         8.270967
min      -149.512892
25%         0.567631
50%         0.644264
75%         0.932823
max       143.691492
Name: ParkingRatio, dtype: float64

4. Part de l’usage principal


In [49]:
df["PrimaryUseRatio"] = df["LargestPropertyUseTypeGFA"] / df["PropertyGFATotal"]

df["PrimaryUseRatio"].describe()


count    1521.000000
mean        1.212645
std         6.230674
min       -52.818912
25%         0.795634
50%         0.925973
75%         1.138282
max       188.454730
Name: PrimaryUseRatio, dtype: float64

In [50]:
df.to_csv("df_features.csv", index=False)


In [51]:
df.head()


Unnamed: 0,BuildingType,PrimaryPropertyType,ZipCode,CouncilDistrictCode,Neighborhood,Latitude,Longitude,YearBuilt,NumberofBuildings,NumberofFloors,...,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,YearsENERGYSTARCertified,ENERGYSTARScore,SiteEUIWN(kBtu/sf),SteamUse(kBtu),Agedubatiment,UsageCount,ParkingRatio,PrimaryUseRatio
0,NonResidential,Hotel,-0.849564,1.179727,DOWNTOWN,-0.087917,-0.1914,-1.046321,-0.085712,1.386984,...,Unknown,-1.286201e-16,Unknown,-0.136862,84.300003,0.638124,2026.046321,1,3.529782,-0.519369
1,NonResidential,Hotel,-0.849564,1.179727,DOWNTOWN,-0.067236,-0.016761,1.055313,-0.085712,1.213168,...,Restaurant,-0.5479589,Unknown,-0.093561,97.900002,-0.127222,2023.944687,3,4.607325,0.618338
2,NonResidential,Hotel,-0.849564,1.179727,DOWNTOWN,-0.051033,-0.196131,0.232934,-0.085712,6.427641,...,Unknown,-1.286201e-16,Unknown,-0.872981,97.699997,8.10973,2024.767066,1,0.795806,0.957655
3,NonResidential,Hotel,-0.849564,1.179727,DOWNTOWN,-0.046983,-0.13333,-1.07678,-0.085712,1.039352,...,Unknown,-1.286201e-16,Unknown,-0.310066,113.300003,0.718546,2026.07678,1,1.149084,0.650306
4,NonResidential,Hotel,0.23615,1.179727,DOWNTOWN,-0.054871,-0.298076,0.567977,-0.085712,2.429878,...,Swimming Pool,-0.8747789,Unknown,0.512655,118.699997,-0.127222,2024.432023,3,2.467078,0.666102
