In [1]:
import pandas as pd
df = pd.read_csv("df_clean.csv")
df.head()



Unnamed: 0,BuildingType,PrimaryPropertyType,ZipCode,CouncilDistrictCode,Neighborhood,Latitude,Longitude,YearBuilt,NumberofBuildings,NumberofFloors,...,LargestPropertyUseType,LargestPropertyUseTypeGFA,SecondLargestPropertyUseType,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,YearsENERGYSTARCertified,ENERGYSTARScore,SiteEUIWN(kBtu/sf),SteamUse(kBtu)
0,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.6122,-122.33799,1927,1.0,12,...,Hotel,88434.0,Unknown,32095.931352,Unknown,12371.421298,Unknown,60.0,84.300003,2003882.0
1,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.61317,-122.33393,1996,1.0,11,...,Hotel,83880.0,Parking,15064.0,Restaurant,4622.0,Unknown,61.0,97.900002,0.0
2,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.61393,-122.3381,1969,1.0,41,...,Hotel,756493.0,Unknown,32095.931352,Unknown,12371.421298,Unknown,43.0,97.699997,21566554.0
3,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.61412,-122.33664,1926,1.0,10,...,Hotel,61320.0,Unknown,32095.931352,Unknown,12371.421298,Unknown,56.0,113.300003,2214446.25
4,NonResidential,Hotel,98121.0,7,DOWNTOWN,47.61375,-122.34047,1980,1.0,18,...,Hotel,123445.0,Parking,68009.0,Swimming Pool,0.0,Unknown,75.0,118.699997,0.0


### Features prévues


1. Âge du bâtiment

Formule : 2025 - YearBuilt

Pourquoi : L’année brute n’aide pas vraiment le modèle. L’âge donne une idée de la vétusté, donc de la qualité probable de l’isolation.

Info : Les bâtiments anciens consomment souvent plus (SiteEUI plus élevé).

2. Nombre d’usages du bâtiment

Formule : compter le nombre d’usages présents dans ListOfAllPropertyUseTypes.

Pourquoi : Un bâtiment multi-usage (ex : commerce + bureau) est plus compliqué énergétiquement qu’un bâtiment mono-usage.

Info : Les combinaisons d’usages changent les comportements de consommation.

3. Ratio parking / surface totale

Formule : PropertyGFAParking / PropertyGFATotal

Pourquoi : Le parking n’est presque pas chauffé/climatisé. S’il occupe une grande surface, ça influence la consommation rapportée au m².

Info : Deux grands bâtiments peuvent sembler différents juste à cause d’un parking énorme.

4. Part de l’usage principal

Formule : LargestPropertyUseTypeGFA / PropertyGFATotal

Pourquoi : Si un bâtiment est composé à 90 % de bureaux ou seulement à 40 %, ce n’est pas la même chose en termes d’énergie.

Info : La surface dominante influence beaucoup la consommation globale.

1. Age du batiment

In [2]:
df["Agedubatiment"] = 2025 - df["YearBuilt"]


In [3]:
df.columns


Index(['BuildingType', 'PrimaryPropertyType', 'ZipCode', 'CouncilDistrictCode',
       'Neighborhood', 'Latitude', 'Longitude', 'YearBuilt',
       'NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal',
       'PropertyGFAParking', 'PropertyGFABuilding(s)',
       'ListOfAllPropertyUseTypes', 'LargestPropertyUseType',
       'LargestPropertyUseTypeGFA', 'SecondLargestPropertyUseType',
       'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseType',
       'ThirdLargestPropertyUseTypeGFA', 'YearsENERGYSTARCertified',
       'ENERGYSTARScore', 'SiteEUIWN(kBtu/sf)', 'SteamUse(kBtu)',
       'Agedubatiment'],
      dtype='object')

2. Nombre d’usages du bâtiment

In [4]:
def count_usages(value):
    if pd.isna(value):
        return 0
    else:
        return len(value.split(','))

df["UsageCount"] = df["ListOfAllPropertyUseTypes"].apply(count_usages)

df["UsageCount"].value_counts()


UsageCount
1     693
2     485
3     197
4      79
5      40
6      18
7       4
9       2
11      1
13      1
8       1
Name: count, dtype: int64

3. Ratio parking / surface totale


In [5]:
df["ParkingRatio"] = df["PropertyGFAParking"] / df["PropertyGFATotal"]


df["ParkingRatio"].describe()


count    1521.000000
mean        0.065445
std         0.144734
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         0.895023
Name: ParkingRatio, dtype: float64

4. Part de l’usage principal


In [6]:
df["PrimaryUseRatio"] = df["LargestPropertyUseTypeGFA"] / df["PropertyGFATotal"]

df["PrimaryUseRatio"].describe()


count    1521.000000
mean        0.863346
std         0.319024
min         0.186469
25%         0.669596
50%         0.940499
75%         1.000000
max         6.426849
Name: PrimaryUseRatio, dtype: float64

In [7]:
df.to_csv("df_features.csv", index=False)


In [8]:
df.head()


Unnamed: 0,BuildingType,PrimaryPropertyType,ZipCode,CouncilDistrictCode,Neighborhood,Latitude,Longitude,YearBuilt,NumberofBuildings,NumberofFloors,...,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,YearsENERGYSTARCertified,ENERGYSTARScore,SiteEUIWN(kBtu/sf),SteamUse(kBtu),Agedubatiment,UsageCount,ParkingRatio,PrimaryUseRatio
0,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.6122,-122.33799,1927,1.0,12,...,Unknown,12371.421298,Unknown,60.0,84.300003,2003882.0,98,1,0.0,1.0
1,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.61317,-122.33393,1996,1.0,11,...,Restaurant,4622.0,Unknown,61.0,97.900002,0.0,29,3,0.145453,0.809918
2,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.61393,-122.3381,1969,1.0,41,...,Unknown,12371.421298,Unknown,43.0,97.699997,21566554.0,56,1,0.205748,0.79122
3,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.61412,-122.33664,1926,1.0,10,...,Unknown,12371.421298,Unknown,56.0,113.300003,2214446.25,99,1,0.0,1.0
4,NonResidential,Hotel,98121.0,7,DOWNTOWN,47.61375,-122.34047,1980,1.0,18,...,Swimming Pool,0.0,Unknown,75.0,118.699997,0.0,45,3,0.353115,0.70307


In [9]:
import os

os.listdir()


['.git',
 'Feature NB.ipynb',
 'Notebook P6.ipynb',
 'Data',
 '.venv',
 '.ipynb_checkpoints',
 '.gitignore',
 'df_features.csv',
 'df_clean.csv']