In [1]:
import pandas as pd
df = pd.read_csv("df_clean.csv")
df.head()



Unnamed: 0,BuildingType,PrimaryPropertyType,ZipCode,CouncilDistrictCode,Neighborhood,Latitude,Longitude,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFATotal,PropertyGFAParking,ListOfAllPropertyUseTypes,LargestPropertyUseType,SecondLargestPropertyUseType,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,SiteEUIWN(kBtu/sf)
0,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.6122,-122.33799,1927,1.0,12,88434,0,Hotel,Hotel,Unknown,31733.045429,Unknown,12311.108329,84.300003
1,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.61317,-122.33393,1996,1.0,11,103566,15064,"Hotel, Parking, Restaurant",Hotel,Parking,15064.0,Restaurant,4622.0,97.900002
2,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.61393,-122.3381,1969,1.0,41,956110,196718,Hotel,Hotel,Unknown,31733.045429,Unknown,12311.108329,97.699997
3,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.61412,-122.33664,1926,1.0,10,61320,0,Hotel,Hotel,Unknown,31733.045429,Unknown,12311.108329,113.300003
4,NonResidential,Hotel,98121.0,7,DOWNTOWN,47.61375,-122.34047,1980,1.0,18,175580,62000,"Hotel, Parking, Swimming Pool",Hotel,Parking,68009.0,Swimming Pool,0.0,118.699997


### Features prévues


1. Âge du bâtiment

Formule : 2025 - YearBuilt

Pourquoi : L’année brute n’aide pas vraiment le modèle. L’âge donne une idée de la vétusté, donc de la qualité probable de l’isolation.

Info : Les bâtiments anciens consomment souvent plus (SiteEUI plus élevé).

2. Nombre d’usages du bâtiment

Formule : compter le nombre d’usages présents dans ListOfAllPropertyUseTypes.

Pourquoi : Un bâtiment multi-usage (ex : commerce + bureau) est plus compliqué énergétiquement qu’un bâtiment mono-usage.

Info : Les combinaisons d’usages changent les comportements de consommation.

3. Ratio parking / surface totale

Formule : PropertyGFAParking / PropertyGFATotal

Pourquoi : Le parking n’est presque pas chauffé/climatisé. S’il occupe une grande surface, ça influence la consommation rapportée au m².

Info : Deux grands bâtiments peuvent sembler différents juste à cause d’un parking énorme.



1. Age du batiment

In [2]:
df["Agedubatiment"] = 2025 - df["YearBuilt"]


In [3]:
df.columns


Index(['BuildingType', 'PrimaryPropertyType', 'ZipCode', 'CouncilDistrictCode',
       'Neighborhood', 'Latitude', 'Longitude', 'YearBuilt',
       'NumberofBuildings', 'NumberofFloors', 'PropertyGFATotal',
       'PropertyGFAParking', 'ListOfAllPropertyUseTypes',
       'LargestPropertyUseType', 'SecondLargestPropertyUseType',
       'SecondLargestPropertyUseTypeGFA', 'ThirdLargestPropertyUseType',
       'ThirdLargestPropertyUseTypeGFA', 'SiteEUIWN(kBtu/sf)',
       'Agedubatiment'],
      dtype='object')

2. Nombre d’usages du bâtiment

In [4]:
def count_usages(value):
    if pd.isna(value):
        return 0
    else:
        return len(value.split(','))

df["UsageCount"] = df["ListOfAllPropertyUseTypes"].apply(count_usages)

df["UsageCount"].value_counts()


UsageCount
1     687
2     481
3     197
4      79
5      39
6      18
7       4
9       2
11      1
13      1
8       1
Name: count, dtype: int64

3. Ratio parking / surface totale


In [5]:
df["ParkingRatio"] = df["PropertyGFAParking"] / df["PropertyGFATotal"]


df["ParkingRatio"].describe()


count    1510.000000
mean        0.064795
std         0.144019
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         0.895023
Name: ParkingRatio, dtype: float64

In [6]:
df.to_csv("df_features.csv", index=False)


In [7]:
df.head()


Unnamed: 0,BuildingType,PrimaryPropertyType,ZipCode,CouncilDistrictCode,Neighborhood,Latitude,Longitude,YearBuilt,NumberofBuildings,NumberofFloors,...,ListOfAllPropertyUseTypes,LargestPropertyUseType,SecondLargestPropertyUseType,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,SiteEUIWN(kBtu/sf),Agedubatiment,UsageCount,ParkingRatio
0,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.6122,-122.33799,1927,1.0,12,...,Hotel,Hotel,Unknown,31733.045429,Unknown,12311.108329,84.300003,98,1,0.0
1,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.61317,-122.33393,1996,1.0,11,...,"Hotel, Parking, Restaurant",Hotel,Parking,15064.0,Restaurant,4622.0,97.900002,29,3,0.145453
2,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.61393,-122.3381,1969,1.0,41,...,Hotel,Hotel,Unknown,31733.045429,Unknown,12311.108329,97.699997,56,1,0.205748
3,NonResidential,Hotel,98101.0,7,DOWNTOWN,47.61412,-122.33664,1926,1.0,10,...,Hotel,Hotel,Unknown,31733.045429,Unknown,12311.108329,113.300003,99,1,0.0
4,NonResidential,Hotel,98121.0,7,DOWNTOWN,47.61375,-122.34047,1980,1.0,18,...,"Hotel, Parking, Swimming Pool",Hotel,Parking,68009.0,Swimming Pool,0.0,118.699997,45,3,0.353115


In [8]:
import os

os.listdir()


['bentofile.yaml',
 '__pycache__',
 '.git',
 'Feature NB.ipynb',
 'Notebook P6.ipynb',
 'service.py',
 'Data',
 '.venv',
 'TrainTest.ipynb',
 '.ipynb_checkpoints',
 '.gitignore',
 'df_features.csv',
 'df_clean.csv']