In [1]:
import pickle

import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score  # For classification
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load both labelled splits; each one carries the "Etiquette_DPE" target column.
train = pd.read_csv("datasets/train.csv")
test = pd.read_csv("datasets/test.csv")

In [3]:
# Sanity check: (rows, columns) of the test and train frames.
test.shape, train.shape

((500000, 40), (2500000, 40))

In [4]:
# Encoder used to turn the target labels into integer class indices.
label_encoder = LabelEncoder()

In [5]:
# Separate the features from the "Etiquette_DPE" target in both splits.
#
# The encoder is fitted on the training labels ONLY. The test labels are then
# mapped with that already-fitted encoding (`transform`, not `fit_transform`),
# so both splits share a single label -> integer table. Re-fitting on the test
# split could assign different integers to the same label whenever the two
# splits do not contain exactly the same set of classes, which would silently
# corrupt the evaluation. `transform` raises ValueError if the test split
# contains a label that was never seen during fitting.
X_train = train.drop(columns="Etiquette_DPE")
y_train = label_encoder.fit_transform(train["Etiquette_DPE"])
X_test = test.drop(columns="Etiquette_DPE")
y_test = label_encoder.transform(test["Etiquette_DPE"])

In [6]:
# Partition the feature columns by dtype, using the test frame as reference.
# Only `numerical_cols` is consumed by the cells visible in this notebook.
numerical_cols = list(X_test.select_dtypes(include='number').columns)
categorical_cols = list(X_test.select_dtypes(include=['object', 'category']).columns)

In [7]:
# Keep numeric features only, skipping the first numeric column
# (presumably the row-index column stored in the CSV — TODO confirm).
X_train = X_train.loc[:, numerical_cols[1:]]
X_test = X_test.loc[:, numerical_cols[1:]]

In [8]:
# Both feature matrices must end up with the same number of columns.
X_train.shape, X_test.shape

((2500000, 17), (500000, 17))

In [9]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Facteur_couverture_solaire_saisi,Surface_habitable_desservie_par_installation_ECS,Emission_GES_éclairage,Conso_5_usages_é_finale_énergie_n°2,Surface_totale_capteurs_photovoltaïque,Conso_chauffage_dépensier_installation_chauffage_n°1,Coût_chauffage_énergie_n°2,Emission_GES_chauffage_énergie_n°2,Code_postal_(brut),Facteur_couverture_solaire,Année_construction,Code_postal_(BAN),Conso_5_usages/m²_é_finale,Conso_5_usages_é_finale,Hauteur_sous-plafond,Surface_habitable_immeuble,Surface_habitable_logement
0,,110.4,28.3,10676.1,,39870.0,342.0,0.0,25000,,1945.0,25000.0,171.9,37940.5,2.9,220.8,220.8
1,,83.5,10.6,8678.1,26.4,17902.3,1229.2,577.4,44850,,,44850.0,221.0,18484.5,2.5,,83.5
2,,68.0,8.9,,,7791.8,,,35560,,1983.0,35560.0,130.5,8873.0,2.5,,68.0
3,,,1.8,2576.8,,,0.0,0.0,44800,,1976.0,44800.0,162.6,7729.5,2.5,3292.3,47.5
4,,142.5,18.6,852.8,,48799.7,0.0,0.0,28200,,1945.0,28200.0,303.8,43275.4,2.8,,142.5


In [10]:
# XGBoost classifier created with the library's default hyperparameters.
model = xgb.XGBClassifier()

In [11]:
# Train on the numeric features against the encoded labels.
# NOTE(review): per the XGBoost docs, `verbose=True` only prints evaluation
# metrics when an `eval_set` is supplied; none is passed here.
model.fit(X_train, y_train, verbose=True)

In [12]:
# Predicted class indices for the test split.
predictions = model.predict(X_test)

In [13]:
# Fraction of test rows whose predicted class index equals the encoded label,
# reported as a percentage with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy,))

Accuracy: 97.62%


In [14]:
# Load the split whose labels are predicted below for the submission file,
# then preview its first rows.
val = pd.read_csv("datasets/val.csv")
val.head()

Unnamed: 0,N°DPE,Configuration_installation_chauffage_n°2,Facteur_couverture_solaire_saisi,Surface_habitable_desservie_par_installation_ECS,Emission_GES_éclairage,Cage_d'escalier,Conso_5_usages_é_finale_énergie_n°2,Type_générateur_froid,Type_émetteur_installation_chauffage_n°2,Surface_totale_capteurs_photovoltaïque,...,Qualité_isolation_enveloppe,Qualité_isolation_menuiseries,Qualité_isolation_murs,Qualité_isolation_plancher_bas,Qualité_isolation_plancher_haut_comble_aménagé,Qualité_isolation_plancher_haut_comble_perdu,Qualité_isolation_plancher_haut_toit_terrase,Surface_habitable_immeuble,Surface_habitable_logement,Type_bâtiment
0,2289E0123978W,,,35.0,4.4,,,,,,...,insuffisante,insuffisante,insuffisante,insuffisante,,insuffisante,,,35.0,maison
1,2274E1764260B,,,85.4,11.0,,,,,,...,très bonne,moyenne,bonne,très bonne,,très bonne,,,85.4,appartement
2,2206E2755246H,,,114.0,29.8,,,PAC air/air installée entre 2008 et 2014,,,...,bonne,insuffisante,bonne,bonne,,moyenne,,,227.9,maison
3,2211E2030092Y,,,39.0,5.1,,,,,,...,insuffisante,moyenne,insuffisante,très bonne,,très bonne,,,39.0,appartement
4,2290E0952951M,,,120.4,15.1,,,,,,...,très bonne,très bonne,très bonne,bonne,très bonne,,,,120.4,maison


In [15]:
# Same column selection as the one applied to the training features, so the
# model receives the columns it was fitted on, in the same order.
valuate_data = val.loc[:, numerical_cols[1:]]

In [16]:
# Preview the features that will be fed to the model.
valuate_data.head()

Unnamed: 0,Facteur_couverture_solaire_saisi,Surface_habitable_desservie_par_installation_ECS,Emission_GES_éclairage,Conso_5_usages_é_finale_énergie_n°2,Surface_totale_capteurs_photovoltaïque,Conso_chauffage_dépensier_installation_chauffage_n°1,Coût_chauffage_énergie_n°2,Emission_GES_chauffage_énergie_n°2,Code_postal_(brut),Facteur_couverture_solaire,Année_construction,Code_postal_(BAN),Conso_5_usages/m²_é_finale,Conso_5_usages_é_finale,Hauteur_sous-plafond,Surface_habitable_immeuble,Surface_habitable_logement
0,,35.0,4.4,,,11491.2,,,89200,,1947.0,89200.0,307.0,10777.1,2.5,,35.0
1,,85.4,11.0,,,2892.4,,,74940,,,74600.0,57.0,4932.8,2.5,,85.4
2,,114.0,29.8,,,6791.3,,,6410,,,6410.0,45.0,10307.1,2.5,,227.9
3,,39.0,5.1,,,2081.6,,,11000,,,11000.0,81.0,3159.4,3.5,,39.0
4,,120.4,15.1,,,2024.7,,,90400,,2019.0,90400.0,36.0,4430.8,2.5,,120.4


In [17]:
# Predicted class indices for the rows of `val`.
predictions2 = model.predict(valuate_data)

In [20]:
# Display the predicted class indices.
predictions2

array([6, 2, 1, ..., 1, 3, 2])

In [22]:
# Empty frame that will hold the submission; starts with the label column only.
df = pd.DataFrame(columns=['Etiquette_DPE'])

In [24]:
# Store the predicted class indices (integers) in the label column.
df['Etiquette_DPE'] = predictions2

In [26]:
# Convert the predicted class indices back to their original labels.
#
# The fitted LabelEncoder performs the decoding instead of a hand-written
# {0: 'A', ..., 6: 'G'} table, so the letters always follow the encoder's own
# class order. An index the encoder does not know raises ValueError rather
# than silently becoming NaN. Decoding straight from `predictions2` also keeps
# the cell idempotent: re-running it yields the same column.
df['Etiquette_DPE'] = label_encoder.inverse_transform(predictions2)

In [28]:
# Attach the "N°DPE" identifier column from `val`; the assignment aligns rows
# on the index of the two frames.
df["N°DPE"] = val["N°DPE"]

In [30]:
# Put the identifier first, followed by the predicted label.
df = df[["N°DPE", "Etiquette_DPE"]]

In [31]:
# Preview the submission frame before writing it out.
df.head()

Unnamed: 0,N°DPE,Etiquette_DPE
0,2289E0123978W,G
1,2274E1764260B,C
2,2206E2755246H,B
3,2211E2030092Y,D
4,2290E0952951M,B


In [32]:
# Write the submission as UTF-8 CSV without the row index.
df.to_csv("result.csv", index=False, encoding="utf-8")

In [33]:
# XGBClassifier has no `save` method (that is the Keras API); XGBoost's own
# serializer is `save_model`, and the ".json" suffix selects its JSON format.
model.save_model("model.json")

In [34]:
# Persist the fitted classifier as a pickle (`pickle` is imported in the
# notebook's top import cell).
# NOTE: per the XGBoost model-IO guide, a pickle is tied to the Python and
# xgboost versions that wrote it; prefer `save_model` for long-term storage.
model_filepath = 'xgboost_model.pkl'
with open(model_filepath, 'wb') as model_file:  # binary mode is required by pickle
    pickle.dump(model, model_file)
print(f"XGBoost model saved to {model_filepath}")

XGBoost model saved to xgboost_model.pkl


In [ ]:
# Round trip: read the pickled classifier back from disk.
# Only unpickle files you created yourself; pickle.load can execute arbitrary code.
with open(model_filepath, mode='rb') as fh:
    loaded_model = pickle.load(fh)

# The restored estimator is used like the in-memory one, e.g.:
# predictions = loaded_model.predict(X_test)