# Objectif

Finir l'exploitation du *vrai* dataset commencée aux séances 09 et 10.

In [30]:
from requests import get
import pandas as pd
import json
import re
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.optimize import minimize_scalar, minimize, basinhopping

In [4]:
ADRESSE = "https://raw.githubusercontent.com/VPerrollaz/immobilier/master/donnees/brute.json"
NEUF = {'Appartement neuf', 'Maison / Villa neuve'}
MAISON =  {'Maison / Villa', 'Maison / Villa neuve'}
GENRES_VALIDES = {'Appartement', 'Appartement neuf', 'Maison / Villa', 'Maison / Villa neuve'}
MOTIF_SURFACE = re.compile("^.*?([0-9]+(,[0-9]+)?) m².*$")
MOTIF_PIECES = re.compile("^.*?([0-9]+) p.*$")
MOTIF_CHAMBRES = re.compile("^.*?([0-9]+) ch.*$")

In [5]:
# génération du dataframe
page = get(ADRESSE)
contenu = page.text
data = [json.loads(ligne) for ligne in contenu.splitlines()]
df = pd.DataFrame(data=data)

# gestion id
df.drop_duplicates(subset="id", inplace=True)
df.drop(columns="id", inplace=True)
df.reset_index(inplace=True, drop=True)

# gestion genre
df.drop(df[~df["genre"].isin(GENRES_VALIDES)].index, inplace=True)
df.reset_index(inplace=True, drop=True)
df["neuf"] = df["genre"].isin(NEUF)
df["maison"] = df["genre"].isin(MAISON)
df.drop(columns="genre", inplace=True)
df.reset_index(inplace=True, drop=True)

# gestion prix
df["prix_1"] = df["prix"].str.replace("€", "").str.replace(" ","").str.replace("HH", "")
df.drop(df[df["prix_1"] == ""].index, inplace=True)
df["target_prix"] = df["prix_1"].astype(float)
df.drop(columns="prix_1", inplace=True)
df.drop(columns="prix", inplace=True)
df.reset_index(inplace=True, drop=True)

# gestion pcs
df.drop(df[~df["pcs"].str.match(MOTIF_SURFACE)].index, inplace=True)
df.reset_index(inplace=True, drop=True)
df["surface"] = df["pcs"].str.extract(MOTIF_SURFACE)[0].str.replace(",", ".").astype(float)
df.reset_index(inplace=True, drop=True)
df["pieces"] = df["pcs"].str.extract(MOTIF_PIECES).astype(float)
df["chambres"] = df["pcs"].str.extract(MOTIF_CHAMBRES).astype(float)
df.drop(columns="pcs", inplace=True)
df

Unnamed: 0,desc,lien,neuf,maison,target_prix,surface,pieces,chambres
0,Appartement type 3 - TOURS CATHÉDRALE TOURS CA...,https://www.seloger.com/annonces/achat/apparte...,False,False,374400.0,90.00,3.0,2.0
1,TOURS HYPERCENTRE - Appartement TOURS HYPERCEN...,https://www.seloger.com/annonces/achat/apparte...,False,False,499200.0,146.27,5.0,4.0
2,TOURS PRÉBENDES NORD - APPARTEMENT TOURS PRÉBE...,https://www.seloger.com/annonces/achat/apparte...,False,False,499200.0,110.00,5.0,3.0
3,TOURS PRÉBENDES - PARTICULIER TOURANGEAUX TOUR...,https://www.seloger.com/annonces/achat/maison/...,False,True,508000.0,132.00,6.0,4.0
4,TOURS STRASBOURG / RABELAIS - Maison TOURS STR...,https://www.seloger.com/annonces/achat-de-pres...,False,True,676000.0,185.00,7.0,5.0
...,...,...,...,...,...,...,...,...
1620,Maisons de ville 3 chambres avec jardin ou app...,https://www.selogerneuf.com/annonces/achat/app...,True,False,254900.0,66.30,3.0,
1621,"TOURS - Fontaines, Appartement de type 2 compr...",https://www.seloger.com/annonces/achat/apparte...,False,False,61500.0,42.00,2.0,1.0
1622,TOURS - Appartement de Type 3 comprenant séjou...,https://www.seloger.com/annonces/achat/apparte...,False,False,108500.0,76.00,3.0,2.0
1623,Maisons de ville 3 chambres avec jardin ou app...,https://www.selogerneuf.com/annonces/achat/mai...,True,True,320000.0,84.40,4.0,


In [7]:
X = df[["neuf", "maison", "pieces", "surface", "chambres"]].astype(float).values
y = df["target_prix"].values
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)

In [11]:
def affichage(modele):
    indice_meilleur = modele.cv_results_["rank_test_score"].argmin()
    print("Paramètres gagnants: ", modele.cv_results_["params"][indice_meilleur])
    print("Classement pour confirmation: ", modele.cv_results_["rank_test_score"][indice_meilleur])
    print("Score: ", modele.cv_results_["mean_test_score"][indice_meilleur])
    print("Déviation standard: ", modele.cv_results_["std_test_score"][indice_meilleur])

In [12]:
%%time
# entrainement ridge
ri = Pipeline(
    [
        ("imputation", SimpleImputer()),
        ("entrainement", Ridge()),
    ]
)
gr = GridSearchCV(
    estimator=ri,
    param_grid={
        "imputation__strategy": ["mean", "median", "most_frequent"],
        "entrainement__alpha": [2  ** p for p in range(-6, 7)],
    }
)
gr.fit(X_tr, y_tr)
affichage(gr)

Paramètres gagnants:  {'entrainement__alpha': 0.015625, 'imputation__strategy': 'mean'}
Classement pour confirmation:  1
Score:  0.6655342855820032
Déviation standard:  0.09186527011337503


In [13]:
%%time
# entrainement kneighbors
knr = Pipeline(
    [
        ("imputation", SimpleImputer()),
        ("entrainement", KNeighborsRegressor()),
    ]
)
gr = GridSearchCV(
    estimator=knr,
    param_grid={
        "imputation__strategy": ["mean", "median", "most_frequent"],
        "entrainement__n_neighbors": range(2, 15),
    }
)
gr.fit(X_tr, y_tr)
affichage(gr)

Paramètres gagnants:  {'entrainement__n_neighbors': 12, 'imputation__strategy': 'median'}
Classement pour confirmation:  1
Score:  0.7319512650907963
Déviation standard:  0.05357288058074905


In [24]:
%%time
# support vecteur
svr = Pipeline(
    [
        ("imputation", SimpleImputer()),
        ("echelle", StandardScaler()),
        ("entrainement", SVR()),
    ]
)
gr = GridSearchCV(
    estimator=svr,
    param_grid={
        "imputation__strategy": ["mean", "median", "most_frequent"],
        "entrainement__C": np.logspace(-4, 16, 15, base=2),
        "entrainement__epsilon": [0.1, 0.4, 0.7],
    }
)
gr.fit(X_tr, y_tr)
affichage(gr)

Paramètres gagnants:  {'entrainement__C': 65536.0, 'entrainement__epsilon': 0.1, 'imputation__strategy': 'median'}
Classement pour confirmation:  1
Score:  0.670975298741707
Déviation standard:  0.07033323003262947
CPU times: total: 49.2 s
Wall time: 49.2 s


In [42]:
%%time
for strategy in ["mean", "median", "most_frequent"]:
    for echelle in [MinMaxScaler, StandardScaler]:
        def a_minimiser(c: float) -> float:
            svr = Pipeline(
                [
                    ("imputation", SimpleImputer(strategy=strategy)),
                    ("echelle", echelle()),
                    ("entrainement", SVR(C=c)),
                ]
            )
            return -cross_val_score(svr, X_tr, y_tr).mean()
        resultat = minimize_scalar(fun=a_minimiser, bounds=(1, 1000000), method="bounded")
        print(strategy, str(echelle), resultat.x, -resultat.fun)
            


mean <class 'sklearn.preprocessing._data.MinMaxScaler'> 304218.64113668306 0.5269195923905998
mean <class 'sklearn.preprocessing._data.StandardScaler'> 57653.85855807888 0.5655105924060114
median <class 'sklearn.preprocessing._data.MinMaxScaler'> 293306.0627049666 0.5258126465636141
median <class 'sklearn.preprocessing._data.StandardScaler'> 58591.283068386474 0.5652880046372333
most_frequent <class 'sklearn.preprocessing._data.MinMaxScaler'> 293306.0627049666 0.5258126465636141
most_frequent <class 'sklearn.preprocessing._data.StandardScaler'> 58591.283068386474 0.5652880046372333


In [49]:
%%time

scale = MinMaxScaler()
print(y_tr.shape)
y_trr = scale.fit_transform(y_tr.reshape(-1, 1)).reshape(-1)

for strategy in ["mean", "median", "most_frequent"]:
    for echelle in [MinMaxScaler, StandardScaler]:
        def a_minimiser(c: float) -> float:
            svr = Pipeline(
                [
                    ("imputation", SimpleImputer(strategy=strategy)),
                    ("echelle", echelle()),
                    ("entrainement", SVR(C=c)),
                ]
            )
            return -cross_val_score(svr, X_tr, y_trr).mean()
        resultat = minimize_scalar(fun=a_minimiser, bounds=(0.1, 10), method="bounded")
        print(strategy, str(echelle), resultat.x, -resultat.fun)

mean <class 'sklearn.preprocessing._data.MinMaxScaler'> 3.603514518681985 0.48540884717048927
mean <class 'sklearn.preprocessing._data.StandardScaler'> 5.350086862523794 0.5108471815398209
median <class 'sklearn.preprocessing._data.MinMaxScaler'> 3.9822511763765593 0.4873436196405338
median <class 'sklearn.preprocessing._data.StandardScaler'> 5.283561331066544 0.5142652543338377
most_frequent <class 'sklearn.preprocessing._data.MinMaxScaler'> 3.9822511763765593 0.4873436196405338
most_frequent <class 'sklearn.preprocessing._data.StandardScaler'> 5.283561331066544 0.5142652543338377
CPU times: total: 12 s
Wall time: 12 s


In [21]:
%%time
# random forest
rfr= Pipeline(
    [
        ("imputation", SimpleImputer()),
        ("entrainement", RandomForestRegressor()),
    ]
)
gr = GridSearchCV(
    estimator=rfr,
    param_grid={
        "imputation__strategy": ["mean", "median", "most_frequent"],
        "entrainement__n_estimators": range(50, 200, 10),
        "entrainement__max_features": [None],
    }
)
gr.fit(X_tr, y_tr)
affichage(gr)

Paramètres gagnants:  {'entrainement__max_features': None, 'entrainement__n_estimators': 130, 'imputation__strategy': 'most_frequent'}
Classement pour confirmation:  1
Score:  0.680861765751208
Déviation standard:  0.028454032436913638
CPU times: total: 59.2 s
Wall time: 59.2 s


In [53]:
%%time
# gradient boosting
gbr = Pipeline(
    [
        ("imputation", SimpleImputer()),
        ("entrainement", GradientBoostingRegressor()),
    ]
)
gr = GridSearchCV(
    estimator=gbr,
    param_grid={
        "imputation__strategy": ["mean", "median", "most_frequent"],
        "entrainement__learning_rate": (0.005, 0.01, 0.1, 0.5),
        "entrainement__n_estimators": (50, 100, 150, 200, 400),
    }
)
gr.fit(X_tr, y_tr)
affichage(gr)

Paramètres gagnants:  {'entrainement__learning_rate': 0.01, 'entrainement__n_estimators': 200, 'imputation__strategy': 'most_frequent'}
Classement pour confirmation:  1
Score:  0.7002560409931854
Déviation standard:  0.06496757695108635
CPU times: total: 33.1 s
Wall time: 33.1 s


Code originale partant dans le décor pour les réseaux de neurones

```python
%%time
# réseaux de neurones
mlp  = Pipeline(
    [
        ("imputation", SimpleImputer()),
        ("echelle", MinMaxScaler()),
        ("entrainement", MLPRegressor()),
    ]
)
gr = GridSearchCV(
    estimator=mlp,
    param_grid={
        "imputation__strategy": ["mean", "median", "most_frequent"],
        "entrainement__hidden_layer_sizes": [(100,), (150,)],
        "entrainement__max_iter": [1000],
    }
)
gr.fit(X_tr, y_tr)
affichage(gr)
```

In [61]:
%%time
# réseaux de neurones
# On fait un scaling sur la sortie aussi

scale = MinMaxScaler()
y_trr = scale.fit_transform(y_tr.reshape(-1, 1)).reshape(-1)

mlp  = Pipeline(
    [
        ("imputation", SimpleImputer()),
        ("echelle", MinMaxScaler()),
        ("entrainement", MLPRegressor()),
    ]
)
gr = GridSearchCV(
    estimator=mlp,
    param_grid={
        "imputation__strategy": ["mean", "median", "most_frequent"],
        "entrainement__hidden_layer_sizes": [(100,), (150,), (300,), (500,), (50, 50)],
        "entrainement__max_iter": [1000],
    }
)
gr.fit(X_tr, y_trr)
affichage(gr)

Paramètres gagnants:  {'entrainement__hidden_layer_sizes': (100,), 'entrainement__max_iter': 1000, 'imputation__strategy': 'most_frequent'}
Classement pour confirmation:  1
Score:  0.701194732475153
Déviation standard:  0.036041026836649094
CPU times: total: 28.5 s
Wall time: 4.75 s


# Conclusion

On aurait sans doute dûe faire un scaling sur le dataframe de la colonne des prix.
L'ordre de grandeur entre les entée et sorties étant beaucoup trop important.

In [62]:
knr = Pipeline(
    [
        ("imputation", SimpleImputer(strategy="median")),
        ("entrainement", KNeighborsRegressor(n_neighbors=12)),
    ]
)
knr.fit(X_tr, y_tr)


In [63]:
print(knr.score(X_tr, y_tr))

0.7857876579289715


In [64]:
print(knr.score(X_te, y_te))

0.44590981537599805


Malgré la cross validation on a une grosse déviation entre les deux scores.
Le prédicteur est en surapprentissage.

Il faut repartir du début, ou pourrait ajuster la taille respective des datasets d'entrainement et de test.

On pourrait aussi essayer d'ajouter une nouvelle variable explicative `quartier`, en utilisant les colonnes `desc` et `lien`.