# Récapitulatif séance précédente

In [1]:
import polars as pl
import re
import json
from requests import get
from typing import TypedDict


In [113]:
from itertools import islice

In [27]:
from rich import print
from rich.table import Table

In [2]:
requete = get("https://raw.githubusercontent.com/VPerrollaz/immobilier/refs/heads/master/donnees/brute.json")

In [3]:
annonces = list()
for ligne in requete.text.splitlines():
    annonces.append(json.loads(ligne))

In [4]:
brut = pl.DataFrame(annonces)
brut.describe()

statistic,id,genre,prix,pcs,desc,lien
str,str,str,str,str,str,str
"""count""","""1818""","""1818""","""1818""","""1818""","""1818""","""1818"""
"""null_count""","""0""","""0""","""0""","""0""","""0""","""0"""
"""mean""",,,,,,
"""std""",,,,,,
"""min""","""annonce-100241563-317111""","""Appartement""","""1 030 000 €""","""""","""""TOURS GRAMMONT"" Dernier étage…","""https://www.bellesdemeures.com…"
"""25%""",,,,,,
"""50%""",,,,,,
"""75%""",,,,,,
"""max""","""annonce-98789327-311359""","""Terrain""","""€""","""92 m²""","""ÉLÉGANTE MAISON BOURGEOISE, po…","""https://www.selogerneuf.com/an…"


In [15]:
genre_valide = {"Appartement", "Appartement neuf", "Maison / Villa", "Maison / Villa neuve"}
motif_prix = "([0-9 ]+) €"
motif_surface = "(([0-9]+)|([0-9]+,[0-9]+)) m²"
motif_pieces = "([0-9]+) p"

In [47]:
filtre = (
    brut
        .unique()
        .drop("id")
        .filter(pl.col("genre").is_in(genre_valide))
        .filter(pl.col("prix").str.find(motif_prix).is_not_null())
        .filter(pl.col("pcs").str.find(motif_surface).is_not_null())
        .filter(pl.col("pcs").str.find(motif_pieces).is_not_null())
)
final = filtre.select(
            pl.col("prix").str.extract(motif_prix).str.replace_all(" ", "").cast(int),
            pl.col("genre").str.find("Maison").is_not_null().alias("maison"),
            pl.col("genre").str.find("neuf|neuve").is_not_null().alias("neuf"),
            pl.col("pcs").str.extract(motif_pieces).cast(int).alias("nb_pieces"),
            pl.col("pcs").str.extract(motif_surface).str.replace(",", ".").cast(float).alias("surface"),
        )


# Nouvelle séance

**EXERCICE** Explorer le contenu des variables `desc` et `lien` pour voir si on peut extraire des variables explicatives.

In [48]:
echantillon = filtre.sample(15).select("desc", "lien")

In [49]:
echantillon

desc,lien
str,str
"""IAD France - Jérôme CORTABITAR…","""https://www.seloger.com/annonc…"
"""Magnifique appartement au 3ème…","""https://www.seloger.com/annonc…"
"""Tours Giraudeau: maison de vil…","""https://www.seloger.com/annonc…"
"""TOURS NORD - ST SYMPHORIEN, Ré…","""https://www.seloger.com/annonc…"
"""Tours Sud, à 2 pas des granges…","""https://www.seloger.com/annonc…"
…,…
"""IAD France - Catherine ROGER v…","""https://www.seloger.com/annonc…"
"""Les appartements neufs propose…","""https://www.selogerneuf.com/an…"
"""Tours nord proche toutes commo…","""https://www.seloger.com/annonc…"
"""Super appartement de 73,16 M² …","""https://www.seloger.com/annonc…"


In [50]:
t = Table(*echantillon.columns)


In [51]:
for row in echantillon.rows():
    t.add_row(*row)

In [52]:
print(t)

In [53]:
for row in echantillon.select("lien").rows():
    print( row)

## Utilisation de la variable lien

In [44]:
echantillon.select(
    pl
        .col("lien")
        .str.replace("https://www.seloger.com/annonces/", "")
        .str.split("/")
).select(
    pl.col('lien').list.get(0).alias("zero"),
    pl.col('lien').list.get(1).alias("un"),
)

zero,un
str,str
"""achat""","""appartement"""
"""achat""","""appartement"""
"""investissement""","""appartement"""
"""achat""","""appartement"""
"""achat""","""appartement"""
…,…
"""achat""","""maison"""
"""achat""","""appartement"""
"""achat""","""divers"""
"""achat""","""appartement"""


In [71]:
exploration = filtre.select(
    pl
        .col("lien")
        .str.replace("https://www.seloger(neuf)?.com/", "")
        .str.split("/")
).select(
    pl.col('lien').list.get(0).alias("zero"),
    pl.col('lien').list.get(1).alias("un"),
    pl.col('lien').list.get(2).alias("deux"),
)

In [72]:
exploration.group_by("zero").len()

zero,len
str,u32
"""annonces""",1578
"""https:""",47


In [75]:
(filtre
 .filter(pl.col("lien").str.starts_with("https://www.seloger.com/").not_())
 .filter(pl.col("lien").str.starts_with("https://www.selogerneuf.com/").not_())
 .filter(pl.col("lien").str.starts_with("https://www.bellesdemeures.com/").not_())
)

genre,prix,pcs,desc,lien
str,str,str,str,str


In [77]:
# final
filtre.select(
            pl.col("prix").str.extract(motif_prix).str.replace_all(" ", "").cast(int),
            pl.col("genre").str.find("Maison").is_not_null().alias("maison"),
            pl.col("genre").str.find("neuf|neuve").is_not_null().alias("neuf"),
            pl.col("pcs").str.extract(motif_pieces).cast(int).alias("nb_pieces"),
            pl.col("pcs").str.extract(motif_surface).str.replace(",", ".").cast(float).alias("surface"),
            pl.col("lien").str.starts_with("https://www.seloger.com/").alias("seloger"),
            pl.col("lien").str.starts_with("https://www.selogerneuf.com/").alias("selogerneuf"),
            pl.col("lien").str.starts_with("https://www.bellesdemeures.com/").alias("bellesdemeures"),
    )

prix,maison,neuf,nb_pieces,surface,seloger,selogerneuf,bellesdemeures
i64,bool,bool,i64,f64,bool,bool,bool
243800,false,false,5,91.0,true,false,false
129000,false,false,2,41.0,true,false,false
119900,false,false,2,49.0,true,false,false
99900,false,false,4,72.0,true,false,false
113500,false,false,1,45.0,true,false,false
…,…,…,…,…,…,…,…
67500,false,false,1,30.0,true,false,false
143000,false,true,1,31.0,false,true,false
181900,false,false,3,64.0,true,false,false
170000,false,false,3,83.0,true,false,false


In [88]:
exploration = filtre.select(
    pl
        .col("lien")
        .str.replace("https://www.((seloger)|(selogerneuf)|(bellesdemeures)).com/annonces/", "")
        .str.split("/")
).select(
    pl.col('lien').list.get(0).alias("zero"),
    pl.col('lien').list.get(1).alias("un"),
    pl.col("lien").list.get(2).alias("deux"),
    pl.col("lien").list.get(3).alias("trois"),
)

In [81]:
exploration.group_by("zero").len()

zero,len
str,u32
"""achat-de-prestige""",156
"""investissement""",231
"""achat""",1238


In [84]:
# final
filtre.select(
            pl.col("prix").str.extract(motif_prix).str.replace_all(" ", "").cast(int),
            pl.col("genre").str.find("Maison").is_not_null().alias("maison"),
            pl.col("genre").str.find("neuf|neuve").is_not_null().alias("neuf"),
            pl.col("pcs").str.extract(motif_pieces).cast(int).alias("nb_pieces"),
            pl.col("pcs").str.extract(motif_surface).str.replace(",", ".").cast(float).alias("surface"),
            pl.col("lien").str.starts_with("https://www.seloger.com/").alias("seloger"),
            pl.col("lien").str.starts_with("https://www.selogerneuf.com/").alias("selogerneuf"),
            pl.col("lien").str.starts_with("https://www.bellesdemeures.com/").alias("bellesdemeures"),
            (
                (pl.col("lien")
                .str.replace("https://www.((seloger)|(selogerneuf)|(bellesdemeures)).com/annonces/", "")
                .str.split("/")
                .list.first()
                == "achat-de-prestige")
                .alias("prestige")
            ),
            (
                (pl.col("lien")
                .str.replace("https://www.((seloger)|(selogerneuf)|(bellesdemeures)).com/annonces/", "")
                .str.split("/")
                .list.first()
                == "investissement")
                .alias("investissement")
            ),
            (
                (pl.col("lien")
                .str.replace("https://www.((seloger)|(selogerneuf)|(bellesdemeures)).com/annonces/", "")
                .str.split("/")
                .list.first()
                == "achat")
                .alias("achat")
            ),
    
    )

prix,maison,neuf,nb_pieces,surface,seloger,selogerneuf,bellesdemeures,prestige,investissement,achat
i64,bool,bool,i64,f64,bool,bool,bool,bool,bool,bool
243800,false,false,5,91.0,true,false,false,false,false,true
129000,false,false,2,41.0,true,false,false,false,false,true
119900,false,false,2,49.0,true,false,false,false,false,true
99900,false,false,4,72.0,true,false,false,false,false,true
113500,false,false,1,45.0,true,false,false,false,true,false
…,…,…,…,…,…,…,…,…,…,…
67500,false,false,1,30.0,true,false,false,false,false,true
143000,false,true,1,31.0,false,true,false,false,true,false
181900,false,false,3,64.0,true,false,false,false,false,true
170000,false,false,3,83.0,true,false,false,false,false,true


In [85]:
exploration.group_by("un").len()

un,len
str,u32
"""maison""",391
"""appartement-luxe""",2
"""appartement""",1232


In [87]:
exploration.group_by("deux").len()

deux,len
str,u32
"""tours-37""",1625


In [89]:
exploration.group_by("trois").len()

trois,len
str,u32
"""133304129.htm""",1
"""141370111.htm""",1
"""140091979.htm""",1
"""122475421.htm""",1
"""134929917""",1
…,…
"""134639223""",1
"""127445727.htm""",1
"""141494957.htm""",1
"""140526405.htm""",1


In [90]:
exploration["trois"].unique()

trois
str
"""140952785.htm"""
"""122475421.htm"""
"""139136509.htm"""
"""136155203.htm"""
"""126864691.htm"""
…
"""130217761.htm?ci=370261&idqfix…"
"""138526881.htm"""
"""141465057.htm"""
"""122413077.htm"""


On pourrait essayer de filtrer ceux qui ne contiennent pas htm

## Exploitation de la variable desc

In [91]:
descriptions = filtre["desc"]

In [92]:
type(descriptions)

polars.series.series.Series

In [96]:
for desc in descriptions.sample(10):
    print(desc)

**REMARQUE** une première approche consiste à identifier visuellement des mots/expressions clefs.
Puis de faire une variable explicative, suivant que la description la contient ou pas.
exemple tram/proximité tram, gare/proximité gare, quartier spécifique...

**EXERCICE** déterminer 5 mots clefs que vous voulez employer.

In [119]:
filtre.select(
    pl.col("desc").str.to_lowercase().str.contains("tram"),
).group_by("desc").len()

desc,len
bool,u32
True,203
False,1422


In [122]:
# final
final = filtre.select(
            pl.col("prix").str.extract(motif_prix).str.replace_all(" ", "").cast(int),
            pl.col("genre").str.find("Maison").is_not_null().alias("maison"),
            pl.col("genre").str.find("neuf|neuve").is_not_null().alias("neuf"),
            pl.col("pcs").str.extract(motif_pieces).cast(int).alias("nb_pieces"),
            pl.col("pcs").str.extract(motif_surface).str.replace(",", ".").cast(float).alias("surface"),
            pl.col("lien").str.starts_with("https://www.seloger.com/").alias("seloger"),
            pl.col("lien").str.starts_with("https://www.selogerneuf.com/").alias("selogerneuf"),
            pl.col("lien").str.starts_with("https://www.bellesdemeures.com/").alias("bellesdemeures"),
            (
                (pl.col("lien")
                .str.replace("https://www.((seloger)|(selogerneuf)|(bellesdemeures)).com/annonces/", "")
                .str.split("/")
                .list.first()
                == "achat-de-prestige")
                .alias("prestige")
            ),
            (
                (pl.col("lien")
                .str.replace("https://www.((seloger)|(selogerneuf)|(bellesdemeures)).com/annonces/", "")
                .str.split("/")
                .list.first()
                == "investissement")
                .alias("investissement")
            ),
            (
                (pl.col("lien")
                .str.replace("https://www.((seloger)|(selogerneuf)|(bellesdemeures)).com/annonces/", "")
                .str.split("/")
                .list.first()
                == "achat")
                .alias("achat")
            ),
            pl.col("desc").str.to_lowercase().str.contains("tram").alias("tram"),
    pl.col("desc").str.to_lowercase().str.contains("gare").alias("gare"),
    )

In [123]:
final.describe()

statistic,prix,maison,neuf,nb_pieces,surface,seloger,selogerneuf,bellesdemeures,prestige,investissement,achat,tram,gare
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",1625.0,1625.0,1625.0,1625.0,1625.0,1625.0,1625.0,1625.0,1625.0,1625.0,1625.0,1625.0,1625.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",263295.091077,0.240615,0.134769,4.055385,98.230351,0.836923,0.134154,0.028923,0.096,0.142154,0.761846,0.124923,0.048
"""std""",268122.707066,,,2.443577,82.60425,,,,,,,,
"""min""",29800.0,0.0,0.0,1.0,11.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",121900.0,,,3.0,57.39,,,,,,,,
"""50%""",179747.0,,,3.0,73.23,,,,,,,,
"""75%""",297000.0,,,5.0,108.0,,,,,,,,
"""max""",5596080.0,1.0,1.0,25.0,1400.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


**EXERCICE** Créer le dataset puis sélectionner un modèle.

In [125]:
type(final["prix"])

polars.series.series.Series

In [127]:
X = final.select(pl.all().exclude("prix")).to_numpy()
X.shape

(1625, 12)

In [128]:
y = final["prix"].to_numpy()
y.shape

(1625,)

In [130]:
X.dtype

dtype('float64')

In [131]:
from sklearn.model_selection import train_test_split, cross_val_score

In [132]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y)

In [133]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [139]:
lr = LinearRegression()
cross_val_score(lr, X_tr, y_tr, cv=5)

array([0.28391175, 0.7766156 , 0.71650146, 0.76509436, 0.77343693])

In [138]:
rf = RandomForestRegressor()
cross_val_score(rf, X_tr, y_tr)

array([0.77691801, 0.63082172, 0.78317019, 0.76037752, 0.71965005])

In [140]:
gb = GradientBoostingRegressor()
cross_val_score(gb, X_tr, y_tr)

array([0.73425673, 0.63374028, 0.77855052, 0.71822454, 0.69740466])

In [142]:
for nb_voisins in range(2, 15):
    pred = KNeighborsRegressor(n_neighbors=nb_voisins)
    score = cross_val_score(pred, X_tr, y_tr)
    print(f"nb_voisins: {nb_voisins}, scores: {score}")

**REMARQUE** alternativement/de manière complémentaire, on peut concaténer toutes les descriptions puis faire une analyse lexicale de ce texte pour déterminer des mots importants revenant mais pas trop.
On a deux bibliothèques permettant de faire ce genre de manipulations: `nltk`, `spacy`.

En ce qui concerne `spacy`, on pourra regarder les 5 vidéos de Vincent Warmerdam sur youtube qui constituent une bonne introduction à la bibliothèque.

In [103]:
 text_desc = "\n".join(descriptions)

In [104]:
type(text_desc)

str

In [105]:
len(text_desc)

299629

In [106]:
import spacy

In [107]:
nlp = spacy.load("fr_core_news_sm")

In [108]:
doc = nlp(text_desc)

In [109]:
type(doc)

spacy.tokens.doc.Doc

In [111]:
docit = iter(doc)
token = next(docit)

In [114]:
dir(token)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang

In [115]:
for token in islice(doc, 10):
    print(token.text)

In [116]:
for ent in doc.ents:
    print(ent)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



A Suivre à la prochaine séance.