In [2]:
#manipulate/organize data/visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#preprocess data
import unicodedata as ud
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
#Linear regression to fill missing data for certain types
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import mean_absolute_error, r2_score

#### How to use ? :
    You need a scraped data file from seloger with at least one column, "title", that contains the title text scraped for each property.
    Use the data_sample_{num_rows}.csv files, where num_rows indicates the size of the dataset. At the time of writing,
    we have 2 files: one contains ~1600 properties and one ~4743 (including both residential and commercial); only residential properties are selected in this code.

    To use it, simply pass either the DataFrame of the imported CSV file or the file path to the function.
    ex : 
        vectorize_dataset_seloger(df=merged_df) // this is for pandas.DataFrame input
        vectorize_dataset_seloger(csv_file = "folder/fname") // this is for direct import
    if you want to save the processed df :
        vectorize_dataset_seloger(df=merged_df, save = True)
    

In [3]:
"""
    the first dataset (about 1500 properties) i scraped only contain these locations. 
    (i carefully extracted all preprocessed tokens and collect all distinct locations to make this list,
    i added some exception for some locations because my preprocessing code don't reduce them exactly like some other do.
    Although it's not me who assigned them the postcodes.
"""
STEM_TO_POSTCODES = {
    "BOULOGN": "92000",                       # Boulogne-Billancourt
    "BILLANCOURT": "92000",                   # Boulogne-Billancourt
    "BOULOGN BILLANCOURT": "92000",           # Boulogne-Billancourt
    "NEUILLY": "92000",                       # Neuilly-sur-Seine
    "NEUILLY SEIN": "92000",                  # Neuilly-sur-Seine
    "LEVALLOIS PERRET": "92000",              # Levallois-Perret
    "CLICHY": "92000",                        # Clichy
    "CLICHY GAREN": "92000",                  # Clichy-la-Garenne (same CP)
    "SAINT CLOUD": "92000",                   # Saint-Cloud
    "CLOUD": "92000",                         # Saint-Cloud
    "PUTEAU": "92000",                        # Puteaux
    "SURESN": "92000",                        # Suresnes
    "ISSY": "92000",                          # Issy-les-Moulineaux
    "ISSY MOULINEAU": "92000",                # Issy-les-Moulineaux
    "MOULINEAU": "92000",                     # Issy-les-Moulineaux
    "MONTROUG": "92000",                      # Montrouge
    "LE LIL": "93000",                        # Les Lilas
    "AUBERVILLI": "93000",                    # Aubervilliers
    "SAINT OUEN SEIN": "93000",               # Saint-Ouen-sur-Seine
    "OUEN": "93000",                          # Saint-Ouen-sur-Seine
    "CHARENTON": "94000",                     # Charenton-le-Pont
    "CHARENTON PONT" : "94000",
    "VANV": "92000",                          # Vanves
    "MONTREUIL": "93000",                     # Montreuil
    "PANTIN": "93000",                        # Pantin
    "PONT": "94000",                          # (mapped by you to Charenton-le-Pont)
    "SEIN": "92000",                          # (mapped by you to Neuilly-sur-Seine)
    "SAINT DEN" : "93000",
    "DEN": "93000",                           # Saint-Denis
    "PARIS": "75000",
    "BAGNOLET": "93000"
}
#decide which tokens are valid; other tokens were not carefully reviewed by a human and would therefore likely produce errors and inaccuracies.
valid_tokens = {'APPART A VENDR',
     'AUBERVILLI',
     'BAGNOLET',
     'BOULOGN BILLANCOURT',
     'CHAMBR',
     'CHARENTON PONT',
     'CLICHY',
     'CLICHY GAREN',
     'DISPONIBL MAINTEN',
     'DIVISIBL',
     'DUPLEX A VENDR',
     'ETAG',
     'HOTEL PARTICULI A VENDR',
     'ISSY MOULINEAU',
     'LE LIL',
     'LEVALLOIS PERRET',
     'LOFT A VENDR',
     'MAISON A VENDR',
     'MAISON VILL A VENDR',
     'MONTREUIL',
     'MONTROUG',
     'M²',
     'NEUF',
     'NEUILLY SEIN',
     'PANTIN',
     'PARIS',
     'PIEC',
     'PUTEAU',
     'RDC',
     'SAINT CLOUD',
     'SAINT DEN',
     'SAINT OUEN SEIN',
     'STUDIO A VENDR',
     'SURESN',
     'TERRAIN',
     'TERRAIN CONSTRUCTIBL A VENDR',
     'VANV',
     'VILL A VENDR'}
"""
     We decide to only analyze residential properties and remove all commercial properties,
     later, we will drop all type (categorical data, include postcodes) that do not meet necessary threshold number of datapoint (property).
"""
valid_types = {
    'APPART A VENDR',
    'DUPLEX A VENDR',
    'LOFT A VENDR',
    'MAISON A VENDR',
    'MAISON VILL A VENDR',
    'STUDIO A VENDR',
    'TERRAIN CONSTRUCTIBL A VENDR',
    'VILL A VENDR'
}

#regrex patterns to identify and extract or to replace patterns in string.
# Currency tokens (symbol or word forms)

EUR_PATTERN = re.compile(
    r"(€|\b(?:eur|euro(?:s)?)\b)", re.IGNORECASE
)
# ✅ Matches: "€", "EUR", "euro", "euros", "Prix: 200 eur"
# ❌ Doesn’t: "eurostar" (word boundary blocks it), "EUROPE" (not 'euro' exactly)
# ⚠️ Note: Won’t catch "€1,200" as a whole—this only finds the currency token, not the number.

GBP_PATTERN = re.compile(
    r"(£|\b(?:gbp|pound(?:s)?)\b)", re.IGNORECASE
)
# ✅ Matches: "£", "GBP", "pound", "pounds", "200 gbp"
# ❌ Doesn’t: "pounding", "compound" (word boundary prevents false positives)
# ⚠️ "$" in "CA$" is not relevant here; that’s for USD_PATTERN.

USD_PATTERN = re.compile(
    r"(\$|\b(?:usd|dollar(?:s)?)\b)", re.IGNORECASE
)
# ✅ Matches: "$", "USD", "dollar", "dollars", "2 usd"
# ❌ Doesn’t: "sandollar" (needs word boundary), other currency symbols like "C$", "A$"
# ⚠️ "$" is ambiguous across countries; consider context if you need only US dollars.

# General number with optional decimal (dot or comma)
NUM_PATTERN = re.compile(r"\d+(?:[.,]\d+)?")
# ✅ Matches: "5", "12", "83.5", "1,75", "0003"
# ❌ Doesn’t: signed numbers like "-3.2" (see PATTERN_NUM), fractions like "1/7" as a whole (slashes are handled separately)
# ⚠️ Will also match date parts ("12/10") individually.

# French ordinal like "1ER", "12EM"
ORDINAL_PATTERN = re.compile(r"\b\d+(?:ER|EM)\b", flags=re.IGNORECASE)
# ✅ Matches: "1ER", "2er", "12EM", "3em"
# ❌ Doesn’t: "1ÈRE", "2ÈME" (accented forms), "1st", "2nd"
# ⚠️ If you need accented French ordinals, expand to include ÈRE/ÈME variants.

# Exact "PARIS" (uppercase only)
PARIS_PATTERN = re.compile(r"PARIS")
# ✅ Matches: "PARIS"
# ❌ Doesn’t: "Paris", "paris" (case-sensitive), arrondissements like "75015 Paris" unless uppercased
# 💡 Consider re.compile(r"\bparis\b", re.I) if you want case-insensitive.

# Signed integer/float with dot decimal
PATTERN_NUM = re.compile(r"[+-]?\d+(?:\.\d+)?")
# ✅ Matches: "-12", "+3.5", "0.99"
# ❌ Doesn’t: comma decimals ("1,75"), fractions ("1/7")
# ⚠️ This one is stricter than NUM_PATTERN (only dot decimals, allows sign).

# Area cues: "m²", "m2", "surface…", or "terrain"
PATTERN_AREA = re.compile(r"(?:[-–—]?\s*)(?:M(?:²|2)|SURFAC\w*|TERRAIN)", re.I)
# ✅ Matches: "m²", "M2", "surface", "surfaces", "terrain", with optional preceding dash/space
# ❌ Doesn’t: "m^2" (caret), "area" (English), "superficie" (if that term appears, it’s missed)
# ⚠️ "TERRAIN" here might be too broad if you treat lots vs. living area differently.

# Standalone "TERRAIN"
PATTERN_TERRAIN = re.compile(r"\bTERRAIN\b", re.I)
# ✅ Matches only the word "terrain" as a token
# ❌ Doesn’t: "terrainment", "souterrain" (good—no false positives)

# Any digit anywhere
PATTERN_HAS_DIGIT = re.compile(r"\d")
# ✅ Matches if string contains at least one digit
# ❌ Doesn’t: strings with numbers written as words ("trois", "three")

# Availability "disponible maintenant" with flexible endings
PATTERN_DISPO_NOW = re.compile(r"\bDISPONIBL\w*\s+MAINTEN\w*\b", re.I)
# ✅ Matches: "disponible maintenant", "Disponibilité maintenue" (careful), "disponibles maintenus"
# ❌ Doesn’t: rearranged word order ("maintenant disponible"), abbreviations
# ⚠️ Broad \w* can overmatch ("maintenue"); tighten if needed to specifically "disponible maintenant".

# Floor indicators: "étage" or "RDC" (ground floor)
PATTERN_FLOOR = re.compile(r"\b(?:ETAG\w*|RDC)\b", re.I)
# ✅ Matches: "etage", "etages", "RDC" (accented "étage" only matches once remove_accents has been applied)
# ❌ Doesn’t: "Rez-de-chaussée" written fully (unless you rely on "RDC")
# ⚠️ You also have dedicated ETAG/RDC patterns below; avoid double work.
    
PATTERN_ETAG = re.compile(r"\bETAG\w*\b", re.I)
# ✅ Matches: "etage", "etages", "etagere" (could be false positive); accented "étage" only once accents are stripped
# ❌ Doesn’t: "Rez-de-chaussée", "R+1" (common shorthand not covered)

PATTERN_RDC = re.compile(r"\bRDC\w*\b", re.I)
# ✅ Matches: "RDC", "RDC1" (because \w*), and the "RDC" part of "RDC+1" or "RDC."
# ❌ Doesn’t: "Rez de chaussée" written without "RDC"
# ⚠️ If you don’t want "RDC+1", use r"\bRDC\b".

# Rooms / pieces
PATTERN_PIECE = re.compile(r"\bPIEC\w*\b", re.I)
# ✅ Matches: "piece", "pieces", "piece(s)" (accented "pièce" only once accents are stripped, since "è" is not "E")
# ❌ Doesn’t: abbreviations like "T2", "F3" (common in FR real-estate)

# New / newly built
PATTERN_NEUF = re.compile(r"\bNEUF\w*\b", re.I)
# ✅ Matches: "neuf", "neufs" — not "neuve"/"neuves", which do not start with "NEUF"
# ❌ Doesn’t: "neuf" meaning the number 9 when used numerically (context needed)
# ⚠️ Ambiguous: "neuf" can be adjective "new" or the noun/adjective "nine".

# Bedrooms
PATTERN_CHAMBRE = re.compile(r"\bCHAMBR\w*\b", re.I)
# ✅ Matches: "chambre", "chambres"
# ❌ Doesn’t: abbreviations ("chb", "CH."), "suite parentale" (no keyword "chambre")

# Number BEFORE a slash (capture group = the number)
PATTERN_BEFORE_SLASH = re.compile(r"([+-]?\d+(?:\.\d+)?)\s*/")
# ✅ Matches the "12" in "12 / 5", "+3.5/7"
# ❌ Doesn’t: "12,5/7" (comma decimals), no slash present
# ⚠️ Will also match dates ("12/10")—disambiguate upstream if needed.

# Number AFTER a slash (capture group = the number)
PATTERN_AFTER_SLASH = re.compile(r"/\s*([+-]?\d+(?:\.\d+)?)")
# ✅ Matches the "5" in "12 / 5", the "7" in "3.5/7"
# ❌ Doesn’t: "12/1,5" (comma decimals)

# Literal slash (useful for quick checks/splits)
PATTERN_SLASH = re.compile(r"/")
# ✅ Matches any "/" character
# ❌ Doesn’t: division written as "÷" or "per" words


In [4]:
def remove_accents(text: str)->str:
    """
    Return `text` with French ligatures expanded and combining accents removed.

    input  : string (e.g. "Étage élevé, cœur")
    output : string without diacritics (e.g. "Etage eleve, coeur")
    """
    # Ligatures have no canonical decomposition, so they are expanded by hand first.
    for ligature, expansion in (("œ", "oe"), ("Œ", "OE"), ("æ", "ae"), ("Æ", "AE")):
        text = text.replace(ligature, expansion)
    # NFD splits 'è' into 'e' + combining grave; the combining marks are category "Mn".
    decomposed = ud.normalize("NFD", text)
    without_marks = "".join(ch for ch in decomposed if ud.category(ch) != "Mn")
    # Recompose whatever is left so the result is back in NFC form.
    return str(ud.normalize("NFC", without_marks))

In [5]:
# Built once and reused: creating the stemmer, re-reading the stop-word list and
# recompiling the token regex on every call is wasted work when this runs on every tag.
_FRENCH_NLP_CACHE = {}

def _french_nlp_resources():
    """Return the shared (stemmer, stop-word set, token regex) triple, creating it on first use."""
    if not _FRENCH_NLP_CACHE:
        _FRENCH_NLP_CACHE["stemmer"] = SnowballStemmer("french")
        _FRENCH_NLP_CACHE["stop_fr"] = frozenset(stopwords.words("french"))
        # NOTE: inside [...] the "(m2|m²)" part is not a group; it only adds the literal
        # characters ( ) | m 2 ² to the allowed set. Kept as-is to preserve the tokenisation.
        _FRENCH_NLP_CACHE["token_re"] = re.compile(r"[A-Za-zÀ-ÖØ-öø-ÿ\d/(m2|m²).]+")
    return (_FRENCH_NLP_CACHE["stemmer"],
            _FRENCH_NLP_CACHE["stop_fr"],
            _FRENCH_NLP_CACHE["token_re"])

def preprocess_french_nltk(text: str)->str:
    """
    Normalize a piece of French text so that repeated tokens/patterns are easier to detect.

    Steps: strip accents, tokenize, drop French stop words, stem each token, then
    join the stems with single spaces and upper-case the result.

    input  : string
    output : string (upper-cased, space-separated stems)
    """
    stemmer, stop_fr, token_re = _french_nlp_resources()
    # Work on the accent-free text.
    text = remove_accents(text)
    tkns = token_re.findall(text)
    # The stop-word test is case-sensitive and runs on accent-stripped tokens.
    # NOTE(review): the hand-curated valid_tokens (e.g. 'LE LIL', 'APPART A VENDR')
    # presumably rely on exactly this filtering — do not "fix" it without re-deriving them.
    tkns = [t for t in tkns if t not in stop_fr]
    # Stemming reduces inflected forms to a common stem.
    stems = [stemmer.stem(t) for t in tkns]
    return (" ".join(stems)).upper()
preprocess_french_nltk("test : go goes")

'TEST GO GO'

In [6]:
"""
- after brief visual inspection of the scraped data, we can find some patterns like, 
pieces of informations are separated by "-" and ",".
- but we have some "," that belongs to decimal numbers (e.g. 1,2 5,9 3,14 etc).
- let's find the distinction between a separating "," and a decimal ",":
we concluded that a decimal "," sits directly between digits, while a "," that separates pieces of information is surrounded by spaces.
- let's replace "," for decimal by a ".".
"""
def extract_tokens(text : str = ""):
    """
    Split a raw title into its pieces of information.

    - a "," sitting directly between two digits is a decimal separator and becomes "."
    - a "-" sitting directly between two letters (e.g. "Saint-Cloud") becomes a space
    - every remaining "-" or "," is treated as a separator

    input  : string
    output : list of stripped strings
    """
    with_dot_decimals = re.sub(r"(?<=\d),(?=\d)", ".", text)
    dehyphenated = re.sub(r'(?<=[^\W\d_])-(?=[^\W\d_])', ' ', with_dot_decimals, flags=re.UNICODE)
    return [piece.strip() for piece in re.split("[-,]", dehyphenated)]

In [7]:
def convert_2price(price_str : str = None):
    """
    Turn a price token into a float.

    Prices look like 1234(some separator)5678(some separator)90: only the digit
    characters are kept. Anything from the first "." onwards is dropped
    (decimal "," were already rewritten to "." upstream), because cents are
    irrelevant for property prices.

    input  : string
    output : float, or None when no number can be built (any error is swallowed)
    """
    try:
        # Keep only what precedes the first "." (the whole string when there is none).
        integer_part = price_str.split(".", 1)[0]
        digits_only = "".join(filter(str.isdigit, integer_part))
        return float(digits_only)
    except Exception:
        return None
#testing.
print(convert_2price("1528.15"))
print(convert_2price(r"156 4866"))

1528.0
1564866.0


In [8]:
def extract_price(tkns : list = None):
    """
    Separate the single price token from the other tokens of a listing.

    - input  : list of string tokens
    - output : [[*tokens without the price token*], [*price token*, *currency code*]]
               where the currency code is "EUR", "GBP" or "USD"
    - raises : ValueError when no tokens are given, or when the listing does not
               contain exactly one price token
    """
    if not tkns:
        raise ValueError("elements = None")
    # Checked in this order; the first currency that matches a token wins.
    currency_matchers = (
        ("EUR", re.compile(r"(€|\b(?:eur|euro(?:s)?)\b)", re.IGNORECASE)),
        ("GBP", re.compile(r"(£|\b(?:gbp|pound(?:s)?)\b)", re.IGNORECASE)),
        ("USD", re.compile(r"(\$|\b(?:usd|dollar(?:s)?)\b)", re.IGNORECASE)),
    )
    other_tokens, found_prices = [], []
    for token in tkns:
        currency = next((code for code, matcher in currency_matchers if matcher.search(token)), None)
        if currency is None:
            other_tokens.append(token)
        else:
            found_prices.append([token, currency])
    if len(found_prices) != 1:
        raise ValueError("More than 1 or no price for this property")
    return [other_tokens, found_prices[0]]

In [16]:
def get_processed(raw_df):
    """
    Apply the tokenizing / price-extraction / preprocessing helpers to every raw title.

    - input  : positionally indexable pandas object whose elements are the raw title
               strings (each element is passed straight to extract_tokens, so
               presumably the "title" column as a Series — verify against caller)
    - output : pd.DataFrame with columns "RAW", "DATA TAG", "PRICE", "PRICE_UNIT"
    - raises : ValueError (from extract_price) when a title has no or several price tokens

    Rows whose price cannot be parsed, or whose price is not above 1000, are dropped.
    """
    list_tags = []
    prices = []
    price_units = []
    raws = []
    for i in range(len(raw_df)):
        raw = raw_df.iloc[i]
        tags, (price_str, price_unit) = extract_price(extract_tokens(raw))
        tags = list(map(preprocess_french_nltk, tags))
        price = convert_2price(price_str)
        if price is None:
            # Unparseable price: report it and skip the row. Comparing None with
            # 1000 below would otherwise raise a TypeError and abort the whole run.
            print(price_str)
            continue
        if price > 1000:  # threshold to filter out implausibly small (likely mis-parsed) prices
            list_tags.append(tags)
            prices.append(price)
            price_units.append(price_unit)
            raws.append(raw)
    return (pd.DataFrame({"RAW" : tuple(raws),"DATA TAG":list_tags,"PRICE":prices,"PRICE_UNIT":price_units}))

In [18]:
def normalize_token(tag):
    """
    Collapse meaningless variations of a tag into one canonical token, e.g.
    "12EM PARIS" and "15EM PARIS" both become "PARIS". The district information is
    not lost: the normalized tokens are meant to live in a separate column.

    - input  : string
    - output : string, or None for the 'DIVISIBL ... M²' phrases that are dropped
    """
    tag = tag.upper().strip()
    # Keyword tags: the first keyword found wins, whatever else the tag contains.
    # ('TERRAIN' is intentionally not reduced here.)
    for keyword in ("ETAG", "RDC", "SURFAC"):
        if keyword in tag:
            return keyword
    # Tags to remove.
    # NOTE(review): digits are still present at this point, so a tag such as
    # "DIVISIBL A PART 50 M²" does not contain these phrases and falls through to
    # the 'M²' rule below — confirm whether that is intended.
    dropped_phrases = ('DIVISIBL A PART M²', 'DIVISIBL JUSQU A M²', 'DIVISIBL M² A M²')
    if any(phrase in tag for phrase in dropped_phrases):
        return None
    # Every tag mentioning M² is reduced to plain 'M²'.
    if 'M²' in tag:
        return 'M²'

    # Strip ordinals (1ER / 12EM), split fractions like "1/7", then strip plain numbers.
    without_ordinals = ORDINAL_PATTERN.sub('', tag)
    without_numbers = NUM_PATTERN.sub('', without_ordinals.replace('/', ' '))

    # Collapse whitespace.
    return re.sub(r'\s+', ' ', without_numbers).strip()

In [20]:
#apply the normalize to tags (PLURAL)
def normalize_tags(tags):
    """
    Normalize every tag of one listing.

    - input  : iterable of tags
    - output : tuple of normalized tokens;
               pd.NA as soon as one normalized token is not in valid_tokens;
               None when nothing is left after normalization
    """
    kept = []
    for raw in tags:
        if not str(raw).strip():
            continue  # blank tag
        normalized = normalize_token(raw)
        if not normalized:
            continue  # tag was dropped (None) or reduced to an empty string
        if normalized not in valid_tokens:
            # One unknown token invalidates the whole listing.
            return pd.NA
        kept.append(normalized)
    return tuple(kept) if kept else None

In [22]:
def clean_tags(seqs):
    return {normalize_tags(tags) for tags in seqs }

In [24]:
def extract_type(cleaned_tags):
    """
    Extract the property type ("APPART A VENDR", "MAISON A VENDR", ...).

    - input  : iterable of normalized tokens; None / pd.NA / any other
               non-iterable value (as produced by normalize_tags) is accepted
    - output : the first token that is in valid_types, or pd.NA when there is none
               (so the row can be dropped later)
    """
    try:
        tokens = iter(cleaned_tags)
    except TypeError:
        # normalize_tags returns None or pd.NA for unusable rows; treat them as "no type"
        # instead of crashing on iteration.
        return pd.NA
    for t in tokens:
        if t in valid_types:  # only return allowed types
            return t
    return pd.NA

In [26]:
def get_paris_cp(tags):
    """
    Get the Paris postcode from the first tag that mentions "PARIS".

    The decimal digits of that tag are read as the district number (1..20) and the
    result is 75000 + district, e.g. "PARIS 15EM" -> "75015".

    - input  : list/tuple/set of string tokens
    - output : postcode string (e.g. "75015"), or None when there is no "PARIS" tag,
               when that tag carries no digit, or when the number is not in 1..20
    """
    for tag in tags:
        if "PARIS" not in tag:
            continue
        # isdecimal() agrees with the regex \d class and, unlike isdigit(), rejects
        # characters such as "²" that int() cannot parse.
        digit_chars = "".join(ch for ch in tag if ch.isdecimal())
        if not digit_chars:
            print("Got tag with no digit : ", tag)
            return None
        district = int(digit_chars)
        if 0 < district <= 20:
            return str(75000 + district)
        print("District code doesn't exist")
        return None
    return None

In [28]:
#now let's extract all locations to a column
def extract_loc(df,dict_map):
    """
    Add a "LOC" column holding the postcode(s) of each listing.

    Rows whose "CLEANED TAG" is a sequence without "PARIS" are looked up in
    `dict_map`; every other row is resolved through get_paris_cp on its "DATA TAG".

    - inputs :
        + df -> pd.DataFrame with "CLEANED TAG" and "DATA TAG" columns
        + dict_map : dict mapping location tokens to postcode strings
    - output : copy of df with a "LOC" column (a set of postcode strings, or pd.NA)
    """
    df = df.copy()
    valid_locs = set(dict_map.keys())

    def suburb_codes(tags):
        # Postcodes of every known location token found in the tags.
        matched = (set(tags) & valid_locs) if tags else set()
        return {dict_map[name] for name in matched} if matched else pd.NA

    def paris_codes(tags):
        # Called once per row so get_paris_cp's diagnostics are not printed twice.
        code = get_paris_cp(tags)
        return {code} if code else pd.NA

    mask_not_paris = df["CLEANED TAG"].apply(lambda x: isinstance(x, (list, tuple, set)) and "PARIS" not in x)
    df.loc[mask_not_paris, "LOC"] = df.loc[mask_not_paris, "CLEANED TAG"].apply(suburb_codes)
    df.loc[~mask_not_paris, "LOC"] = df.loc[~mask_not_paris, "DATA TAG"].apply(paris_codes)
    return df

In [30]:
def _numbers_in_text(s):
    """Return all numbers (as floats) found in a string. Spaces already preprocessed upstream."""
    if not isinstance(s, str):
        return []
    s_clean = s.replace(" ", "")  # e.g. '2 645' -> '2645'
    return [float(m.group(0)) for m in PATTERN_NUM.finditer(s_clean)]

In [32]:
"""
Now let's extract area, we know that area can be represented in 3 tags/tokens : 
-M²
-SURFAC
-TERRAIN
"""
def extract_area(df):
    """
    Normalize surface data into two scalars per row:
      - AREA: lowest number found in the area tokens that are not TERRAIN-with-digit tokens
      - TERRAIN: lowest number found in the tokens containing 'TERRAIN' plus a digit
      - a token with several area values (the "DIVISIBL" case) contributes its
        smallest value, which is what the advertised price usually refers to.
    Rows whose "CLEANED TAG" contains none of M² / SURFAC / TERRAIN are dropped.
    Inputs:
      df: DataFrame with "CLEANED TAG" and "DATA TAG" columns
    Returns:
      new df (kept rows only) with extra cols "ALL AREA", "AREA", "TERRAIN";
      missing values are pd.NA.
    """
    # Normalized tokens that signal area information.
    area_tags = {"M²","SURFAC","TERRAIN"}
    has_area = df["CLEANED TAG"].apply(
        lambda xs: bool(set(xs) & set(area_tags)) if isinstance(xs, (list, tuple, set)) else False
    )
    # Rows without any area tag miss the most crucial feature for valuing a property.
    out = df.loc[has_area].copy()

    def get_area_tokens(tokens):
        """Return the list of tokens matching PATTERN_AREA, or pd.NA when there is none."""
        if not isinstance(tokens, (list, tuple, set)):
            return pd.NA
        hits = [str(t) for t in tokens if re.search(PATTERN_AREA, str(t))]
        return hits if hits else pd.NA

    # "ALL AREA" holds both AREA and TERRAIN tokens; they are separated below.
    out["ALL AREA"] = out["DATA TAG"].apply(
        lambda x: get_area_tokens(list(x)) if isinstance(x, (list, tuple, set)) else pd.NA
    )

    def get_lowest_value(tokens):
        """Smallest number found across the tokens, or pd.NA when there is none."""
        if not tokens:
            return pd.NA
        vals = []
        for t in tokens:
            try:
                vals.extend(_numbers_in_text(t))
            except (TypeError, ValueError):
                continue  # token without a usable number
        return min(vals) if vals else pd.NA

    def split_area_vs_terrain(tokens):
        """Split tokens into (area tokens, terrain tokens)."""
        if not isinstance(tokens, (list, tuple, set)):
            return [], []
        area_tokens, terrain_tokens = [], []
        for t in tokens:
            t_str = str(t)
            has_terrain = bool(PATTERN_TERRAIN.search(t_str))
            has_digit = bool(PATTERN_HAS_DIGIT.search(t_str))
            if has_terrain and has_digit:
                terrain_tokens.append(t_str)  # carries the TERRAIN scalar
            else:
                area_tokens.append(t_str)     # everything else counts towards AREA
        return area_tokens, terrain_tokens

    def compute_scalars(tokens):
        """Return a Series {"AREA": ..., "TERRAIN": ...} for one row's area tokens."""
        if not isinstance(tokens, (list, tuple, set)) or not tokens:
            return pd.Series({"AREA": pd.NA, "TERRAIN": pd.NA})
        area_tokens, terrain_tokens = split_area_vs_terrain(tokens)
        area_val = get_lowest_value(area_tokens) if area_tokens else pd.NA
        terrain_val = get_lowest_value(terrain_tokens) if terrain_tokens else pd.NA
        return pd.Series({"AREA": area_val, "TERRAIN": terrain_val})

    scalars = out["ALL AREA"].apply(compute_scalars)
    if isinstance(scalars, pd.DataFrame):
        out.loc[:, "AREA"] = scalars["AREA"]
        out.loc[:, "TERRAIN"] = scalars["TERRAIN"]
    else:
        # No row survived the filter: apply() then returns an empty Series rather than
        # a two-column frame, so create the (empty) columns explicitly.
        out["AREA"] = pd.NA
        out["TERRAIN"] = pd.NA
    return out

In [34]:
#bool tag detect functions
"""
these are function that's to detect if a sequences (CLEANED TAG) col or (DATA TAG) contain a certain token.
return true or false in digit (0,1).
"""
def is_dispo(tags):
    """DISPONIBL MAINTEN flag: 1 if some tag matches, 0 if none does, <NA> when `tags` is empty or not a set/tuple/list."""
    if isinstance(tags, (set, tuple, list)) and len(tags) > 0:
        for t in tags:
            if PATTERN_DISPO_NOW.search(str(t)):
                return 1
        return 0
    return pd.NA

def is_neuf(tags):
    """NEUF flag: 1 if some tag matches, 0 if none does, <NA> when `tags` is empty or not a set/tuple/list."""
    if isinstance(tags, (set, tuple, list)) and len(tags) > 0:
        for t in tags:
            if PATTERN_NEUF.search(str(t)):
                return 1
        return 0
    return pd.NA

def has_chambr(tags):
    """CHAMBR flag: 1 if some tag matches, <NA> otherwise (missing values are filled in later)."""
    if isinstance(tags, (set, tuple, list)) and len(tags) > 0:
        for t in tags:
            if PATTERN_CHAMBRE.search(str(t)):
                return 1
    return pd.NA

def has_piece(tags):
    """PIEC flag: 1 if some tag matches, <NA> otherwise (missing values are filled in later)."""
    if isinstance(tags, (set, tuple, list)) and len(tags) > 0:
        for t in tags:
            if PATTERN_PIECE.search(str(t)):
                return 1
    return pd.NA

def has_floor(tags):
    """ETAG/RDC flag: 1 if some tag matches, 0 if none does, <NA> when `tags` is empty or not a set/tuple/list."""
    if isinstance(tags, (set, tuple, list)) and len(tags) > 0:
        for t in tags:
            if PATTERN_FLOOR.search(str(t)):
                return 1
        return 0
    return pd.NA

#bool for floor extraction
def is_rdc(tag):
    """Return 1 when the tag matches PATTERN_RDC, else 0; raises ValueError on an empty tag."""
    if not tag:
        raise ValueError("No tag provided")
    return 1 if PATTERN_RDC.search(tag) else 0
def is_etag(tag):
    """Return 1 when the tag matches PATTERN_ETAG, else 0; raises ValueError on an empty tag."""
    if not tag:
        raise ValueError("No tag provided")
    return 1 if PATTERN_ETAG.search(tag) else 0
def has_slash(tag):
    """Return 1 when the tag contains a "/", else 0; raises ValueError on an empty tag."""
    if not tag:
        raise ValueError("No tag provided")
    return 1 if PATTERN_SLASH.search(tag) else 0

In [36]:
def extract_additionals(df,col_seq="CLEANED TAG",col_tag="DATA TAG"):
    """
    Extract the additional features of each listing: number of pieces, number of
    bedrooms, floor / total floors, "new" flag and "available now" flag.

    - floor feature (tags containing the "ETAG" or "RDC" patterns):
        + RDC/(digit)      -> floor = 0 and floors = (digit)
        + RDC alone        -> floor = 0 and floors = 0
        + (digit)/(digit)  -> floor = number before the slash, floors = number after it
        + a number without a slash -> floor = floors = that number
        + rows without any floor tag get (0, 0)

    - inputs :
        + df      : pd.DataFrame
        + col_seq : column of normalized token sequences (used to detect a feature)
        + col_tag : column of preprocessed tags (used to read the numeric value)
    - output : deep copy of df with the new columns "DISPONIBL MAINTEN", "NEUF",
               "CHAMBRE", "PIECE", "FLOOR", "FLOORS"
    - raises : ValueError when an intermediate series fails its sanity check
    """
    def flag_mask(detector):
        # Boolean row mask: True where `detector` reports its keyword in the normalized tags.
        return df[col_seq].apply(lambda x : detector(list(x)) if isinstance(x,(tuple,list,set)) else pd.NA).fillna(0).astype(bool)
    mask_has_chambr = flag_mask(has_chambr)
    mask_has_piece = flag_mask(has_piece)
    mask_has_floor = flag_mask(has_floor)

    def first_matching_tag(tags, pattern):
        # First tag matching `pattern`, or pd.NA when none does.
        if not tags :
            raise ValueError("Tags not provided")
        if not isinstance(tags,(set,list,tuple)):
            return pd.NA
        for t in tags :
            if pattern.search(t):
                return t
        return pd.NA

    def get_num(tag):
        # First number of the tag as an int (0 when the tag holds no number).
        tag = str(tag)
        if not tag :
            raise ValueError("tag not provided")
        num = PATTERN_NUM.search(tag)
        if num:
            # PATTERN_NUM also matches decimals such as "2.5"; int() alone would
            # raise on those, so go through float first (truncates towards zero).
            return int(float(num.group(0)))
        return 0

    def split_floor_tag(tag):
        # Turn a floor tag into (floor, floors), e.g. "5/7" -> (5, 7).
        if not tag or not bool(PATTERN_FLOOR.search(tag)):
            # a floor pattern (ETAG or RDC) must be present
            raise ValueError("tag not provided or no floor pat in tag")
        # RDC tags do not follow the same format as ETAG tags.
        is_rdc_flag = bool(PATTERN_RDC.search(tag))
        slash_present = bool(PATTERN_SLASH.search(tag))
        floor, floors = 0, 0  # ground floor by default
        if slash_present:
            nb_floor = PATTERN_BEFORE_SLASH.search(tag)
            nb_floors = PATTERN_AFTER_SLASH.search(tag)
            if not is_rdc_flag and nb_floor:
                floor = int(float(nb_floor.group(1)))
            # Without a number after the slash, fall back to the floor itself.
            floors = int(float(nb_floors.group(1))) if nb_floors else floor
        else:
            if not is_rdc_flag:
                floor = int(get_num(tag)) or 0
            floors = floor
        return (floor, floors)

    # boolean features
    series_dispo = df[col_seq].apply(is_dispo)
    series_neuf = df[col_seq].apply(is_neuf)
    # bedrooms: a value where a CHAMBR tag exists, pd.NA elsewhere (filled in later)
    series_chambr = df[col_seq].apply(has_chambr)
    series_chambr.loc[series_chambr.notna()] = df.loc[mask_has_chambr,col_tag].apply(lambda x : first_matching_tag(x, PATTERN_CHAMBRE))
    series_chambr.loc[series_chambr.notna()] = series_chambr.loc[series_chambr.notna()].apply(lambda x : int(get_num(x)) if x else pd.NA)
    is_valid_series_chambr = series_chambr.dropna().apply(lambda x : isinstance(x,int)).all()
    if not is_valid_series_chambr :
        raise ValueError("is_valid_series_chambr not valid")
    # pieces: same scheme as bedrooms
    series_piece = df[col_seq].apply(has_piece)
    series_piece.loc[series_piece.notna()] = df.loc[mask_has_piece,col_tag].apply(lambda x : first_matching_tag(x, PATTERN_PIECE))
    series_piece.loc[series_piece.notna()] = series_piece.loc[series_piece.notna()].apply(lambda x : int(get_num(x)) if x else pd.NA)
    is_valid_series_piece = series_piece.dropna().apply(lambda x : isinstance(x,int)).all()
    if not is_valid_series_piece :
        raise ValueError("is_valid_series_piece not valid")
    # floors: rows with a floor tag get (floor, floors); every other row gets (0, 0)
    series_floor_tag = df[col_seq].apply(has_floor)
    mask_floor_any = series_floor_tag.fillna(0).astype(bool)
    series_floor_tag.loc[mask_floor_any] = df.loc[mask_has_floor, col_tag].apply(lambda x: first_matching_tag(x, PATTERN_FLOOR) if x else pd.NA)
    series_floor_tag.loc[mask_floor_any] = series_floor_tag.loc[mask_floor_any].apply(lambda x: split_floor_tag(x) if isinstance(x, str) else (0, 0))
    idx = ~mask_floor_any
    series_floor_tag.loc[idx] = [(0, 0)] * int(idx.sum())
    # validate
    is_valid_series_floor = series_floor_tag.apply(lambda x: isinstance(x, tuple) and len(x) == 2).all()
    if not is_valid_series_floor :
        raise ValueError("is_valid_series_floor not valid")
    if len(series_floor_tag) == len(series_chambr) == len(series_piece) == len(series_dispo) == len(series_neuf) :
        out = df.copy(deep=True)
        out["DISPONIBL MAINTEN"] = series_dispo
        out["NEUF"] = series_neuf
        out["CHAMBRE"] = series_chambr
        out["PIECE"] = series_piece
        out[["FLOOR","FLOORS"]] = pd.DataFrame(series_floor_tag.tolist(), index=df.index)
        return out
    else :
        raise ValueError("len don't match")#len -> size.

In [38]:
"""
- creating linear regression model to predict missing values for CHAMBR,PIECE based on AREA.
- why linear regression model ? 
    + cuz it's num or area is relative to chambre and piece for residential properties,
    meaning more area -> more pieces -> more chambres. (that's the relation this code assume)
    + we are not going to build the linear regression model from scratch because it's gonna take time
    ( i have more thing to do other then sticking my face to the screen all day. 
        + ps btw i only had less than 1 week to scrape data of seloger.com (got blocked couple times) and k-means applied model training.
        + k-means was built from scratch the week before.
- we have 3 models: PIECE from AREA, CHAMBRE from AREA, and CHAMBRE from AREA + PIECE.
"""
def round_int(arr):
    """Return `arr` as an integer numpy array, rounding each value to the nearest integer.

    The input is first converted to a float array; rounding follows numpy's
    rule (exact halves go to the nearest even integer, e.g. 0.5 -> 0, 1.5 -> 2).
    """
    return np.asarray(arr, dtype=float).round().astype(int)
def as_float(arr):
    """Return `arr` as a numpy array of floats (scalars become 0-d arrays)."""
    float_values = np.asarray(arr, dtype=float)
    return float_values

def make_model(x_cols, alpha=1.0):
    """Build an unfitted scikit-learn pipeline: preprocess `x_cols`, then Ridge regression.

    Parameters
    ----------
    x_cols : list of str
        Feature columns handed to the numeric preprocessing branch.
    alpha : float
        Regularisation strength passed to `Ridge`.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps "pre" (ColumnTransformer) and "reg" (Ridge).
    """
    numeric_branch = Pipeline([
        # Replace missing values by the column median (the middle value, not the most frequent one).
        ("imp", SimpleImputer(strategy="median")),
        # Standardise each feature: (value - mean) / standard_deviation.
        ("sc", StandardScaler()),
    ])
    preprocess = ColumnTransformer([("num", numeric_branch, x_cols)])
    regressor = Ridge(alpha=alpha)
    return Pipeline([("pre", preprocess), ("reg", regressor)])
#create + train models, LRM -> Linear Regression Model :)
def train_LRM(df, alpha_piece=1.0, alpha_ch_direct=1.0, alpha_ch_chain=1.0):
    """Fit the three regression pipelines used to fill missing PIECE / CHAMBRE values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns AREA, PIECE and CHAMBRE. The frame is copied,
        and the three columns are coerced to numeric (unparseable values become NaN).
    alpha_piece, alpha_ch_direct, alpha_ch_chain : float
        Ridge regularisation strength for each of the three models.

    Returns
    -------
    dict
        "piece_from_area"          -> PIECE predicted from AREA
        "chambre_from_area"        -> CHAMBRE predicted from AREA
        "chambre_from_area_piece"  -> CHAMBRE predicted from AREA and PIECE
    """
    numeric_df = df.copy()
    for column in ("AREA", "PIECE", "CHAMBRE"):
        numeric_df[column] = pd.to_numeric(numeric_df[column], errors="coerce")

    # (model key, feature columns, target column, columns that must not be NaN, alpha)
    # Rows are only dropped on the listed columns; a missing AREA is left to the
    # median imputer inside the pipeline.
    training_plan = (
        ("piece_from_area", ["AREA"], "PIECE", ["PIECE"], alpha_piece),
        ("chambre_from_area", ["AREA"], "CHAMBRE", ["CHAMBRE"], alpha_ch_direct),
        ("chambre_from_area_piece", ["AREA", "PIECE"], "CHAMBRE", ["CHAMBRE", "PIECE"], alpha_ch_chain),
    )

    models = {}
    for key, features, target, required, alpha in training_plan:
        usable_rows = numeric_df.dropna(subset=required)
        pipeline = make_model(features, alpha=alpha)
        pipeline.fit(usable_rows[features], usable_rows[target])
        models[key] = pipeline
    return models
#create predicting funcs
def predict_A2P(x, models):
    """Predict the number of pieces (rooms) for a single area value.

    Uses models["piece_from_area"]; the prediction is floored at 1 and rounded
    to an int. Any failure is re-raised as ValueError.
    """
    try:
        features = pd.DataFrame({"AREA": [float(x)]})
        raw_piece = float(models["piece_from_area"].predict(features)[0])
        at_least_one = max(1.0, raw_piece)  # a property has at least one piece
        return int(round(at_least_one))
    except Exception as err:
        raise ValueError(f"Error predicting AREA to PIECE: {err}")
def predict_A2C(x, models):
    """Predict the number of chambres (bedrooms) for a single area value.

    Uses models["chambre_from_area"]; the prediction is floored at 0 and
    rounded to an int. Any failure is re-raised as ValueError.
    """
    try:
        features = pd.DataFrame({"AREA": [float(x)]})
        raw_chambre = float(models["chambre_from_area"].predict(features)[0])
        non_negative = max(0.0, raw_chambre)  # a count cannot be negative
        return int(round(non_negative))
    except Exception as err:
        raise ValueError(f"Error predicting AREA to CHAMBRE: {err}")
def predict_AP2C(area, piece, models):
    """Predict the number of chambres from one (area, piece) pair.

    Uses models["chambre_from_area_piece"]. The result is clipped so that
    0 <= CHAMBRE <= PIECE and is returned as a float: unlike predict_A2P and
    predict_A2C, this function does not round. Any failure is re-raised as
    ValueError.
    """
    try:
        features = pd.DataFrame({"AREA": [float(area)], "PIECE": [float(piece)]})
        raw_chambre = float(models["chambre_from_area_piece"].predict(features)[0])
        capped = min(float(piece), raw_chambre)  # never more chambres than pieces
        return max(0.0, capped)                  # never a negative count
    except Exception as err:
        raise ValueError(f"Error predicting (AREA,PIECE) to CHAMBRE: {err}")

In [40]:
def lin_predict_features(df, models):
    """Fill missing PIECE / CHAMBRE values and return a new frame (input untouched).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns AREA, PIECE and CHAMBRE.
    models : dict
        Needs "piece_from_area" and "chambre_from_area_piece"; each value must
        offer `.predict(DataFrame)` returning a numpy array.

    Filling rules, applied in this order:
      1. CHAMBRE missing, PIECE known -> chained model on (AREA, PIECE),
         clipped to [0, PIECE] and rounded.
      2. Both missing -> PIECE from AREA (floored at 1, rounded); CHAMBRE from
         the chained model fed with the *unrounded* PIECE estimate, clipped to
         [0, PIECE estimate] and rounded.
      3. PIECE missing, CHAMBRE known -> PIECE = CHAMBRE + 1.
    Finally, any row where PIECE < CHAMBRE gets CHAMBRE lowered to PIECE.
    """
    filled = df.copy(deep=True)

    # The masks are computed once, before any value is filled in.
    piece_na = filled["PIECE"].isna()
    chambre_na = filled["CHAMBRE"].isna()
    need_both = piece_na & chambre_na
    need_piece_only = piece_na & ~chambre_na
    need_chambre_only = chambre_na & ~piece_na

    # 1) CHAMBRE missing, PIECE known
    if need_chambre_only.any():
        features = filled.loc[need_chambre_only, ["AREA", "PIECE"]].astype(float)
        chambre_est = models["chambre_from_area_piece"].predict(features).astype(float)
        piece_cap = features["PIECE"].to_numpy()
        chambre_est = np.maximum(0.0, np.minimum(piece_cap, chambre_est))
        filled.loc[need_chambre_only, "CHAMBRE"] = np.round(chambre_est).astype(int)

    # 2) PIECE and CHAMBRE both missing
    if need_both.any():
        # predict() expects a 2-D input, hence the one-column frame
        area_frame = filled.loc[need_both, "AREA"].astype(float).to_frame()
        piece_est = models["piece_from_area"].predict(area_frame).astype(float)
        piece_est = np.maximum(1.0, piece_est)
        filled.loc[need_both, "PIECE"] = np.round(piece_est).astype(int)

        # The chained model receives the float estimate, not the rounded one.
        chained_input = pd.DataFrame({
            "AREA": area_frame["AREA"].values,
            "PIECE": piece_est,
        })
        chambre_est = models["chambre_from_area_piece"].predict(chained_input).astype(float)
        chambre_est = np.maximum(0.0, np.minimum(piece_est, chambre_est))
        filled.loc[need_both, "CHAMBRE"] = np.round(chambre_est).astype(int)

    # 3) PIECE missing, CHAMBRE known: simply PIECE = CHAMBRE + 1
    if need_piece_only.any():
        known_chambre = filled.loc[need_piece_only, "CHAMBRE"].astype(float)
        filled.loc[need_piece_only, "PIECE"] = (known_chambre + 1).astype(int)

    # Final consistency pass: CHAMBRE may never exceed PIECE.
    inverted = filled["PIECE"] < filled["CHAMBRE"]
    if inverted.any():
        filled.loc[inverted, "CHAMBRE"] = filled.loc[inverted, "PIECE"]
    return filled

In [49]:
def vectorize_dataset_seloger(csv_file : str = None, df : pd.DataFrame = None, save : bool = False):
    """Turn scraped seloger titles into a numeric feature table.

    Parameters
    ----------
    csv_file : str, optional
        Path of a scraped CSV whose columns are exactly ['href', 'title'].
        Only read when `df` is not given.
    df : pandas.DataFrame, optional
        Already-loaded scrape; must contain a "title" column. Takes precedence
        over `csv_file`. The frame is copied, never modified.
    save : bool
        When True, write the result to "v_<number_of_rows>.csv" in the working directory.

    Returns
    -------
    pandas.DataFrame
        One row per listing that survived every extraction step. Rows lacking
        tags, a known type, a location, or both AREA and TERRAIN are dropped.

    Raises
    ------
    ValueError
        No input given, unreadable CSV, unexpected/missing columns, or a row
        whose LOC is not a set holding exactly one location.
    """
    if not csv_file and df is None:
        raise ValueError("Require dataset filepath or a dataframe")
    # load the raw scrape
    if df is not None:
        raw_df = df.copy()
    else :
        try :
            raw_df = pd.read_csv(csv_file)
        except Exception as exc :
            # keep the original failure attached so the real cause stays visible
            raise ValueError("Error converting file.csv to dataframe") from exc
        raw_df_cols = raw_df.columns.tolist()
        #verify columns
        if raw_df_cols != ['href', 'title']:
            raise ValueError(f"Unexpected columns {raw_df_cols}")
    # The whole pipeline is driven by the "title" column; fail early with a clear
    # message instead of a bare KeyError when a DataFrame without it is passed.
    if "title" not in raw_df.columns:
        raise ValueError(f"Missing required column 'title', got {raw_df.columns.tolist()}")

    #1 extract price,unit-price,data-tags -------------------------------
    out_df = get_processed(raw_df["title"])
    #2 clean tags (remove numbers, normalize them)
    # .copy() after each filter: the next line assigns a column, which must not
    # happen on a slice of the previous frame (SettingWithCopyWarning).
    out_df = out_df.loc[out_df["DATA TAG"].notna()].copy()
    out_df["CLEANED TAG"] = out_df["DATA TAG"].apply(
        lambda x : normalize_tags(x) if isinstance(x,(tuple,list,set)) else pd.NA
    )
    out_df = out_df.loc[out_df["CLEANED TAG"].notna()].copy()
    #3 extract type
    out_df["TYPE"] = out_df["CLEANED TAG"].apply(
        lambda x : extract_type(x) if isinstance(x,(tuple,list,set)) else pd.NA
    )
    out_df = out_df.loc[out_df["TYPE"].notna()].copy()

    #4 extract location(postcode)
    out_df = extract_loc(df=out_df,dict_map=STEM_TO_POSTCODES)
    out_df = out_df.loc[out_df["LOC"].notna()].copy()
    # Every remaining row must carry exactly one location. The check has to hold
    # for ALL rows: with .any() a single valid row let every other row through and
    # multi-location sets were silently collapsed to an arbitrary element.
    has_single_loc = out_df["LOC"].apply(lambda x : isinstance(x, set) and len(x) == 1)
    if not has_single_loc.all():
        raise ValueError("Multiple location in one row detected.")
    out_df["LOC"] = out_df["LOC"].apply(lambda x : next(iter(x)))
    #5 extract area,terrain
    out_df = extract_area(out_df)
    out_df = out_df.loc[~(out_df["AREA"].isna() & out_df["TERRAIN"].isna())].copy()
    # a row keeps at least one of the two measures; the missing one becomes 0
    out_df.loc[out_df["TERRAIN"].isna(),"TERRAIN"] = 0
    out_df.loc[out_df["AREA"].isna(),"AREA"] = 0
    out_df["AREA"] = out_df["AREA"].astype(float)
    out_df["TERRAIN"] = out_df["TERRAIN"].astype(float)
    #6 extract feature
    out_df = extract_additionals(out_df)
    out_df["FLOOR"] = out_df["FLOOR"].astype(int)
    out_df["FLOORS"] = out_df["FLOORS"].astype(int)
    #7 fill missing values for chambre and piece, use linear regression model
    # the models are trained only on rows that have no missing value at all
    models = train_LRM(out_df.dropna())
    out_df = lin_predict_features(df=out_df,models=models)
    out_df["CHAMBRE"] = out_df["CHAMBRE"].astype(int)
    out_df["PIECE"] = out_df["PIECE"].astype(int)
    # building plots have no rooms or floors, whatever the models predicted
    out_df.loc[out_df["TYPE"] == "TERRAIN CONSTRUCTIBL A VENDR" ,["PIECE","CHAMBRE","FLOOR","FLOORS"]] =0
    if save :
        out_df.to_csv(f"v_{len(out_df)}.csv",index=False)
    return out_df

In [44]:
# Load the two scraped samples (the number in each file name is the size of that scrape).
df1 = pd.read_csv("dataset/data_sample_1584.csv")
df2 = pd.read_csv("dataset/data_sample_4743.csv")
# Stack them row-wise and renumber the index 0..n-1 so row labels stay unique.
merged_df = pd.concat([df1, df2], axis=0, ignore_index=True)
# Total number of scraped rows (rows present in both files are not de-duplicated here).
len(merged_df)

6327

In [46]:
# Run the full preprocessing pipeline on the merged scrape (nothing is written to disk: save defaults to False).
vectorized_dataset = vectorize_dataset_seloger(df=merged_df)

Got tag with no digit :  PARIS
Got tag with no digit :  PARIS


  mask_has_chambr = df[col_seq].apply(lambda x : has_chambr(list(x)) if isinstance(x,(tuple,list,set)) else pd.NA).fillna(0).astype(bool)
  mask_has_piece = df[col_seq].apply(lambda x : has_piece(list(x)) if isinstance(x,(tuple,list,set)) else pd.NA).fillna(0).astype(bool)
  series_floor_tag.loc[mask_floor_any] = df.loc[mask_has_floor, col_tag].apply(lambda x: get_floor_tag(x) if x else pd.NA)


#### Verify

In [107]:
# Count missing values per column; every count should be 0 after the pipeline.
vectorized_dataset.isna().sum()

RAW                  0
DATA TAG             0
PRICE                0
PRICE_UNIT           0
CLEANED TAG          0
TYPE                 0
LOC                  0
ALL AREA             0
AREA                 0
TERRAIN              0
DISPONIBL MAINTEN    0
NEUF                 0
CHAMBRE              0
PIECE                0
FLOOR                0
FLOORS               0
dtype: int64

In [109]:
# List the columns produced by the pipeline.
vectorized_dataset.columns

Index(['RAW', 'DATA TAG', 'PRICE', 'PRICE_UNIT', 'CLEANED TAG', 'TYPE', 'LOC',
       'ALL AREA', 'AREA', 'TERRAIN', 'DISPONIBL MAINTEN', 'NEUF', 'CHAMBRE',
       'PIECE', 'FLOOR', 'FLOORS'],
      dtype='object')

In [111]:
"""
Verification for valid tags, there's some tags that's included in the output vectorized_dataset
Explanations : 
- DIVISIBLE was removed from filtering types process
- HOTEL PARTICULI A VENDR was later decided not to be used since our models focus on TERRAIN CONSTRUCTIBLE, MAISON, APPART (residential properties)
- TERRAIN is extracted to AREA,TERRAIN cols
"""
set(vectorized_dataset["CLEANED TAG"].explode().tolist()).symmetric_difference(valid_tokens)

{'DIVISIBL', 'HOTEL PARTICULI A VENDR', 'TERRAIN'}

In [113]:
# Verify the distinct property types that remain after the pipeline.
set(vectorized_dataset["TYPE"].tolist())

{'APPART A VENDR',
 'DUPLEX A VENDR',
 'LOFT A VENDR',
 'MAISON A VENDR',
 'MAISON VILL A VENDR',
 'STUDIO A VENDR',
 'TERRAIN CONSTRUCTIBL A VENDR',
 'VILL A VENDR'}

In [115]:
#verify cols values
def get_max_val(df=None, col=None):
    """Print the maximum of column `col` and the RAW title of the first row reaching it.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect; must contain `col` and "RAW". When omitted, the
        notebook-level `vectorized_dataset` is looked up at call time, so the
        function always sees its current value.
    col : str
        Name of the column to inspect.

    Raises
    ------
    ValueError
        `col` is not given, or the column holds no comparable (non-NaN) value.
    """
    if df is None:
        df = vectorized_dataset
    if col is None:
        raise ValueError("col is required")
    # Series.max() skips NaN, unlike the builtin max().
    top_rows = df.loc[df[col] == df[col].max()]
    if top_rows.empty:
        raise ValueError(f"Column {col!r} has no comparable values")
    print(f"{col} : ",top_rows[col].to_string(index = False))
    print("RAW : ",top_rows["RAW"].iloc[0],"\n","*"*100)
check_cols = {"AREA","TERRAIN","CHAMBRE","PIECE","FLOOR","FLOORS","PRICE"}
# Iterate in sorted order: the iteration order of a set of strings changes
# between kernel sessions (hash randomisation), which would shuffle the output.
for c in sorted(check_cols) :
    get_max_val(col=c)

PIECE :  15
RAW :  Maison à vendre - Paris 17ème - 5 100 000 € - 15 pièces, 6 chambres, 500,3 m² 
 ****************************************************************************************************
FLOORS :  35
RAW :  Appartement à vendre - Neuf - Paris 15ème - 1 150 000 € - 4 pièces, 3 chambres, 93,7 m², Étage 16/35 
 ****************************************************************************************************
CHAMBRE :  9
RAW :  Maison à vendre - Paris 16ème - 7 400 000 € - 12 pièces, 9 chambres, 375 m² 
 ****************************************************************************************************
FLOOR :  34
RAW :  Appartement à vendre - Paris 13ème - 499 000 € - 4 pièces, 3 chambres, 77 m², Étage 34/34 
 ****************************************************************************************************
TERRAIN :  750.0
RAW :  Maison à vendre - Paris 7ème - 13 500 000 € - 13 pièces, 5 chambres, 300 m², 750 m² de terrain 
 ********************************************

In [117]:
def get_min_val(df=None, col=None):
    """Print the minimum of column `col` and the RAW title of the first row reaching it.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect; must contain `col` and "RAW". When omitted, the
        notebook-level `vectorized_dataset` is looked up at call time, so the
        function always sees its current value.
    col : str
        Name of the column to inspect.

    Raises
    ------
    ValueError
        `col` is not given, or the column holds no comparable (non-NaN) value.
    """
    if df is None:
        df = vectorized_dataset
    if col is None:
        raise ValueError("col is required")
    # Series.min() skips NaN, unlike the builtin min().
    lowest_rows = df.loc[df[col] == df[col].min()]
    if lowest_rows.empty:
        raise ValueError(f"Column {col!r} has no comparable values")
    first_row = lowest_rows.iloc[0]
    print(f"{col} : ",first_row[col])
    # Read RAW by label rather than by position, so column order does not matter.
    print("RAW : ",first_row["RAW"],"\n","*"*100)
#check_cols = {"AREA","TERRAIN","CHAMBRE","PIECE","FLOOR","FLOORS","PRICE"}
# Iterate in sorted order: the iteration order of a set of strings changes
# between kernel sessions (hash randomisation), which would shuffle the output.
for c in sorted(check_cols) :
    get_min_val(col=c)

PIECE :  0
RAW :  Terrain constructible à vendre - Montreuil - 450 000 € - 394 m² de terrain 
 ****************************************************************************************************
FLOORS :  0
RAW :  Duplex à vendre - Neuf - Saint-Cloud - 1 140 000 € - 4 pièces, 3 chambres, 100,8 m², RDC 
 ****************************************************************************************************
CHAMBRE :  0
RAW :  Appartement à vendre - Paris 10ème - 215 000 € - 1 pièce, 18 m², RDC/5 
 ****************************************************************************************************
FLOOR :  0
RAW :  Appartement à vendre - Paris 18ème - 1 550 000 € - 5 pièces, 4 chambres, 191,9 m², RDC/1 
 ****************************************************************************************************
TERRAIN :  0.0
RAW :  Appartement à vendre - Paris 1er - 560 000 € - 2 pièces, 1 chambre, 45 m², Étage 5/6 
 ********************************************************************************

In [119]:
# Consistency check: count the rows where PIECE < CHAMBRE.
# Expected 0, because lin_predict_features lowers CHAMBRE to PIECE in that case.
len(vectorized_dataset.loc[vectorized_dataset["PIECE"] < vectorized_dataset["CHAMBRE"]])

0

In [121]:
# Number of listings per property type (most frequent first).
vectorized_dataset["TYPE"].value_counts()

TYPE
APPART A VENDR                  4806
DUPLEX A VENDR                   234
MAISON A VENDR                   119
LOFT A VENDR                      22
MAISON VILL A VENDR               14
STUDIO A VENDR                     7
VILL A VENDR                       2
TERRAIN CONSTRUCTIBL A VENDR       2
Name: count, dtype: int64

In [123]:
# Keep only the three property types used downstream; every other type is dropped.
mask = vectorized_dataset["TYPE"].isin(["APPART A VENDR","DUPLEX A VENDR","MAISON A VENDR"])
vectorized_dataset = vectorized_dataset.loc[mask]

In [125]:
# The checks look good: save the dataset.
# The file name embeds the row count (v_<number_of_rows>.csv); the index is not written.
vectorized_dataset.to_csv(f"v_{len(vectorized_dataset)}.csv",index=False)