In [2]:
from __future__ import annotations

import os
import json
import argparse
import random
from dataclasses import dataclass
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

In [3]:
"""
Check numeric dtype: pandas.api.types.is_numeric_dtype(df["col"]) (also is_integer_dtype, is_float_dtype, is_bool_dtype).
Check string/object dtype: pandas.api.types.is_string_dtype(...), is_object_dtype(...).
Force/validate numeric: pd.to_numeric(df["col"], errors="raise") (or errors="coerce" to get NaN for non-numeric, then check isna).
Inspect dtype directly: df["col"].dtype (e.g., int64, float64, object).
Per-value numeric-ish: df["col"].str.isnumeric() (only works on strings; not as robust as dtype checks).
"""
def filter_cols(df : pd.DataFrame, cols : list):
    """Return an independent copy of ``df`` restricted to the columns in ``cols``.

    Args:
        df: Source frame; it is not modified.
        cols: Column labels to keep, in the order they should appear.

    Returns:
        A new DataFrame holding only ``cols``.

    Raises:
        ValueError: If the selection fails for any reason (for example an
            unknown column); the underlying error text is embedded in the message.
    """
    try:
        subset = df[cols]
        return subset.copy()
    except Exception as err:
        message = f"Error filtering DataFrame with {cols}: {err}"
        raise ValueError(message)
        
def validate_and_cast_trainDs(df : pd.DataFrame, req : dict):
    """Check that ``df`` has the columns named in ``req`` and cast them.

    Args:
        df: Raw frame, e.g. as returned by ``pd.read_csv``. It is not modified.
        req: Mapping ``column name -> kind`` where kind is ``"str"`` or
            ``"numeric"``. Columns with any other kind are kept uncast.

    Returns:
        A copy of ``df`` restricted to the keys of ``req``. ``"str"`` columns use
        pandas' ``string`` dtype, ``"numeric"`` columns are truncated toward
        zero and cast to ``int64``. If ``df`` has a ``label`` column it is
        included as integers.

    Raises:
        ValueError: If a required column is missing, or if a ``label`` column
            is present but is not numeric or contains values other than 0/1.
    """
    # verify cols
    missing = set(req.keys()) - set(df.columns)
    if missing:
        raise ValueError(f"Missing cols : {sorted(missing)}")
    filtered_df = filter_cols(df = df, cols = list(req.keys()))

    # verify / cast cols dtype
    for col, kind in req.items():
        if kind == "str":
            if col == "postalCode":
                # Postal codes may arrive as ints, floats ("75005.0") or text.
                # Numeric-looking values are truncated and rendered as integer
                # text; everything else is only whitespace-stripped.
                # NOTE(review): numeric parsing drops leading zeros
                # ("01000" -> "1000") — confirm this is acceptable for the data.
                s = filtered_df[col].astype("string")
                x = pd.to_numeric(s, errors="coerce")

                s_norm = np.where(
                    x.isna(),
                    s.str.strip(),
                    np.trunc(x).astype("Int64").astype("string")
                )
                filtered_df[col] = pd.Series(s_norm, index=filtered_df.index, dtype="string")
            else:
                filtered_df[col] = filtered_df[col].astype("string")
        elif kind == "numeric":
            if not pd.api.types.is_numeric_dtype(filtered_df[col]):
                filtered_df[col] = pd.to_numeric(filtered_df[col], errors="coerce")
            filtered_df[col] = np.trunc(filtered_df[col]).astype(np.int64)

    # The label is optional: frames without one (inference data) pass through.
    if "label" not in df.columns:
        return filtered_df

    # Previously a blanket ``except Exception`` swallowed the error raised
    # below, so non-binary labels were silently accepted. Let it propagate.
    labels = pd.to_numeric(df["label"], errors="raise").astype(int)
    is_binary = labels.isin([0, 1])
    if not is_binary.all():
        bad = labels[~is_binary].unique().tolist()
        raise ValueError(f"Found non-binary labels: {bad}")
    filtered_df["label"] = labels
    return filtered_df

In [4]:
def set_seed(seed: int) -> None:
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Value handed unchanged to every seeding routine.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [5]:
def build_input_text(postalCode: str, city: str, description: str, desc_max_chars: int) -> str:
    """Merge the listing fields into the single tagged string fed to the model.

    The classifier consumes one token sequence, so the structured fields are
    inlined ahead of the free text, each behind its own tag. Preprocessing is
    intentionally light: punctuation, accents and line breaks are kept.

    Args:
        postalCode: Postal code; ``None``/NaN becomes an empty string.
        city: City name; ``None``/NaN becomes an empty string.
        description: Listing text; ``None``/NaN becomes an empty string.
        desc_max_chars: When positive, the description is cut to this many
            characters (after newline normalisation). Zero or negative
            disables truncation.

    Returns:
        ``"[POSTAL] ...\\n[CITY] ...\\n[DESC]\\n..."``.
    """
    def _clean(value) -> str:
        # Missing values (None, NaN, pd.NA) collapse to the empty string.
        if value is None or pd.isna(value):
            return ""
        return str(value)

    postal_txt = _clean(postalCode)
    city_txt = _clean(city)
    desc_txt = _clean(description)

    # Normalise Windows / old-Mac line endings to "\n" before truncating.
    desc_txt = desc_txt.replace("\r\n", "\n")
    desc_txt = desc_txt.replace("\r", "\n")
    if desc_max_chars > 0:
        desc_txt = desc_txt[:desc_max_chars]

    # Field tags let the model tell the columns apart.
    return f"[POSTAL] {postal_txt}\n[CITY] {city_txt}\n[DESC]\n{desc_txt}"

In [6]:
def prepareDataset(data_path : str, req : dict, desc_max_chars: int = 2000):
    """Load a CSV, validate/cast its columns and add the model input text.

    Args:
        data_path: Path of the CSV file to read.
        req: Required-column spec forwarded to ``validate_and_cast_trainDs``.
        desc_max_chars: Maximum number of description characters kept in
            ``input_text`` (forwarded to ``build_input_text``).

    Returns:
        The validated DataFrame with an extra ``input_text`` column.

    Raises:
        ValueError: If the file cannot be read as CSV, or if validation or
            casting fails. The original exception is chained as ``__cause__``.
    """
    try:
        raw_df = pd.read_csv(data_path)
    except Exception as e:
        # ``except Exception`` (not a bare ``except``) so Ctrl-C still interrupts.
        raise ValueError(f"Failed to open ds {data_path} *only csv file*") from e
    try:
        valid_df = validate_and_cast_trainDs(df = raw_df, req = req)
    except Exception as e:
        # Was ``valueError`` (NameError), which hid the real validation failure.
        raise ValueError(f"Failed to valid and cast dataset. {e}") from e
    # Row-wise: columns missing from ``req`` fall back to "" through ``r.get``.
    valid_df["input_text"] = valid_df.apply(lambda r :
        build_input_text(postalCode = r.get("postalCode",""),
                        city = r.get("city",""),
                        description = r.get("description",""),
                        desc_max_chars = desc_max_chars
                        ),
                       axis = 1
                      )
    return valid_df

In [7]:
class flagLocation_dataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with integer labels.

    ``encodings`` maps each field name to one list per row, for example::

        {
          "input_ids":      [[10, 11, 12, 0], [20, 21, 0, 0]],
          "attention_mask": [[1, 1, 1, 0],    [1, 1, 0, 0]],
        }

    Each ``input_ids`` row is a tokenized text expressed as integer ids. Each
    ``attention_mask`` row marks real tokens (1) versus padding (0); padding
    artificially extends a text to a common length.
    """

    def __init__(self, encodings : dict[str, list[list[int]]], labels : np.ndarray):
        # Keep the encodings as given; labels are coerced to integers once here.
        self.labels = labels.astype(int)
        self.encodings = encodings

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self,index):
        # Build the sample: every encoding field at ``index`` plus its label.
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[index])
        sample["labels"] = torch.tensor(self.labels[index])
        return sample

In [8]:
# Column spec consumed by validate_and_cast_trainDs: column name -> target kind.
# "str" columns are cast to pandas' string dtype, "numeric" ones to int64.
REQUIRED_COLS = dict(
    postalCode="str",
    city="str",
    description="str",
    label="numeric",
)

In [35]:
def get_tokenizer_and_model(model_name : str, num_labels : int):
    """Load the tokenizer and a sequence-classification model for ``model_name``.

    Args:
        model_name: Identifier or local path passed to ``from_pretrained``.
        num_labels: Number of output classes for the classification head.

    Returns:
        ``(tokenizer, model)`` — the tokenizer is loaded first, then the model.
    """
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels),
    )

In [53]:
def train_model(
    model,
    tokenizer,
    train_df,
    out_dir,
    val_df=None,
    lr=2e-5,
    train_bs=16,
    eval_bs=32,
    epochs=3,
    weight_decay=0.01,
    seed=42,
    fp16=None,
    logging_steps=50,
    max_len=256,
):
    """Fine-tune ``model`` on ``train_df`` with the Hugging Face ``Trainer``.

    Args:
        model: Sequence-classification model to train.
        tokenizer: Tokenizer used to encode the ``input_text`` column.
        train_df: Frame with ``input_text`` and ``label`` columns.
        out_dir: Output directory handed to ``TrainingArguments``.
        val_df: Optional validation frame with the same columns. When it is
            ``None`` or empty, evaluation and checkpoint saving are disabled.
        lr, train_bs, eval_bs, epochs, weight_decay, seed, logging_steps:
            Forwarded to ``TrainingArguments``.
        fp16: Mixed precision switch; ``None`` means "use it if CUDA is available".
        max_len: Token limit applied when encoding (longer texts are truncated).

    Returns:
        ``(trainer, metrics)`` where ``metrics`` is the result of
        ``trainer.evaluate()``, or ``None`` when no validation data was given.
    """
    def _encode_frame(frame):
        # Tokenize without padding; DataCollatorWithPadding pads per batch later.
        encoded = tokenizer(
            frame["input_text"].tolist(),
            truncation=True,
            max_length=max_len,
        )
        return flagLocation_dataset(encoded, frame["label"].values)

    use_fp16 = fp16 if fp16 is not None else torch.cuda.is_available()

    train_ds = _encode_frame(train_df)

    # Validation is optional: an absent or empty frame turns evaluation off.
    has_val = val_df is not None and not val_df.empty
    val_ds = _encode_frame(val_df) if has_val else None

    # Save and eval strategies are kept equal so the best checkpoint can be reloaded.
    strategy = "epoch" if has_val else "no"

    arg_values = dict(
        output_dir=out_dir,
        learning_rate=lr,
        per_device_train_batch_size=train_bs,
        per_device_eval_batch_size=eval_bs,
        num_train_epochs=epochs,
        weight_decay=weight_decay,
        evaluation_strategy=strategy,
        save_strategy=strategy,
        load_best_model_at_end=has_val,
        metric_for_best_model="f1",
        greater_is_better=True,
        seed=seed,
        fp16=use_fp16,
        report_to="none",
        logging_steps=logging_steps,
    )
    train_args = TrainingArguments(**arg_values)

    # NOTE(review): `compute_metrics` is not defined in any cell visible in this
    # notebook; it is only looked up when validation data is supplied — confirm
    # it exists before running with a val_df on a fresh kernel.
    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics if has_val else None,
    )

    trainer.train()
    if has_val:
        return trainer, trainer.evaluate()
    return trainer, None


In [56]:
def save_model(trainer,tokenizer,out_dir:str):
    """Persist the trained model, then the tokenizer, into ``out_dir``.

    Args:
        trainer: Object exposing ``save_model(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        out_dir: Destination directory shared by both artefacts.
    """
    destination = out_dir
    trainer.save_model(destination)       # model first
    tokenizer.save_pretrained(destination)  # tokenizer files alongside it

In [89]:
def get_trainedModel(
    csv_path: str,
    out_dir: str = "",
    model_name: str = "cmarkea/distilcamembert-base",
    max_len: int = 256,
    desc_max_chars: int = 2000,
    test_size: float = 0.15,
    seed: int = 7,
    epochs: int = 3,
    lr: float = 2e-5,
    train_bs: int = 16,
    eval_bs: int = 32,
    weight_decay: float = 0.01,
):
    """End-to-end pipeline: load a CSV, split, fine-tune, evaluate and save.

    Args:
        csv_path: Labelled training CSV (must satisfy ``REQUIRED_COLS``).
        out_dir: Directory used for training output and for the saved
            model and tokenizer.
        model_name: Pretrained checkpoint to fine-tune.
        max_len: Token limit used when encoding the texts.
        desc_max_chars: Character limit applied to descriptions.
        test_size: Fraction of rows held out for validation (stratified by label).
        seed: Seed for the RNGs, the split and the trainer.
        epochs, lr, train_bs, eval_bs, weight_decay: Training hyper-parameters.

    Returns:
        ``(trainer, metrics)`` as returned by ``train_model``.
    """
    set_seed(seed)

    # validate/prepare data
    df = prepareDataset(
        data_path=csv_path,
        req=REQUIRED_COLS,
        desc_max_chars=desc_max_chars,
    )
    # tokenizer + model
    # Never ask for fewer than 2 classes: with num_labels == 1 the Hugging Face
    # sequence-classification heads switch to regression, which would happen
    # here if the CSV contained a single class.
    tokenizer, model = get_tokenizer_and_model(
        model_name=model_name,
        num_labels=max(2, len(df["label"].unique())),
    )
    # split
    train_df, val_df = train_test_split(
        df,
        test_size=test_size,
        random_state=seed,
        stratify=df["label"],
    )
    # train via helper
    trainer, metrics = train_model(
        model=model,
        tokenizer=tokenizer,
        train_df=train_df,
        val_df=val_df,
        out_dir=out_dir,
        lr=lr,
        train_bs=train_bs,
        eval_bs=eval_bs,
        epochs=epochs,
        weight_decay=weight_decay,
        seed=seed,
        max_len=max_len,  # previously dropped, so a custom max_len was ignored
    )
    # save
    save_model(trainer=trainer,tokenizer=tokenizer,out_dir=out_dir)
    print("\nSaved model to:", out_dir)
    return trainer, metrics


In [42]:
# NOTE(review): `train_textModel` is not defined in any cell visible in this
# notebook (the visible training entry point is `get_trainedModel`), so this
# call depends on kernel state from an earlier or deleted cell.
train_textModel(out_dir = "models/location_flagger_v2")

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at cmarkea/distilcamembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0466,0.00196,1.0,1.0,1.0,1.0
2,0.0016,0.000835,1.0,1.0,1.0,1.0
3,0.0011,0.000672,1.0,1.0,1.0,1.0



Saved model to: models/location_flagger_v2


In [11]:

# Reload the saved model and tokenizer from disk and put the model in eval mode.
model_dir = "models/location_flagger_v2"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.eval()

# Hand-written example in the tagged layout.
# NOTE(review): here "[DESC]" is followed directly by the text, whereas
# build_input_text emits "[DESC]\n" before the description — confirm the
# difference is intentional.
text = """[POSTAL] 75005\n[CITY] Paris 5e\n[DESC]Paris.\nAppartement 2 pi√®ces, 38m¬≤, proche Jardin des Plantes.\nM√©tro Place Monge √† 3 minutes.
"""
enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
with torch.no_grad():
    logits = model(**enc).logits
    probs = torch.softmax(logits, dim=-1)
    # Probability of class index 1 for the single input row.
    p_contradiction = probs[0, 1].item()

print("P(contradiction) =", p_contradiction)

P(contradiction) = 0.9946152567863464


In [26]:
# NOTE(review): `prepare_trainDs` is not defined in any visible cell; the
# visible loader with this call shape is `prepareDataset` — presumably a
# leftover from before a rename.
df = prepare_trainDs("ressources/data/location_match_dataset_2000.csv",req=REQUIRED_COLS)

In [35]:
# Show the model input text of the first row whose label is 1.
df.loc[df["label"] == 1, "input_text"].reset_index(drop=True)[0]

"[POSTAL] 75005\n[CITY] Paris 5e\n[DESC]\n√Ä 20 minutes de Paris en transports, quartier r√©sidentiel.\nRUE DE RIVOLI 75005 PARIS 5E\nAu 3√®me √©tage avec ascenseur, nous vous proposons un appartement 4 pi√®ces non meubl√© de 105.68m¬≤ au sol, 95.6m¬≤ loi Carrez comprenant : entr√©e, s√©jour, cuisine am√©nag√©e et √©quip√©e, chambre(s), salle d'eau et WC.\nPas de cave, pas de parking.\n\nLoyer 1369‚Ç¨ hors charges + 220‚Ç¨ de charges soit 1589‚Ç¨ CC.\n\nLa production de chauffage et d'eau chaude est collectif.\n\nMERCI DE NOUS CONTACTER UNIQUEMENT PAR MAIL VIA L'ANNONCE.\n"

In [70]:
def model_predict(trainer, tokenizer, input_text, max_len=256, thresh=0.85):
    """Score one text and turn the class-1 probability into a 0/1 flag.

    Args:
        trainer: Object whose ``.model`` is the classifier to run.
        tokenizer: Callable that encodes ``input_text`` into PyTorch tensors.
        input_text: Text to classify.
        max_len: Token limit used when encoding (longer inputs are truncated).
        thresh: The flag is 1 when the class-1 probability is >= this value.

    Returns:
        ``(flag, probability)`` with ``flag`` in ``{0, 1}`` and ``probability``
        the softmax score of class index 1.
    """
    batch = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_len)
    net = trainer.model
    target_device = net.device

    # Move every encoded tensor to the device the model lives on.
    inputs = {}
    for name, tensor in batch.items():
        inputs[name] = tensor.to(target_device)

    net.eval()
    with torch.no_grad():
        scores = torch.softmax(net(**inputs).logits, dim=-1)
        p_contradiction = scores[0, 1].item()

    flag = int(p_contradiction >= thresh)
    return flag, p_contradiction

In [13]:
def flag_location(df:pd.DataFrame, flagModel, tokenizer=None, max_len: int = 256, thresh: float = 0.85):
    """Run the location classifier over ``df["input_text"]``.

    Args:
        df: Frame with an ``input_text`` column; it is not modified.
        flagModel: Either a trainer-like object exposing ``.model`` (what
            ``model_predict`` expects) or a bare ``torch.nn.Module``, which is
            wrapped so that it can be used the same way.
        tokenizer: Tokenizer matching the model. When omitted, the
            ``processing_class`` / ``tokenizer`` attribute of ``flagModel`` is
            used if it has one.
        max_len: Token limit forwarded to ``model_predict``.
        thresh: Decision threshold forwarded to ``model_predict``.

    Returns:
        A copy of ``df`` with ``falseLocationFlag`` (0/1) and
        ``falseLocationProb`` (class-1 probability) columns.

    Raises:
        ValueError: If no tokenizer is given and none can be found on ``flagModel``.
    """
    from types import SimpleNamespace

    # model_predict reads ``trainer.model``; give a bare module that shape.
    if isinstance(flagModel, torch.nn.Module):
        trainer_like = SimpleNamespace(model=flagModel)
    else:
        trainer_like = flagModel

    if tokenizer is None:
        tokenizer = getattr(flagModel, "processing_class", None)
    if tokenizer is None:
        tokenizer = getattr(flagModel, "tokenizer", None)
    if tokenizer is None:
        raise ValueError(
            "flag_location needs a tokenizer: pass tokenizer=... or a trainer that carries one"
        )

    out = df.copy()
    # The old call passed ``flagModel=`` to model_predict (not one of its
    # parameters) and stored the whole (flag, prob) tuple in the flag column,
    # so ``== 1`` filters could never match.
    predictions = [
        model_predict(
            trainer=trainer_like,
            tokenizer=tokenizer,
            input_text=text,
            max_len=max_len,
            thresh=thresh,
        )
        for text in out["input_text"]
    ]
    out["falseLocationFlag"] = [flag for flag, _ in predictions]
    out["falseLocationProb"] = [prob for _, prob in predictions]
    return out

In [66]:
# Raw listings export. The name `bienIci_df` is also assigned by the
# prepareDataset cell below; which value is live depends on execution order.
bienIci_df = pd.read_csv("ressources/data/bienIci_2458.csv")

In [14]:
# Prepare the unlabelled listings for inference: only the three text columns
# are required, so no "label" entry appears in the spec.
bienIci_df = prepareDataset(
    data_path = "ressources/data/bienIci_2458.csv",
    req = {
    "postalCode" : "str",
    "city" : "str",
    "description" : "str"}
)

In [31]:
# Run the flagger on the first 100 prepared listings only.
test_df = bienIci_df.iloc[:100].copy()
flagged_df = flag_location(df = test_df,flagModel = model)

In [15]:
# Check which columns the prepared frame carries.
bienIci_df.columns

Index(['postalCode', 'city', 'description', 'input_text'], dtype='object')

In [33]:
"""
for i in range(len(flagged_df[flagged_df["falseLocationFlag"]==1])):
    print(flagged_df.loc[flagged_df["falseLocationFlag"]==1,"input_text"].iloc[i])
    print("-"*50)
"""    

'\nfor i in range(len(flagged_df[flagged_df["falseLocationFlag"]==1])):\n    print(flagged_df.loc[flagged_df["falseLocationFlag"]==1,"input_text"].iloc[i])\n    print("-"*50)\n'

In [41]:
# Labelled training set, validated against REQUIRED_COLS.
# NOTE(review): the variable says 22dec25 while the file name says 21dec25 — confirm.
trainds_22dec25 = prepareDataset(data_path = "ressources/data/location_match_dataset_21dec25.csv",req = REQUIRED_COLS)

In [42]:
# Display the prepared training frame.
trainds_22dec25

Unnamed: 0,postalCode,city,description,label,input_text
0,75015,Paris 15e,Situ√© rue Cambronne √† proximit√© de toutes les ...,0,[POSTAL] 75015\n[CITY] Paris 15e\n[DESC]\nSitu...
1,75008,Paris 8e,Bail code civil\nCet appartement haut de gamme...,0,[POSTAL] 75008\n[CITY] Paris 8e\n[DESC]\nBail ...
2,75014,Paris 14e,Bail code civil\nCe magnifique appartement de ...,0,[POSTAL] 75014\n[CITY] Paris 14e\n[DESC]\nBail...
3,75004,Paris 4e,R√©sidence principale\n\n√Ä deux pas de la cath√©...,0,[POSTAL] 75004\n[CITY] Paris 4e\n[DESC]\nR√©sid...
4,75014,Paris 14e,√Ä LOUER : Appartement r√©nov√© T2 situ√© dans le ...,0,[POSTAL] 75014\n[CITY] Paris 14e\n[DESC]\n√Ä LO...
5,75014,Paris 14e,"Situ√© rue de Coulmiers, ce charmant appartemen...",0,[POSTAL] 75014\n[CITY] Paris 14e\n[DESC]\nSitu...
6,75116,Paris 16e,Bail code civil\nId√©alement situ√© sur l'Avenue...,0,[POSTAL] 75116\n[CITY] Paris 16e\n[DESC]\nBail...
7,75019,Paris 19e,CO-LOCATION COLOCATION COLIVING\nCooloc vous p...,0,[POSTAL] 75019\n[CITY] Paris 19e\n[DESC]\nCO-L...


In [54]:
# Train on the whole labelled frame. No val_df is passed, so train_model skips
# evaluation (metrics is None) and uses save_strategy="no".
# train_model returns a (trainer, metrics) tuple, so despite its name
# `model_22dec25` holds that tuple, not a model.
# NOTE(review): `model` / `tokenizer` are notebook globals — presumably the
# ones loaded from models/location_flagger_v2 above; confirm.
model_22dec25 =  train_model(
    model = model ,
    tokenizer = tokenizer,
    train_df = trainds_22dec25,
    out_dir = "models/location_flagger_22dec25")

  trainer = Trainer(
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss


In [73]:
# Unpack the (trainer, metrics) tuple returned by train_model.
trainer,metrics = model_22dec25

In [74]:
# Sanity check on the first training row.
# NOTE(review): the displayed training frame lists label 0 for row 0, while the
# recorded output of this cell is flag 1 — worth investigating.
flag,prob = model_predict(trainer=trainer,tokenizer = tokenizer,input_text=trainds_22dec25["input_text"].iloc[0])
print(flag,prob)

1 0.9957684278488159


In [76]:
# Load the labelled listings export. `df` is rebound several times in this
# notebook, so later cells depend on which assignment ran last.
df = pd.read_csv("ressources/out_data/bienIci_2458_labeled.csv")

In [77]:
# Postal code and description of every row labelled 1.
df.loc[df["label"]==1,["postalCode","description"]]

Unnamed: 0,postalCode,description
0,75015,Situ√© rue Cambronne √† proximit√© de toutes les ...
2,75008,Bail code civil\nCet appartement haut de gamme...
3,75014,Bail code civil\nCe magnifique appartement de ...
4,75004,R√©sidence principale\n\n√Ä deux pas de la cath√©...
5,75014,√Ä LOUER : Appartement r√©nov√© T2 situ√© dans le ...
...,...,...
2446,75019,BOULEVARD MACDONALD √† proximit√© du tram canal ...
2447,75017,"Appartement 2 pi√®ces de 35,93m2 situ√© √† Paris ..."
2448,75006,"Entre la place Saint-Michel et l'Od√©on, rue Su..."
2450,75016,"Dans dans une impasse calme et s√©curis√©e, appa..."


In [78]:
# Rebind `df` to the raw, unlabelled listings export.
df = pd.read_csv("ressources/data/bienIci_2458.csv")

In [81]:
# Keep rows where both postalCode and city are present, restricted to the
# three text columns. Missing descriptions are not filtered here.
mask_notna = df["postalCode"].notna() & df["city"].notna()
df_filtered = df.loc[mask_notna,["postalCode","city","description"]].copy()
# Per-column count of remaining missing values (shown as the cell output).
df_filtered.isna().sum()

postalCode     0
city           0
description    0
dtype: int64

In [82]:
# Persist the filtered columns. `index=False` is not passed, so the DataFrame
# index is written as an extra first column.
df_filtered.to_csv("ressources/out_data/filtered_cols.csv")

In [83]:
# Load the synthetic training data for comparison with the real listings.
synth_traindata = pd.read_csv("ressources/data/synth_paris_lyon_marseille_30k.csv")

In [87]:
# First synthetic description, for eyeballing its style.
synth_traindata["description"].iloc[0]

"IMMO CONSEIL vous propose, un appartement meubl√© de 49 m¬≤ (3 pi√®ces) au 7e √©tage avec ascenseur.\nSitu√© √† Paris 75020 ‚Ä¢ Quartier Batignolles\nIl se compose d'une entr√©e, d'un s√©jour, d'une cuisine ind√©pendante am√©nag√©e, 2 chambre(s), d'une salle d'eau et de WC.\nPrestations : balcon.\nDisponible le 15/03/2026.\n\nContact agence : 04 62 62 35 96 | contact@agence-demo.com\nAdresse de l‚Äôagence : 60 Rue Victor Hugo, 13013 Marseille\nLoyer hors charges : 1192 ‚Ç¨\nProvision pour charges : 159 ‚Ç¨\nLoyer charges comprises : 1351 ‚Ç¨\nD√©p√¥t de garantie : 1192 ‚Ç¨\nLes informations sur les risques auxquels ce bien est expos√© sont disponibles sur le site G√©orisques : www.georisques.gouv.fr"

In [88]:
# First real description, to compare against the synthetic one above.
df["description"].iloc[0]

"Situ√© rue Cambronne √† proximit√© de toutes les commodit√©s, un appartement meubl√© au 2√®me √©tage totalement refait √† neuf.\nIl se compose d'une entr√©e avec un placard de rangement, un cuisine ouverte et √©quip√©e donnant sur la salle √† manger et le s√©jour, une chambre donnant sur cour avec penderie, une salle d'eau et un wc.\nCet appartement meubl√© de 45m2 comprend tout le charme de l'ancien (parquet, moulures, chemin√©es)\n\nDisponible imm√©diatement.\n\nZone soumise √† encadrement des loyers\nLoyer de r√©f√©rence major√© (loyer de base √† ne pas d√©passer) : 1645‚Ç¨\nLoyer de base : 1675‚Ç¨ par mois\nProvision pour charges : 96‚Ç¨\nSoit 1771‚Ç¨ charges comprises\n\nD√©pot de garantie: 3290‚Ç¨"