In [17]:
import numpy as np
import matplotlib.pyplot as plt
import math
import pandas as pd
from k_means import get_shortestDistancePairs, run_monteCarlo, get_lowest_inertia_centroids

#### Analyze/predict real-world data

### How to use:
#### - get_model_SL: takes one vectorized dataset (CSV) in the expected format (build it with vectorized_dataset_seloger.ipynb)
    #### - k -> number of centroids fitted per (TYPE, LOC) category; increase it for finer clusters (sometimes more accurate, sometimes overfits)
    #### - runs -> number of randomized initializations of the centroids, see k-alorithms.ipynb for more details.
    #### - save_model : whether to save the fitted model as a parquet file
    #### - model_title : filename prefix of the saved model

#### - model_SL_predict: loads a model and predicts; pass in either the model filepath or the model dataframe.
    #### - data -> a dict that must contain exactly 2 keys, "CAT" and "POINT". The "CAT" value must be present in the model dataframe (check with model["CAT"].unique()), or use app.py, which helps you run predictions from a terminal interface.
    #### - "POINT" contains 8 numerical values, in this order: [area-terrain-dispo-neuf-chambre-piece-floor-floors]

In [83]:
def get_model_SL(csv_file:str = None,k : int = 32, runs : int = 100,save_model : bool = False, model_title : str = "model_sl"):
    """Fit a per-category k-means price model from a vectorized listings CSV.

    The CSV must provide the columns PRICE, TYPE, LOC and the eight numeric
    features AREA, TERRAIN, DISPONIBL MAINTEN, NEUF, CHAMBRE, PIECE, FLOOR,
    FLOORS. Rows are grouped by (TYPE, LOC); every group holding at least
    ``2 * k`` rows is clustered on its numeric features, and each row gets:

    - ``POINT``: tuple of the eight numeric features,
    - ``CENTROID``: the centroid that ``get_shortestDistancePairs`` maps the
      point to,
    - ``CENTROID W``: the number of rows in the row's (TYPE, LOC) group,
    - ``PRED``: the mean PRICE of all rows sharing the same centroid,
    - ``CAT``: the (TYPE, LOC) tuple.

    Parameters
    ----------
    csv_file : str
        Path of the dataset to read with ``pandas.read_csv``.
    k : int
        Passed to ``run_monteCarlo`` as ``k``; also sets the minimum group
        size (``2 * k``) required for a category to be clustered.
    runs : int
        Passed to ``run_monteCarlo`` as ``nb_runs``.
    save_model : bool
        When true, the fully populated rows are written to
        ``f"{model_title}_K{k}R{runs}L{n_rows}.parquet"``.
    model_title : str
        Filename prefix used when saving.

    Returns
    -------
    pandas.DataFrame
        The annotated frame with every row containing a missing value
        removed (so only rows of clustered categories remain).

    Raises
    ------
    ValueError
        If no path is given, the CSV cannot be read, or the dataset cannot be
        processed (missing columns, no category with at least ``2 * k`` rows,
        clustering or saving failure). The underlying error is attached as
        ``__cause__``.
    """
    if not csv_file:
        raise ValueError("Require dataset filepath")

    try:
        df = pd.read_csv(csv_file)
    except Exception as exc:
        # Chain the cause so the actual I/O or parsing problem stays visible.
        raise ValueError("Error converting file.csv to dataframe") from exc

    cat_cols = ['TYPE', 'LOC']
    num_cols = ['AREA', 'TERRAIN', 'DISPONIBL MAINTEN', 'NEUF',
                'CHAMBRE', 'PIECE', 'FLOOR', 'FLOORS']
    cols = ['PRICE'] + cat_cols + num_cols
    # Design note kept from the original author: k could be derived from the
    # number of numeric features (8 features x 2 levels "good"/"bad" -> 16).
    # NOTE(review): the signature default is 32, not 16 - confirm which is intended.
    min_rows = k * 2

    def _as_tuple(value):
        # Lists and arrays are unhashable; tuples can be used in sets and isin().
        if isinstance(value, (list, np.ndarray)):
            return tuple(value)
        return value

    try:
        # Work on a private copy so the assignments below never write through
        # to (or warn about) the frame returned by read_csv.
        out = df.loc[:, cols].copy()
        for c in cat_cols:
            out[c] = out[c].astype(str)

        # Row labels of every (TYPE, LOC) group that is large enough; computed
        # once, before any column is added, instead of scanning the whole
        # frame for every LOC x TYPE combination.
        row_groups = [
            rows
            for rows in out.groupby(cat_cols, sort=False).groups.values()
            if len(rows) >= min_rows
        ]
        if not row_groups:
            # Previously this surfaced as an unexplained KeyError on "CENTROID".
            raise ValueError(
                f"no (TYPE, LOC) category has at least {min_rows} rows (2 * k with k={k})"
            )

        # Pre-create the result columns so rows of skipped categories hold
        # missing values and are removed by the final dropna().
        out["POINT"] = None
        out["CENTROID"] = None
        out["CENTROID W"] = np.nan

        for rows in row_groups:
            points = out.loc[rows, num_cols].agg(tuple, axis=1)
            pts = points.tolist()
            centroids = get_lowest_inertia_centroids(run_monteCarlo(k=k, nb_runs=runs, points=pts))
            # clusters maps each point tuple to its assigned centroid.
            clusters = get_shortestDistancePairs(classified_pts=centroids, unclassified_pts=pts)
            out.loc[rows, "POINT"] = points
            out.loc[rows, "CENTROID"] = points.apply(lambda p: _as_tuple(clusters[p]))
            out.loc[rows, "CENTROID W"] = len(rows)

        # PRED = mean PRICE over all rows attached to the same centroid.
        out["PRED"] = np.nan
        for centroid in set(out["CENTROID"].dropna().tolist()):
            members = out["CENTROID"].isin([centroid])
            prices = out.loc[members, "PRICE"].tolist()
            out.loc[members, "PRED"] = sum(prices) / len(prices)

        out["CAT"] = out.loc[:, cat_cols].agg(tuple, axis=1)

        if save_model:
            save = out.dropna().copy()
            # Tuples are stored as lists in the parquet file.
            for c in ("POINT", "CENTROID", "CAT"):
                save[c] = save[c].astype("object").apply(lambda x: list(x) if x else pd.NA)
            save = save.dropna()
            save.to_parquet(f"{model_title}_K{k}R{runs}L{len(save)}.parquet", index=False)
    except Exception as exc:
        raise ValueError("Incompatible dataset") from exc
    return out.dropna()

In [85]:
def model_SL_predict(parquet_model : str = None, model : pd.core.frame.DataFrame = None ,data : dict = None ):
    """Predict a price from a fitted model for one (category, point) query.

    Parameters
    ----------
    parquet_model : str, optional
        Path of a model saved as parquet. When given, it is loaded and used
        even if ``model`` is also passed.
    model : pandas.DataFrame, optional
        An in-memory model with at least the columns "CAT", "CENTROID" and
        "PRED". The frame passed in is not modified.
    data : dict
        Must have exactly the keys "CAT" and "POINT". "CAT" is a 2-item
        sequence that has to be one of the categories present in the model;
        "POINT" is a sequence of 8 numeric values.

    Returns
    -------
    set or None
        ``None`` when ``data`` is empty; otherwise the set of "PRED" values of
        the model rows of that category whose centroid is the one
        ``get_shortestDistancePairs`` associates with the query point
        (normally a single value).

    Raises
    ------
    ValueError
        No model provided, model file unreadable, wrong ``data`` keys,
        unknown/malformed category, or malformed point.
    """
    def _as_tuple(value):
        # Parquet returns list-like cells as lists/arrays; tuples are hashable.
        if isinstance(value, np.ndarray):
            return tuple(value.tolist())
        if isinstance(value, list):
            return tuple(value)
        return value

    # safety check
    if model is None and not parquet_model:
        raise ValueError("No model provided")
    if not data:
        return None
    if parquet_model:
        try:
            model = pd.read_parquet(parquet_model)
        except Exception as exc:
            raise ValueError("Error converting model_sl.csv to dataframe") from exc

    # dropna() returns a new frame: the caller's DataFrame is left untouched
    # (the previous in-place drop silently removed rows from it).
    model = model.dropna()
    # Normalise list/array cells to tuples for both the loaded and the
    # in-memory model, without writing back into the frame.
    cats = model["CAT"].apply(_as_tuple)
    centroid_col = model["CENTROID"].apply(_as_tuple)

    if set(data.keys()) != {"CAT", "POINT"}:
        raise ValueError("Invalid data keys")

    # get model capability to predict (have predictable input category)
    predictable_cat = set(cats.tolist())
    try:
        cat = tuple(data["CAT"])
        known_cat = cat in predictable_cat
    except TypeError:
        # Non-iterable or unhashable category content cannot match anything.
        cat, known_cat = (), False
    if (not known_cat) or (len(cat) != 2):
        raise ValueError("Incompatible category")

    try:
        point = tuple(float(v) for v in data["POINT"])
    except (TypeError, ValueError) as exc:
        raise ValueError("Incompatible point") from exc
    if len(point) != 8:
        raise ValueError("Incompatible point")

    # Membership was checked exactly above, so exact equality selects the rows.
    mask = cats.apply(lambda c: c == cat)
    centroids = set(centroid_col[mask].tolist())
    pair = get_shortestDistancePairs(classified_pts=centroids, unclassified_pts=[point])
    # Same normalisation get_model_SL applies to centroids before storing them.
    nearest = _as_tuple(pair[point])
    hit = mask & centroid_col.apply(lambda c: c == nearest)
    pred = set(model.loc[hit, "PRED"].tolist())
    return pred

In [87]:
# Fit the model from the vectorized listings CSV (runs=10 is forwarded to run_monteCarlo
# as nb_runs; k keeps its default) and save it as a parquet file whose name starts with
# "models/model_sl". NOTE(review): the result is presumably stochastic - no seed is set here.
model = get_model_SL(csv_file="dataset/v_5159.csv",runs=10,save_model=True,model_title="models/model_sl")

In [88]:
# Preview the first rows of the fitted model (feature columns plus POINT, CENTROID, CENTROID W, PRED, CAT).
model.head()

Unnamed: 0,PRICE,TYPE,LOC,AREA,TERRAIN,DISPONIBL MAINTEN,NEUF,CHAMBRE,PIECE,FLOOR,FLOORS,POINT,CENTROID,CENTROID W,PRED,CAT
0,560000.0,APPART A VENDR,75001,45.0,0.0,0,0,1,2,5,6,"(45.0, 0.0, 0.0, 0.0, 1.0, 2.0, 5.0, 6.0)","(44.285714285714285, 0.0, 0.0, 0.0, 0.85714285...",74.0,628166.7,"(APPART A VENDR, 75001)"
1,849000.0,APPART A VENDR,75012,83.5,0.0,0,0,2,4,1,7,"(83.5, 0.0, 0.0, 0.0, 2.0, 4.0, 1.0, 7.0)","(83.12, 0.0, 0.0, 0.1, 2.3, 3.8, 2.6, 9.5)",210.0,786900.0,"(APPART A VENDR, 75012)"
2,1550000.0,APPART A VENDR,75018,191.9,0.0,0,0,4,5,0,1,"(191.9, 0.0, 0.0, 0.0, 4.0, 5.0, 0.0, 1.0)","(233.70000000000002, 0.0, 0.0, 0.0, 4.66666666...",447.0,3113333.0,"(APPART A VENDR, 75018)"
3,620000.0,APPART A VENDR,75005,51.0,0.0,0,0,2,3,1,6,"(51.0, 0.0, 0.0, 0.0, 2.0, 3.0, 1.0, 6.0)","(50.528571428571425, 0.0, 0.0, 0.0, 1.42857142...",118.0,676285.7,"(APPART A VENDR, 75005)"
5,1180000.0,APPART A VENDR,75016,109.0,0.0,0,0,2,4,2,7,"(109.0, 0.0, 0.0, 0.0, 2.0, 4.0, 2.0, 7.0)","(109.51538461538462, 0.0, 0.0, 0.0, 2.69230769...",610.0,1429333.0,"(APPART A VENDR, 75016)"


In [89]:
# Number of rows kept in the model; get_model_SL returns the frame after dropping rows with missing values.
len(model)

4800

In [90]:
# Example query: the category is a (TYPE, LOC) tuple that must exist in model["CAT"].
cat = ("APPART A VENDR", "75018")
# Eight numeric features, ordered as area, terrain, dispo, neuf, chambre, piece, floor, floors.
pt = (52.5875, 0.0, 0.0, 0.0, 1.25, 2.5, 3.75, 4.125)

In [91]:
# Predict with the in-memory model; the result is a set of PRED values (a single price here).
pred = model_SL_predict(model = model,data = {"CAT": cat,"POINT":pt})
print(pred)

{561718.75}
