In [42]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import MinMaxScaler
import sklearn
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import joblib

def sort_csv_files(folder_path):
    files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    dfs = []
    for file in files:
        df = pd.read_csv(os.path.join(folder_path, file), low_memory=False)
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)

def filter_valfonc(df):
    mutation = df
    mutation = mutation[~mutation["valeurfonc"].isna()]
    mutation = mutation[mutation["valeurfonc"] >= 1000]
    mutation = mutation[mutation["valeurfonc"] <= 1000000]
    return mutation

def add_arrondissement(df):
    n_col = df["l_codinsee"].apply(lambda s: s[-4:-2] if s[2:4]=="75" else 0)
    #n_col = n_col.rename("arro")
    df["arro"] = n_col.astype("int")

def sum_surface(df):
    df["surf"] = (df["sbati"] 
                  + df["sbatmai"] 
                  + df["sbatapt"]
                  + df["sbatact"])

def create_min_max_scaler():
    scaler = StandardScaler()
    return scaler
    
def min_max_scale(df, scaler):
    df['valeurfonc'] = scaler.fit_transform(df['valeurfonc'].values.reshape(-1, 1))
    
def min_max_rescale(Series, scaler):
    Series = scaler.inverse_transform(Series.reshape(-1, 1))
    return Series

def X_y_split(df):
    features = ["anneemut", "moismut", "nblot", "sterr", "nbvolmut", "nblocmut",
                "nblocmai", "nblocapt", "nblocdep", "nblocact", "latitude", "longitude",
                "arro", "surf"
               ]
    X = df[features]
    y = df["valeurfonc"]
    return X, y

def training(X_train, Y_train, save=False, pkl_file_path=None): 
    from xgboost import XGBRegressor
    xgboost = XGBRegressor()
    xgboost.fit(X_train, Y_train)
    if save:
        joblib.dump(xgboost, pkl_file_path)
    return xgboost

def predict(xgboost, X_test):
    y_pred = xgboost.predict(X_test)
    return y_pred

def save_model(pkl_file_path):
    joblib.dump(xgboost, pkl_file_path)
    
def load_model(pkl_file_path):
    xgboost = joblib.load(pkl_file_path)
    return xgboost

# Let's define a function for each metrics
# R²
def rsqr_score(test, pred):
    """Calculate Root Mean Square Error score 

    Args:
        test -- test data
        pred -- predicted data

    Returns:
        Root Mean Square Error score
    """
    rsqr_ = r2_score(test, pred)
    return rsqr_

def rmse_score(test, pred):
    """Calculate Root Mean Square Error score 

    Args:
        test -- test data
        pred -- predicted data

    Returns:
        Root Mean Square Error score
    """
    rmse_ = np.sqrt(mean_squared_error(test, pred))
    return rmse_

def mse_score(test, pred):
    """Calculate Root Mean Square Error score 

    Args:
        test -- test data
        pred -- predicted data

    Returns:
        Root Mean Square Error score
    """
    mse_ = mean_squared_error(test, pred)
    return mse_

def mae_score(test, pred):
    """Calculate Mean Absolute Error score 

    Args:
        test -- test data
        pred -- predicted data

    Returns:
        Mean Absolute Error score
    """
    mae_ = mean_absolute_error(test, pred)
    return mae_

# Print the scores
def print_score(test, pred):
    """Print calculated score 

    Args:
        test -- test data
        pred -- predicted data

    Returns:
        print the R squared score
        print Root Mean Square Error score
        print Mean Square Error score
        print Mean Absolute Error score
    """

    print(f"R²: {rsqr_score(test, pred)}")
    print(f"RMSE: {rmse_score(test, pred)}")
    print(f"MSE: {mse_score(test, pred)}")
    print(f"MAE: {mae_score(test, pred)}")

In [5]:
from time import time

start = time()

path = "data_right_price/data_localisee/"
df_1 = sort_csv_files(path)
mutation = filter_valfonc(df_1)
add_arrondissement(mutation)
sum_surface(mutation)
scaler = create_min_max_scaler()
min_max_scale(mutation, scaler)
X, y = X_y_split(mutation)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

  df_1 = sort_csv_files(path)


In [11]:
xgboost = training(X_train, Y_train)
y_pred = predict(xgboost, X_test)

print(f"{round(time()-start, 2)}sec")

478.48sec


In [36]:
def min_max_rescale(Series, scaler):
    Series = scaler.inverse_transform(Series.reshape(-1, 1))
    return Series

In [34]:
min_max_rescale(Y_test.values, scaler)

array([[250000.],
       [400000.],
       [240000.],
       ...,
       [166990.],
       [412350.],
       [231900.]])

In [35]:
min_max_rescale(y_pred, scaler)

array([[232494.16],
       [372414.28],
       [235010.12],
       ...,
       [188442.95],
       [515082.8 ],
       [211262.1 ]], dtype=float32)

In [43]:
print_score(min_max_rescale(Y_test.values, scaler), \
            min_max_rescale(y_pred, scaler))

R²: 0.7389615416338329
RMSE: 93403.10814039613
MSE: 8724140610.286533
MAE: 58887.57002055114
