In [1]:
import pandas as pd
import sqlite3
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

# Load latitude, longitude and unit-area price from the SQLite database.
# sqlite3 connections are not closed automatically when an error occurs, so
# the close is placed in a `finally` clause to release the handle even if the
# query fails (e.g. missing table or column).
conn = sqlite3.connect("../database/House_prices.db")
try:
    df = pd.read_sql_query(
        "SELECT `X5 latitude` as latitude, `X6 longitude` as longitude, `Y house price of unit area` as price FROM Dati_Processati",
        conn,
    )
finally:
    conn.close()

# Features are the two geographic coordinates; the target is the price column.
X = df.loc[:, ["latitude", "longitude"]]
y = df.loc[:, "price"]

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Some models need standardized inputs: the scaler is fitted on the training
# split only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest hyper-parameters chosen to reduce overfitting.
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)

# Decision tree hyper-parameters chosen to reduce overfitting.
dt_params = dict(
    max_depth=8,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
)

# Registry of the regressors to compare, keyed by display name.
# Insertion order is kept because it determines the row index of the results.
models = dict([
    ("Random Forest", RandomForestRegressor(**rf_params)),
    ("Linear Regression", LinearRegression()),
    ("Ridge Regression", Ridge()),
    ("KNN", KNeighborsRegressor()),
    ("Decision Tree", DecisionTreeRegressor(**dt_params)),
])

results = []


def regression_accuracy(y_true, y_pred, tolerance=0.1):
    """Return the fraction of predictions within a relative tolerance.

    A prediction counts as accurate when its relative error
    ``|y_pred - y_true| / |y_true|`` is strictly smaller than ``tolerance``.

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.
        tolerance: Maximum relative error (exclusive) for a prediction to be
            counted as accurate. Defaults to 0.1 (10%).

    Returns:
        The share of accurate predictions as a float in [0, 1], or NaN when
        the inputs are empty.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_pred - y_true)
    if abs_error.size == 0:
        # Nothing to score; avoid NumPy's "mean of empty slice" warning.
        return np.nan

    # Targets equal to zero make the relative error undefined. Silence the
    # division warnings and resolve those entries explicitly below.
    with np.errstate(divide="ignore", invalid="ignore"):
        rel_error = abs_error / np.abs(y_true)

    # 0/0 yields NaN: an exact prediction of a zero target is a perfect hit,
    # so give it zero relative error. A wrong prediction of a zero target
    # stays at +inf and is therefore never within tolerance.
    exact_zero_hit = (y_true == 0) & (abs_error == 0)
    rel_error = np.where(exact_zero_hit, 0.0, rel_error)

    return np.mean(rel_error < tolerance)

# Fit every model and record its tolerance-based accuracy on both splits.
for name, model in models.items():
    # Linear, ridge and KNN regressors are trained on the standardized
    # features; the tree-based models use the raw coordinates.
    uses_scaled_inputs = name in ["Linear Regression", "Ridge Regression", "KNN"]
    train_inputs = X_train_scaled if uses_scaled_inputs else X_train
    test_inputs = X_test_scaled if uses_scaled_inputs else X_test

    model.fit(train_inputs, y_train)
    y_train_pred = model.predict(train_inputs)
    y_test_pred = model.predict(test_inputs)

    train_acc = regression_accuracy(y_train, y_train_pred)
    test_acc = regression_accuracy(y_test, y_test_pred)

    row = {
        "Model": name,
        "Train Accuracy": train_acc,
        "Test Accuracy": test_acc,
    }
    results.append(row)

# Show the results, best test accuracy first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="Test Accuracy", ascending=False)
print(results_df)




               Model  Train Accuracy  Test Accuracy
3                KNN        0.558912       0.542169
0      Random Forest        0.628399       0.506024
4      Decision Tree        0.643505       0.457831
1  Linear Regression        0.353474       0.361446
2   Ridge Regression        0.356495       0.349398
