In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Load the training set. DataFrame.info() prints its report itself and returns
# None, so it is called directly rather than wrapped in print() (which would
# add a stray "None" line to the output).
raw_data = pd.read_csv("data/train.csv")
raw_data.info()

Select the features used for modelling

In [None]:
# Use every column of the training file as a candidate feature, except the
# "SalePrice" target and the "Id" column (if present): a row identifier says
# nothing about the house and should not be fed to the model.
selected_features = [col for col in raw_data.columns if col not in ("Id", "SalePrice")]

# `data` keeps the target as an extra column for the exploratory cells;
# `selected_features` itself lists the model inputs only.
data = raw_data[selected_features + ["SalePrice"]].copy()

# info() prints its own report and returns None, so it is not wrapped in print().
data.info()
print(data.describe())

In [None]:
# Number of missing values per column. DataFrame.isnull() is an alias of
# DataFrame.isna(), so one call is enough; the second print only repeated
# the same table.
print(data.isna().sum())

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# NOTE(review): `encoder` is not referenced by any other visible cell; it is
# kept only in case some hidden cell relies on it.
encoder = OrdinalEncoder()

# Discretisation of year / bedroom-count features: column -> (bin edges, labels).
bin_specs = {
    "YearRemodAdd": ([0, 1970, 1990, 2000, 2050], ["a", "b", "c", "d"]),
    "YearBuilt": ([0, 1900, 1950, 1980, 2000, 2010, 2050], ["a", "b", "c", "d", "e", "f"]),
    "BedroomAbvGr": ([0., 1.1, 2.1, 3.1, 4.1, 10.], ["a", "b", "c", "d", "e"]),
}

for column, (edges, labels) in bin_specs.items():
    # Always bin the untouched values from raw_data (same row index as `data`)
    # so that the cell can be re-run without binning an already-binned column.
    # include_lowest=True makes the first interval left-inclusive; with the
    # pd.cut defaults a value equal to the first edge (e.g. 0 bedrooms) falls
    # outside every bin and silently becomes NaN.
    data[column] = pd.cut(raw_data[column], edges, labels=labels, include_lowest=True)

# Disabled experiments, kept as real comments. They used to live in a
# triple-quoted string, which was the last expression of the cell and was
# therefore echoed as the cell's output.
# data['PoolArea'] = data['PoolArea'].isin([0]).astype(int)
# data["OpenPorchSF"] = np.log(1 + data["OpenPorchSF"])
# data["EnclosedPorch"] = np.log(1 + data["EnclosedPorch"])
# data["3SsnPorch"] = np.log(1 + data["3SsnPorch"])
# data["ScreenPorch"] = np.log(1 + data["ScreenPorch"])
# data["LotFrontage"] = np.log(1 + data["LotFrontage"])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the numeric columns of `data` only.
numeric_frame = data.select_dtypes(include=[np.number])
corr_matrix = numeric_frame.corr()

# Draw the correlation matrix as a heatmap on an explicitly sized figure
# (adjust figsize as needed).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Matrice di Correlazione')

# Render the figure.
plt.show()

In [None]:
# Value counts and a histogram for every column of `data`.
for f in data.columns:
    print(f)
    print(data[f].value_counts())
    # One fresh figure per column: if hist() fails half-way, nothing is left
    # behind on the "current" figure for the next column to draw on top of.
    fig, ax = plt.subplots()
    try:
        ax.hist(data[f])
    except Exception as exc:
        # Still best-effort (some columns cannot be histogrammed as-is), but
        # the failure is reported instead of being silently swallowed.
        print(f"Skipping histogram for {f}: {type(exc).__name__}: {exc}")
    else:
        ax.set_title(f)
        plt.show()
    finally:
        # Release the figure so a long loop does not accumulate open figures.
        plt.close(fig)


In [None]:
# Work with the natural log of the sale price (predictions are mapped back
# with np.exp when the output file is written). The log is computed from the
# untouched raw_data column, so re-running this cell does not apply the log a
# second time to an already-transformed column.
data["SalePrice"] = np.log(raw_data["SalePrice"])

fig, ax = plt.subplots()
ax.hist(data["SalePrice"])
ax.set(title="log(SalePrice)", xlabel="log(SalePrice)", ylabel="count")
plt.show()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from scipy.stats import mode

# Preprocessing pipelines, one per kind of column.

# Positive numeric columns: mean imputation -> natural log -> standardisation.
# np.log gives -inf / NaN for zero or negative inputs, so only strictly
# positive columns should be routed through this pipeline.
log_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('log', FunctionTransformer(np.log, feature_names_out='one-to-one')),
    ('std', StandardScaler())
])

# Other numeric columns: mean imputation -> standardisation.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('std', StandardScaler())
])

# Categorical columns: most-frequent imputation -> one-hot encoding.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# an all-zero row instead of raising ValueError at transform time (e.g. on a
# test set containing a level absent from the training data).
obj_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

def binarizer(X):
    """Flag, column by column, the entries equal to that column's most frequent value.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Input values; columns may hold different kinds of values (numbers,
        strings) when the array has object dtype.

    Returns
    -------
    numpy.ndarray of int, shape (n_samples, n_features)
        1 where the entry equals the most frequent value of its own column,
        0 elsewhere. A column without any non-missing value yields all zeros.

    Notes
    -----
    The mode is computed separately for every column. Taking it over the
    flattened array would let the most common value of one column decide the
    flag of every other column, turning those columns into a constant 0.
    On ties, the first value returned by ``Series.mode()`` is used.
    """
    X = np.asarray(X)
    flags = np.zeros(X.shape, dtype=int)
    for j in range(X.shape[1]):
        column = pd.Series(X[:, j])
        modes = column.mode()  # missing values are ignored by default
        if modes.empty:
            continue
        flags[:, j] = (column == modes.iloc[0]).to_numpy()
    return flags
    
# Columns reduced to a binary "equals the most frequent value" flag:
# most-frequent imputation -> binarizer -> one-hot encoding of the flag.
# NOTE: FunctionTransformer is stateless, so `binarizer` recomputes the most
# frequent value on whatever batch is passed to fit/transform; nothing is
# learned from the training data at that step.
# handle_unknown='ignore' keeps transform from raising when a batch produces
# a flag value that never occurred during fit (e.g. a constant training column).
binary_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('binary', FunctionTransformer(binarizer, feature_names_out='one-to-one')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Column groups by dtype.
std_cols = ["LotArea"]

# Numeric feature columns (target excluded). errors="ignore" lets the cell be
# re-run after "SalePrice" has already been removed from `data`.
minmax_cols = data.select_dtypes(include=np.number).columns.drop("SalePrice", errors="ignore")

# Object-dtype (string) columns. Columns of `category` dtype are not matched
# by include=object.
category_cols = data.select_dtypes(include=object).columns

In [None]:
from sklearn.compose import ColumnTransformer

# Split the target off the feature frame. The guard makes the cell re-runnable:
# once 'SalePrice' has been removed from `data`, a second run keeps the
# existing y_train instead of failing with a KeyError.
if 'SalePrice' in data.columns:
    y_train = data['SalePrice'].astype(dtype=float)
    data = data.drop('SalePrice', axis=1)

# Column routing:
#   binary_columns -> flag "equals the most frequent value"
#   obj_columns    -> every non-int/float column except the binary ones
#   log_columns    -> log-transformed, then standardised
#   everything else (remainder) -> num_pipeline
binary_columns = ['PoolArea', 'RoofMatl']
obj_columns = data.select_dtypes(exclude=[int, float]).columns.drop(binary_columns, errors='ignore')
log_columns = ["LotArea", "GrLivArea", "LotFrontage"]

# Combine transformers using ColumnTransformer
preprocessing = ColumnTransformer(
    transformers=[
        ('obj', obj_pipeline, obj_columns),
        ('log', log_pipeline, log_columns),
        ('bin', binary_pipeline, binary_columns)
    ], remainder=num_pipeline
)

# Wrap the preprocessor in a pipeline; the same fitted `pipeline` object is
# reused later to transform the test set.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing)
])


X_train = pipeline.fit_transform(data)

In [None]:
# Distribution of the training target as it is fed to the models.
# plt.show() renders the figure and keeps the cell from echoing the raw
# (counts, bin edges, patches) tuple returned by plt.hist().
plt.hist(y_train)
plt.title("Training target distribution")
plt.xlabel("y_train")
plt.ylabel("count")
plt.show()

In [None]:
from sklearn.svm import SVR as SVR
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
# Candidate regressors, keyed by the name used for their hyperparameter grid.
models = {
    'SVM': SVR(),
    'KRidge': KernelRidge(),
    'Ridge' : Ridge(random_state=10, max_iter=50000)
}

# Define the hyperparameter grids for each model.
# A grid may be a list of dicts (GridSearchCV explores each dict separately).
# That is used here so `degree` is only varied for the polynomial kernel: the
# other kernels ignore it, and crossing it with them just refits identical
# models several times.
param_grids = {
    'SVM': [
        {'kernel' : ['linear', 'sigmoid']},
        {'kernel' : ['poly'], 'degree' : [2, 3, 4]}
    ],
    'Ridge' : {
        'alpha' : [8.0, 9.0, 10.0, 11.0],
        'solver' : ['auto', 'lsqr', 'sag']
    },
    # 'poly' and 'polynomial' name the same kernel in scikit-learn, so only
    # one of the two aliases is searched.
    'KRidge' : [
        {
            'alpha' : [0.1, 0.2, 0.3, 0.4],
            'gamma' : [None],
            'kernel' : ["linear"]
        },
        {
            'alpha' : [0.1, 0.2, 0.3, 0.4],
            'gamma' : [None],
            'kernel' : ["poly"],
            'degree' : [1, 2, 3]
        }
    ]
}

In [None]:
# Train and tune the models
from sklearn.model_selection import GridSearchCV

# One fitted GridSearchCV per model name. Scoring is the negated RMSE
# (scikit-learn maximises scores), so the sign is flipped when reporting.
grids = {}
for model_name, model in models.items():
    print(f'Training and tuning {model_name}...')
    grids[model_name] = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
        verbose=1,
    )
    grids[model_name].fit(X_train, y_train.values.ravel())
    best_params = grids[model_name].best_params_
    best_score = grids[model_name].best_score_

    print(f'Best parameters for {model_name}: {best_params}')
    # The reported value is a cross-validated RMSE, not an accuracy.
    print(f'Best CV RMSE for {model_name}: {-best_score}\n')

In [None]:
# Cross-validated RMSE of the KernelRidge grid search as a function of alpha.
cv_table = pd.DataFrame(grids['KRidge'].cv_results_)
cv_table["alpha"] = cv_table["param_alpha"].astype(float)
cv_table["rmse"] = -cv_table["mean_test_score"]  # scores are negated RMSE

# Several candidates share the same alpha (one per kernel/degree setting).
# Joining all of them into a single line sorted by alpha gives a meaningless
# zig-zag, so one curve is drawn per setting of the remaining hyperparameters.
cv_table["config"] = cv_table["params"].apply(
    lambda p: ", ".join(f"{k}={v}" for k, v in sorted(p.items()) if k != "alpha") or "default"
)

# Plot
plt.figure(figsize=(8, 5))
for config, group in cv_table.groupby("config", sort=True):
    group = group.sort_values("alpha")
    plt.plot(group["alpha"], group["rmse"], marker='o', linestyle='-', label=config)
plt.xscale('log')  # useful when the alphas span several orders of magnitude
plt.xlabel('Alpha')
plt.ylabel('RMSE')
plt.title('KRidge Regression RMSE')
plt.legend(title='Other hyperparameters', fontsize='small')
plt.grid(True)
plt.show()

In [None]:
# Load the test set and keep the same feature columns as the training frame.
test_raw_data = pd.read_csv("data/test.csv")
test_data = test_raw_data[selected_features].copy()

# Same discretisation as applied to the training features:
# column -> (bin edges, labels).
bin_specs = {
    "YearRemodAdd": ([0, 1970, 1990, 2000, 2050], ["a", "b", "c", "d"]),
    "YearBuilt": ([0, 1900, 1950, 1980, 2000, 2010, 2050], ["a", "b", "c", "d", "e", "f"]),
    "BedroomAbvGr": ([0., 1.1, 2.1, 3.1, 4.1, 10.], ["a", "b", "c", "d", "e"]),
}

for column, (edges, labels) in bin_specs.items():
    # include_lowest=True makes the first interval left-inclusive; with the
    # pd.cut defaults a value equal to the first edge (e.g. 0 bedrooms) falls
    # outside every bin and silently becomes NaN.
    test_data[column] = pd.cut(test_data[column], edges, labels=labels, include_lowest=True)

# PoolArea is deliberately left untouched here. The training frame passes the
# raw PoolArea values to the pipeline (the manual binarisation is commented
# out there), so pre-binarising only the test set would make the two inputs
# diverge. The fitted pipeline's binary step handles the column for both.

X_test = pipeline.transform(test_data)

In [None]:
import csv

# Predict the test targets with the tuned KernelRidge search; the values are
# on the log scale the models were trained on.
kridge_search = grids['KRidge']
pred = kridge_search.predict(X_test)

In [None]:
# Write the predictions as "Id,SalePrice" rows. Predictions are on the log
# scale, so np.exp maps them back to prices.
# NOTE(review): ids are generated as consecutive integers starting at 1461 —
# confirm this matches the Id column of the test file.
FIRST_TEST_ID = 1461

# The context manager closes the file even if a write fails part-way.
with open("out.csv", "w") as f:
    f.write("Id,SalePrice")
    for offset, log_price in enumerate(pred):
        f.write("\n")
        f.write(str(offset + FIRST_TEST_ID) + "," + str(np.exp(log_price)))