# Gaussian Process Regression

# //TODO LIST
## Data import
## Data cleaning
### duplicate remover
### 

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Data Import

In [4]:
# Location of the listings CSV on GitHub.
url = 'https://raw.githubusercontent.com/LeoPeink/GPRegression/refs/heads/master/sale_clean.csv'

# Read the CSV with its first column as the index, then turn that index back
# into a regular column so the frame ends up with a default integer index.
# Note: a warning about columns 10 and 11 is expected here. They are useless
# for the regression and are removed from the dataset later anyway.
df = pd.read_csv(url, index_col=0).reset_index()

# Uncomment to preview the first rows:
# df.head(5)

HTTPError: HTTP Error 404: Not Found

## Data Cleaning
The dataset contains a lot of duplicate rows. After careful consideration, and given that the original dataset was scraped from immobiliare.it, I concluded that they were probably caused by a scraping error.
In the unlikely case that the duplicate rows were actually different houses with exactly the same characteristics, the adopted policy was still to remove them, to avoid excessively reducing the dataset variance.

Also, the amount of features is way overkill for what we need in our didactic demo. All features except prezzo, stanze, bagni and superficie will be removed.

Finally, due to the scope of the project (house price regression), all records with a missing value in any of the remaining columns (including the price) were also dropped.

In [None]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(df):
    """Return a de-duplicated, column-reduced copy of ``df`` without NaNs.

    The steps are applied in this order:

    1. rows that are exact duplicates across all columns are dropped;
    2. the 29 listing-attribute columns listed below are dropped;
    3. every row that still has at least one missing value is dropped.

    ``df`` must contain all of the listed columns, otherwise
    ``DataFrame.drop`` raises ``KeyError``. The input frame is not modified;
    a new frame is returned.
    """
    unused_columns = [
        'regione', 'citta', 'quartiere', 'datetime', 'posti auto',
        'bagni per stanza', 'ultimo piano', 'stato', 'classe energetica',
        'vista mare', 'riscaldamento centralizzato', 'arredato', 'balcone',
        'impianto tv', 'esposizione esterna', 'fibra ottica',
        'cancello elettrico', 'cantina', 'giardino comune',
        'giardino privato', 'impianto allarme', 'portiere', 'piscina',
        'villa', 'intera proprieta', 'appartamento', 'attico', 'loft',
        'mansarda',
    ]
    return df.drop_duplicates().drop(columns=unused_columns).dropna()

# Clean a copy, so the raw frame `df` is left untouched.
df_clean = clean_data(df.copy())
# Last expression of the cell: the notebook renders the first rows.
df_clean.head()

# Exploratory analysis
The individual features are analyzed using the following framework:

## Pricing hell
The price feature has HORRIBLE values.
The majority of houses are almost free, and some houses cost more euros than there are grains of sand on Earth (10^20). The IQR strategy does not work for the low outliers here, so I decided to trim the data at the 5% and 95% quantiles:

In [None]:
# Summary statistics and boxplot of the 'prezzo' column before any trimming.
raw_prices = df_clean['prezzo']
print('Raw price distribution:')
print(raw_prices.describe())

plt.boxplot(raw_prices)
plt.gca().set(title='Boxplot of House Prices (raw data)', ylabel='Price (€)')
plt.show()


In [None]:
# Trim the extreme prices: keep only the rows whose 'prezzo' lies between the
# 5% and the 95% quantile (both bounds included).
q_low = df_clean['prezzo'].quantile(0.05)
q_high = df_clean['prezzo'].quantile(0.95)
print(f"5% quantile: {q_low}")
print(f"95% quantile: {q_high}")
df_clean = df_clean[df_clean['prezzo'].between(q_low, q_high)]

# Histogram of the trimmed prices.
plt.hist(df_clean['prezzo'], bins=50, edgecolor='black')

print(df_clean['prezzo'].describe())
# Last expression of the cell: the notebook renders the first rows.
df_clean.head()

Very low prices are considered unrealistic, and fit well (along with the very high ones we had from the start) with the bad-scraping hypothesis. For our objective, dropping all prices under 10,000€ seems reasonable.

In [None]:
# Discard listings priced below 10000 and plot the remaining prices.
at_least_10k = df_clean['prezzo'].ge(10000)
df_clean = df_clean[at_least_10k]
plt.hist(df_clean['prezzo'], bins=50, edgecolor='black')


The next step in preprocessing is to apply a log scale to `prezzo`, so that as much data as possible can be kept in the dataset for the regression. The result is a bimodal distribution.

In [None]:
# Replace 'prezzo' with its natural logarithm.
# df_clean was produced by boolean filtering in the previous cells, so
# assigning a column on it in place can trigger pandas'
# SettingWithCopyWarning; assign() builds a new frame instead.
# NOTE: re-running this cell applies the logarithm a second time.
df_clean = df_clean.assign(prezzo=np.log(df_clean['prezzo']))

# Histogram of 'prezzo' after the log transformation.
plt.hist(df_clean['prezzo'], bins=50, edgecolor='black')
plt.title('Histogram of House Prices (log transformed)')
plt.xlabel('Log Price (€)')
plt.ylabel('Frequency')
plt.show()

## Bathrooms cleaning (lol)

Let's see how the bathrooms are distributed:

In [None]:
# Bathrooms ('bagni'): summary statistics and histogram.
bathrooms = df_clean['bagni']
print('Bathrooms distribution:')
print(bathrooms.describe())
plt.hist(bathrooms, bins=30, edgecolor='black')
plt.gca().set(
    title='Histogram of Bathrooms',
    xlabel='Number of Bathrooms',
    ylabel='Frequency',
)
plt.show()

df_clean.head()  # mid-cell expression: evaluated, but not displayed
print(df_clean.describe())

# TODO: how to check whether missing rooms are correlated with a low surface?
# Surface ('superficie'): summary statistics and histogram.
surface = df_clean['superficie']
print('Superfice distribution:')
print(surface.describe())
plt.hist(surface, bins=50, edgecolor='black')
plt.gca().set(
    title='Histogram of Superfice',
    xlabel='Superfice (m^2)',
    ylabel='Frequency',
)
plt.show()


## Rooms time!
Let's now see how the rooms are distributed:


In [None]:
# Rooms ('stanze'): summary statistics and a 5-bin histogram.
rooms = df_clean['stanze']
print('Rooms distribution:')
print(rooms.describe())
plt.hist(rooms, bins=5, edgecolor='black')
plt.gca().set(
    title='Histogram of rooms',
    xlabel='Number of rooms',
    ylabel='Frequency',
)
plt.show()

df_clean.head()  # mid-cell expression: evaluated, but not displayed
print(df_clean.describe())

## Correlation analysis
Let's see if our variables are correlated. I expected stronger correlations given the real estate nature of the dataset. The low correlation between prezzo and superficie might be explained by the geographical information that was removed from our data. Still, given the HORRORS I saw in the original dataset's distribution, I feel happy anyway.

In [None]:
# Pairwise correlation of the target and the three features, drawn as a
# heat map with the coefficient written inside every cell.
corr = df_clean[['prezzo', 'superficie', 'stanze', 'bagni']].corr()
print('Correlation matrix:')

labels = corr.columns
tick_positions = range(len(labels))
plt.matshow(corr, cmap='coolwarm')
plt.xticks(tick_positions, labels, rotation=45)
plt.yticks(tick_positions, labels)
plt.colorbar()

# Annotate each square with its value; x is the column, y is the row.
for (row, col), value in np.ndenumerate(corr.to_numpy()):
    plt.text(col, row, f'{value:.2f}',
             ha='center', va='center', fontsize=10, fontweight='bold')

plt.title('Correlation Matrix', pad=20)
plt.show()

print(corr)

# Regression analysis
The goal is now to predict prezzo (dependent variable) using bagni, stanze and superficie as features (independent variables).


### Train-test split
I proceed to split the dataset into training and test sets.
We will later use K-fold cross validation on the training set.

In [None]:
#import standard scaler
from sklearn.preprocessing import StandardScaler
#import personal library from local folder
from LPEG import lpeg_regressions as lpr
from LPEG import lpeg_preprocessing as lpp

In [None]:
#train test split
from sklearn.model_selection import train_test_split
# Features: surface, rooms and bathrooms. Target: the 'prezzo' column.
X = df_clean[['superficie', 'stanze', 'bagni']]
y = df_clean['prezzo']
# Hold out 20% of the rows as the test set; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Standardize the features with statistics learned on the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) 
X_test = scaler.transform(X_test) #NB: do not fit the scaler on the test set to avoid data leakage

# NOTE(review): the return values of add_bias_term are discarded. By default
# StandardScaler returns NumPy arrays, which cannot gain a column in place, so
# the bias column is presumably never added to X_train / X_test. Confirm
# against the LPEG implementation; if it returns a new array, the result
# should be assigned back.
lpp.add_bias_term(X_train)
lpp.add_bias_term(X_test)


In [None]:
# Fit the regression weights with the project's gradient-descent routine,
# passing the squared-loss gradient and the squared loss from LPEG.
# See the LPEG library for the exact meaning of lam, w_0, alpha, t_max, tol
# and fixed_alpha.
# Closed-form alternative, kept for reference:
# w = lpr.closed_form_linear_regression(X_train, y_train)
ws, losses = lpr.gradientDescent(lpr.squaredLossGradient, lpr.squaredLoss, X_train, y_train, lam=0, w_0=None, alpha=0.1, t_max=1000, tol=1e-15, fixed_alpha=True)

# NOTE(review): assumes `ws` is the final weight vector, so that the product
# below is one prediction per training row. Confirm against LPEG.
y_pred_train = X_train @ ws

# Mean squared error on the training set. This used to be a bare expression
# in the middle of the cell, so its value was computed but never displayed.
train_mse = np.mean((y_train - y_pred_train) ** 2)
print(f'Train MSE: {train_mse}')

# Plot the loss values returned by the gradient-descent call.
plt.plot(losses)
