In [None]:
# In this notebook we will try to solve the problem of overfitting by using L1 and L2 regularization techniques.
import pandas as pd
import numpy as np


In [None]:
# Load the raw Melbourne housing data into `df`.
# The path is written as a raw string: in a normal string literal the sequences
# \C, \m and \M are invalid escapes (a SyntaxWarning on recent Python versions).
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook so that other people can re-run it.
df = pd.read_csv(r'D:\Coding\melbourne_housing_analisi\Melbourne_housing.csv')
df

In [None]:
# Number of distinct values in each column of the raw frame.
df.nunique()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Now let's look at the data and drop the columns that are not very useful for this analysis,
# such as the date and the latitude. Only the columns listed below are kept.
columns_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 'Bedroom', 'Bathroom', 'Postcode','Car', 'Price']
# .copy() gives df_new its own data, so later column assignments on df_new do not
# raise SettingWithCopyWarning or act on a slice of df.
df_new = df[columns_to_use].copy()
df_new

In [None]:
# (rows, columns) after the column selection.
df_new.shape

In [None]:
# Now let's do some data cleaning: count the missing values in each column.
df_new.isna().sum()

In [None]:
# Several columns contain NaN values, so they need to be handled. For some columns the
# NaN values can simply be filled with 0; other columns (for example the price) need a
# different treatment based on their nature.
# First handle the columns where filling with 0 is enough.

columns_to_fill_0 = ['Car', 'Bathroom', 'Bedroom']
# fillna with a per-column mapping returns a new frame, so nothing is assigned into a
# slice of df and the fill values come from df_new itself.
df_new = df_new.fillna(dict.fromkeys(columns_to_fill_0, 0))
# Treat +/-inf as missing, then drop every row that still has a missing value in any
# of the remaining columns.
df_new = df_new.replace([np.inf, -np.inf], np.nan).dropna()
df_new.isna().sum()

In [None]:
# Safety net: turn any +/-inf value into NaN and drop the rows that contain a missing value.
# No mean imputation happens here; the cell only removes rows.
# Author's note: infinitely large values once stopped the model from training, which is
# why the inf values are dropped as well.
non_finite_values = [np.inf, -np.inf]
df_new = df_new.replace(non_finite_values, np.nan)
df_new = df_new.dropna()


In [None]:
# Re-check the number of missing values per column after the cleaning steps.
df_new.isna().sum()

In [None]:
# We are good to go with our cleaned data. Now create dummy variables for the whole dataset.
# drop_first=True drops the first dummy level of each encoded column (k-1 dummies for
# k categories), which is a shortcut for avoiding the dummy variable trap.
df_new = pd.get_dummies(df_new, drop_first=True)
df_new

In [None]:
# Separate the target (Price) from the predictors (every other column).
y = df_new['Price']
x = df_new.drop(columns='Price')

In [None]:
# Outlier removal.
# The filter works on the cleaned frame df_new and rebuilds x / y from the surviving rows,
# so the models trained below actually see the data without outliers. df_new itself is not
# modified, which keeps this cell safe to re-run.
LOWER_Q, UPPER_Q, FENCE_FACTOR = 0.2, 0.8, 1.5  # 20th / 80th percentiles, wider than the classic quartiles

# Only float/int columns are checked. NOTE(review): the dummy columns produced by
# get_dummies are expected to be bool (uint8 on older pandas) and are therefore skipped.
numeric_cols = df_new.select_dtypes(include=["float", "int"]).columns.tolist()

# A row is kept only if every checked column lies inside its [lower, upper] fence.
inlier_mask = pd.Series(True, index=df_new.index)
for col in numeric_cols:
    q_low = df_new[col].quantile(LOWER_Q)
    q_high = df_new[col].quantile(UPPER_Q)
    spread = q_high - q_low
    lower, upper = q_low - FENCE_FACTOR * spread, q_high + FENCE_FACTOR * spread

    inlier_mask &= df_new[col].between(lower, upper)

df_inliers = df_new.loc[inlier_mask]
x = df_inliers.drop('Price', axis='columns')
y = df_inliers.Price

In [None]:
# Standardise the features before fitting the penalised models.
# StandardScaler is imported from sklearn.preprocessing, its documented public location.
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# statistics of the test rows influence the scaling. Fitting on the training split
# only would avoid that leakage.
x_scaled = scaler.fit_transform(x)

In [None]:
# Now we can move on to the machine learning model. First split the data with train_test_split.
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing and train on the remaining 80%.
# (train_size=0.2 would do the opposite and train on only 20% of the data.)
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.2, random_state=42)

In [None]:
# Baseline model without regularization.
# Price is a continuous target, so this is a regression problem: use LinearRegression,
# plain KFold (stratification needs class labels) and the R^2 metric instead of the
# classification-only LogisticRegression / StratifiedKFold / F1.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score

model = LinearRegression()
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# 5-fold cross-validated R^2 on the training split only; the test split stays untouched.
scores = cross_val_score(model, x_train, y_train, cv=cv, scoring='r2')

print("R2 scores:", scores)
print("Average R2:", scores.mean())

# Fit on the full training split; the next cells score this fitted model.
model.fit(x_train, y_train)


In [None]:
# Score of the baseline model on the held-out test data.
model.score(x_test, y_test)
# Author's observation: the model is so overfit to the training dataset that its score is negative on the testing dataset.

In [None]:
# Author's observation: at the same time, the model performs very well on the training dataset.
model.score(x_train, y_train)

In [None]:
# L1 regularization: fit one Lasso model per penalty strength and print, for each alpha,
# the test score followed by the training score.
from sklearn.linear_model import Lasso


for a in (0.1, 1, 10, 50):
    model = Lasso(alpha=a).fit(x_train, y_train)
    test_score = model.score(x_test, y_test)
    train_score = model.score(x_train, y_train)
    print(a, test_score)
    print(a, train_score)


In [None]:
# Next, the L2 regularization technique: a Ridge regression with alpha=0.1,
# fitted on the training split.
from sklearn.linear_model import Ridge

ridge_model = Ridge(alpha=0.1).fit(x_train, y_train)
ridge_model

In [None]:
# Score of the Ridge model on the held-out test data.
ridge_model.score(x_test,y_test)
# Author's observation: with L2 regularization the model is also much better, but L1 regularization seems slightly better than L2 in this case.

In [None]:
# Score of the Ridge model on the training data, for comparison with the test score.
ridge_model.score(x_train,y_train)

In [None]:
# Elastic Net mixes the L1 and L2 penalties; l1_ratio=0.5 weights them equally.
# Fit on the training split and show the score on the test split.
from sklearn.linear_model import ElasticNet


ela = ElasticNet(l1_ratio=0.5, alpha=0.1).fit(x_train, y_train)
ela.score(x_test, y_test)
