In [336]:
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10,6)
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_squared_error

In [337]:
df = pd.read_csv('/content/drive/MyDrive/Facultate/year1-sem2/big-data/data/cleaned_data.csv', parse_dates=['Date'])
df.head(2)

Unnamed: 0,Price,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,Regionname,Propertycount
0,1480000,Abbotsford,85 Turner St,2,h,S,Biggin,2016-12-03,2.5,3067,...,1,1,202.0,152.826939,1966.516327,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019
1,1035000,Abbotsford,25 Bloomburg St,2,h,S,Biggin,2016-02-04,2.5,3067,...,1,0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019


# Dropping useless features and convertind date to UNIX

In [338]:
bad_columns = [
  "Address",
  "Latitude",
  "Longitude",
  "Propertycount"
]

In [339]:
df['Date'] = (df['Date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

In [340]:
df.drop(columns=bad_columns, axis=1, inplace=True)
df.head(2)

Unnamed: 0,Price,Suburb,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Regionname
0,1480000,Abbotsford,2,h,S,Biggin,1480723200,2.5,3067,2,1,1,202.0,152.826939,1966.516327,Yarra City Council,Northern Metropolitan
1,1035000,Abbotsford,2,h,S,Biggin,1454544000,2.5,3067,2,1,0,156.0,79.0,1900.0,Yarra City Council,Northern Metropolitan


# One hot encoding for categorical features

In [341]:
ohc_columns = [
  "Suburb",
  "Postcode",
  "Type",
  "Method",
  "SellerG",
  "CouncilArea",
  "Regionname"
]
df = pd.get_dummies(df, columns=ohc_columns)
df

Unnamed: 0,Price,Rooms,Date,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,...,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Regionname_Eastern Metropolitan,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
0,1480000,2,1480723200,2.5,2,1,1,202.000000,152.826939,1966.516327,...,1,0,0,0,1,0,0,0,0,0
1,1035000,2,1454544000,2.5,2,1,0,156.000000,79.000000,1900.000000,...,1,0,0,0,1,0,0,0,0,0
2,1465000,3,1488585600,2.5,3,2,0,134.000000,150.000000,1900.000000,...,1,0,0,0,1,0,0,0,0,0
3,850000,3,1488585600,2.5,3,2,1,94.000000,152.826939,1966.516327,...,1,0,0,0,1,0,0,0,0,0
4,1600000,4,1464998400,2.5,3,1,2,120.000000,142.000000,2014.000000,...,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27160,1480000,4,1519430400,6.3,4,1,3,593.000000,152.826939,1966.516327,...,0,0,0,0,0,0,0,0,1,0
27161,888000,2,1519430400,6.3,2,2,1,98.000000,104.000000,2018.000000,...,0,0,0,0,0,0,0,0,1,0
27162,705000,2,1519430400,6.3,2,1,2,220.000000,120.000000,2000.000000,...,0,0,0,0,0,0,0,0,1,0
27163,1140000,3,1519430400,6.3,3,1,2,490.855698,152.826939,1966.516327,...,0,0,0,0,0,0,0,0,1,0


# Defining method for model comparison

In [342]:
models = [
  Lasso(), 
  Ridge(), 
  SVR(),
  MLPRegressor(max_iter=500)
]
evaluation_df = pd.DataFrame(columns=['Decomposition method'] + models)

In [343]:
def run_models(data, target, models, decomposition_method="N/A"):
  scores = [decomposition_method]
  X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=1, shuffle=False)

  for model in models:
    print(f'Training model {type(model).__name__}')
    regressor = model.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    scores.append(mean_squared_error(y_test, y_pred))
  return pd.DataFrame([scores], columns=['Decomposition method'] + models)

# PCA decomposition: check this link for more info on how to choose n_components
https://www.mikulskibartosz.name/pca-how-to-choose-the-number-of-components/#:~:text=in%20understanding%20PCA.-,Short%20answer,explained%20by%20the%20generated%20components.

In [344]:
scaler = MinMaxScaler()
scaled_df = scaler.fit_transform(df)
scaled_df

array([[0.12550607, 0.06666667, 0.39794608, ..., 0.        , 0.        ,
        0.        ],
       [0.08547009, 0.06666667, 0.00898588, ..., 0.        , 0.        ,
        0.        ],
       [0.12415655, 0.13333333, 0.51476252, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.05578048, 0.06666667, 0.97304236, ..., 0.        , 1.        ,
        0.        ],
       [0.09491678, 0.13333333, 0.97304236, ..., 0.        , 1.        ,
        0.        ],
       [0.08412056, 0.06666667, 0.97304236, ..., 0.        , 1.        ,
        0.        ]])

In [345]:
pca = PCA(n_components = 0.95)
scaled_df = pca.fit_transform(scaled_df)
scaled_df.shape
# from 956 features to 222

(27165, 222)

In [346]:
X = scaled_df[:, 1:]
y = scaled_df[:, 0]

In [347]:
pca_result_row = run_models(X, y, models, decomposition_method="PCA")
evaluation_df = evaluation_df.append(pca_result_row, ignore_index=True)
evaluation_df

Training model Lasso
Training model Ridge
Training model SVR
Training model MLPRegressor


Unnamed: 0,Decomposition method,Lasso(),Ridge(),SVR(),MLPRegressor(max_iter=500)
0,PCA,0.400925,0.451087,0.008059,0.000484


# Next decomposition methods same names