In [None]:
# Improvements:
# Replace msising values with median or average and then see the impact on the metrics.
# Adding categerocial data

In [None]:
# Group 3

In [29]:
import pandas as pd
import numpy as np
import statsmodels as sm
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [36]:
# load dataset
df = pd.read_csv('/Users/muhammadraza/Documents/GitHub/BIPM/Data Science/data/Dataset_Melbourne.csv', index_col=None)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         34857 non-null  object 
 1   Address        34857 non-null  object 
 2   Rooms          34857 non-null  int64  
 3   Type           34857 non-null  object 
 4   Price          27247 non-null  float64
 5   Method         34857 non-null  object 
 6   SellerG        34857 non-null  object 
 7   Date           34857 non-null  object 
 8   Distance       34856 non-null  float64
 9   Postcode       34856 non-null  float64
 10  Bedroom2       26640 non-null  float64
 11  Bathroom       26631 non-null  float64
 12  Car            26129 non-null  float64
 13  Landsize       23047 non-null  float64
 14  BuildingArea   13742 non-null  float64
 15  YearBuilt      15551 non-null  float64
 16  CouncilArea    34854 non-null  object 
 17  Lattitude      26881 non-null  float64
 18  Longti

In [None]:
# Some features are missing for a lot of the data.
# Most are floats and some are objects

In [37]:
from sklearn import set_config
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

## Instantiating the Linear Regression

lm = LinearRegression()
set_config(transform_output="pandas")

X = df[['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt']]
y = df['Price']



In [38]:
imputer_median = SimpleImputer(strategy='median', add_indicator=True)
imputer_median.fit(np.array(df['Price']).reshape(-1, 1))  # Impute missing values in y
df[['Price', 'Imputer Price']] = imputer_median.transform(np.array(df['Price']).reshape(-1, 1))

y = df['Price']


In [39]:
# Stratify was taken out since its not relevant to regression.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)

ct = ColumnTransformer(
    transformers=[
    ('imputer_x', imputer_median, ['Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Distance'])
    ],
    remainder='passthrough'
)

pipe_lm = Pipeline([
    ('preprocessor', ct),
    ('classifier', lm)]
)

pipe_lm.fit(X_train, y_train)

predictions = pipe_lm.predict(X_test)

# EVALUATION METRICS
# ==================

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

print("MAE:", mean_absolute_error(y_test, predictions))
print("RMSE:", mean_squared_error(y_test, predictions, squared=False))

MAE: 319031.1196573668
RMSE: 474822.6324385042


In [42]:
## RegressionTreeRegressor

from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor()

pipe_dtr = Pipeline([
    ('preprocessor', ct),
    ('classifier', dtr)]
)

pipe_dtr.fit(X_train, y_train)
predictions = pipe_dtr.predict(X_test)

print("MAE:", mean_absolute_error(y_test, predictions))
print("RMSE:", mean_squared_error(y_test, predictions, squared=False))


MAE: 339198.2873858612
RMSE: 577957.5173757244


In [44]:
## RandomForestRegressor

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor()

pipe_rf = Pipeline([
    ('preprocessor', ct),
    ('classifier', rf)]
)

pipe_rf.fit(X_train, y_train)
predictions = pipe_rf.predict(X_test)

print("MAE:", mean_absolute_error(y_test, predictions))
print("RMSE:", mean_squared_error(y_test, predictions, squared=False))


MAE: 282066.30547410774
RMSE: 455244.8136687109


In [49]:
from sklearn.model_selection import GridSearchCV

params = {
    "classifier__max_depth": range(1, 12),
    "classifier__criterion": ['absolute_error', 'friedman_mse', 'poisson', 'squared_error']
}

opt_rf = GridSearchCV(pipe_rf, params, scoring = 'accuracy', n_jobs = -1, cv = 10, verbose=1, return_train_score=True)

In [50]:
opt_rf.fit(X_train, y_train)
# predictions = opt_rf.predict(X_test)
# print("MAE:", mean_absolute_error(y_test, predictions))
# print("RMSE:", mean_squared_error(y_test, predictions, squared=False))

Fitting 10 folds for each of 44 candidates, totalling 440 fits
