# Class Practical Assignment
#### 28.03.2022

Predicting Housing Prices for regions in the USA. The data contains the following columns:
 - 'Avg. Area Income': Avg. Income of residents of the city house is located in;
 - 'Avg. Area House Age': Avg Age of Houses in same city;
 - 'Avg. Area Number of Rooms': Avg Number of Rooms for Houses in same city;
 - 'Avg. Area Number of Bedrooms': Avg Number of Bedrooms for Houses in same city;
 - 'Area Population': Population of city house is located in;
 - 'Price': Price that the house sold at;
 - 'Address': Address for the house.

### ML Algorithms used:
 - Linear regression;
 - Decision Trees;
 - Decision Trees - Bagging;
 - Random Forest;
 - XGBoost;
 - SVM Regressor (SVR).


### References: 
 - https://www.kaggle.com/code/faressayah/practical-introduction-to-10-regression-algorithm#%F0%9F%93%8A-Models-Comparison
 - https://github.com/TheAIFramework/PracticalMachineLearning/blob/Trees/4%20-%20Ensemble%20Methods.ipynb
 - https://github.com/TheAIFramework/PracticalMachineLearning/blob/Trees/3%20-%20Decision%20Trees.ipynb
 - https://github.com/TheAIFramework/PracticalMachineLearning/tree/gradient

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv('Housing_Data.csv')

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.info()

## Data Analysis and Cleaning

In [None]:
# Count the missing (null) values in each column of the dataset
df.isna().sum()

In [None]:
# Dataset size: number of rows ("linhas") and number of columns ("colunas")
print('Nº linhas:', df.shape[0])
print('Nº colunas:', df.shape[1])

The 'Address' column does not add significant information to the dataset, so it is dropped entirely.

In [None]:
# Drop the free-text 'Address' column.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
df = df.drop(columns=['Address'], errors='ignore')

In [None]:
df.head(10)

In [None]:
df.columns

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
def box_plotting(column_name, data=None):
    """Draw a box plot for one column and print its IQR-based outlier summary.

    Parameters
    ----------
    column_name : str
        Name of the column to analyse.
    data : pandas.DataFrame, optional
        Frame that holds the column. Defaults to the notebook-level ``df``.

    Returns
    -------
    dict
        ``{'lower': ..., 'upper': ...}``: the fences located 1.5 * IQR below
        the first quartile and above the third quartile. Values outside these
        fences are counted as outliers.
    """
    frame = df if data is None else data
    values = frame[column_name]

    print("\n - ", column_name)
    # Start a new figure on every call so consecutive plots do not overlap
    plt.figure()
    # Pass the series explicitly as `x=`: a positional argument is interpreted
    # differently across seaborn versions (warning / different orientation)
    sns.boxplot(x=values)

    # Quartiles, median and interquartile range
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    IQR = q3 - q1
    median = values.median()
    print("Quartil 1: ", q1)
    print("Mediana: ", median)
    print("Quartil 3: ", q3)
    print("Interquantil Range (IQR): ", IQR)

    # Lower / upper fences used to flag outliers
    lower = q1 - 1.5*IQR
    upper = q3 + 1.5*IQR
    print("Limite Superior:", upper)
    print("Limite Inferior:", lower)

    # Count the rows that fall outside the fences (nothing is removed here)
    is_outlier = (values < lower) | (values > upper)
    print("Qtd de outliers:", int(is_outlier.sum()))

    return {'lower': lower, 'upper': upper}

In [None]:
# Check for outliers in dataset: one box plot and IQR summary per column.
# Outliers are only reported; no rows are removed from df.
columns = df.columns
for col in columns:
    box_plotting(col)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
plt.rcParams['figure.figsize'] = (15, 8)
sns.heatmap(data=df.corr(), annot=True, cmap='Wistia')
plt.title(label='Correlation Matrix', fontsize=20)
plt.show()

There seems to be a considerable correlation between all features and our target.

In [None]:
df.shape

## Training

In [None]:
# Min-max normalisation (not standardisation): rescale every column of df,
# features and target alike, with MinMaxScaler's default range.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test rows influence the scaling, and 'Price' is scaled as well, so
# the error metrics below are expressed in scaled units. Confirm this is intended.
from sklearn.preprocessing import MinMaxScaler
norm_scale = MinMaxScaler().fit_transform(X=df)
df = pd.DataFrame(data=norm_scale, columns=df.columns)
print(df.head())

In [None]:
df

In [None]:
# Target
y = df['Price']

In [None]:
y

In [None]:
# Features: every column except the target.
# DataFrame.drop returns a new frame, so X no longer aliases df and df keeps
# its 'Price' column; the target cell (y = df['Price']) can be re-run safely.
X = df.drop(columns='Price')

In [None]:
X

In [None]:
# 80/20 train/test split.
# A fixed random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a linear regression on the training split, then predict the test split
lr_model = LinearRegression().fit(X_train, y_train)
lr_pred = lr_model.predict(X=X_test)

In [None]:
lr_pred

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def evaluate(tt, pred):
    """Print and return the MAE, MSE and RMSE of a set of predictions.

    Parameters
    ----------
    tt : array-like
        Ground-truth target values.
    pred : array-like
        Predicted target values, aligned with ``tt``.

    Returns
    -------
    dict
        ``{'mae': ..., 'mse': ..., 'rmse': ...}``.
    """
    mae = mean_absolute_error(tt, pred)
    mse = mean_squared_error(tt, pred)
    # Derive RMSE from MSE instead of mean_squared_error(..., squared=False):
    # the `squared` flag is deprecated and removed in recent scikit-learn releases.
    rmse = mse ** 0.5
    print("MAE:", mae)
    print("MSE:", mse)
    print("RMSE:", rmse)
    return {'mae': mae, 'mse': mse, 'rmse': rmse}

In [None]:
lr_score = evaluate(y_test, lr_pred)

In [None]:
# Linear-regression row of the model comparison table.
# Train metrics are computed as well (instead of being left as None) so this
# row carries the same information as the rows of the other models.
lr_train_score = evaluate(y_train, lr_model.predict(X_train))

lr_res = pd.DataFrame({
    'Model': ['Linear Regression'],
    'Train MAE': lr_train_score['mae'],
    'Train RMSE': lr_train_score['rmse'],
    'Test MAE': lr_score['mae'],
    'Test RMSE': lr_score['rmse']
})

In [None]:
lr_res

In [None]:
# Predicted vs. actual target values on the test split (both min-max scaled)
plt.scatter(y_test, lr_pred)
plt.xlabel('Actual price (scaled)')
plt.ylabel('Predicted price (scaled)')
plt.title('Linear Regression: predicted vs. actual')
plt.show()

##  Decision Trees

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Baseline decision tree with hand-picked depth / leaf-size limits
dtr_model = DecisionTreeRegressor(min_samples_leaf=20, max_depth=15)
dtr_model.fit(X_train, y_train)

# Error on the data the tree was trained on
dtr_pred_train = dtr_model.predict(X_train)
evaluate(tt=y_train, pred=dtr_pred_train)

In [None]:
# Error of the baseline decision tree on the held-out test split
dtr_pred_test = dtr_model.predict(X_test)
evaluate(y_test, dtr_pred_test)

In [None]:
# Grid search over the decision-tree hyper-parameters
# (GridSearchCV is left on its default cross-validation and scoring)
param = {
    'splitter': ('best', 'random'),
    'max_depth': [1, 2, 3, 4, 5, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 35],
}

dtr_gs = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=param)

In [None]:
dtr_gs.fit(X_train, y_train)

In [None]:
dtr_gs.best_estimator_

In [None]:
# Training-split error of the grid-searched decision tree
dtr_pred_train = dtr_gs.predict(X_train)
dtr_train_score = evaluate(y_train, dtr_pred_train)

In [None]:
# Test-split error of the grid-searched decision tree
dtr_pred_test = dtr_gs.predict(X_test)
dtr_test_score = evaluate(y_test, dtr_pred_test)

In [None]:
# Decision-tree row of the model comparison table
dtr_res = pd.DataFrame({
    'Model': ['Decision Trees Regressor'],
    'Train MAE': dtr_train_score['mae'],
    'Train RMSE': dtr_train_score['rmse'],
    'Test MAE': dtr_test_score['mae'],
    'Test RMSE': dtr_test_score['rmse']
})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# way to stack the result rows.
r1 = pd.concat([lr_res, dtr_res], ignore_index=True)

In [None]:
r1

##  Bagging - Decision Trees

In [None]:
from sklearn.ensemble import BaggingClassifier, BaggingRegressor

In [None]:
# Bagging ensemble of depth-limited decision trees.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn and the old name was later
# removed, while the first positional slot works with both.
bag_dtr_model = BaggingRegressor(
    DecisionTreeRegressor(max_depth=10, min_samples_leaf=3),
    verbose=1,
    n_jobs=-1
)

bag_dtr_model.fit(X_train, y_train)

# Error on the training split
bag_dtr_pred_train = bag_dtr_model.predict(X_train)
bag_dtr_train_score = evaluate(y_train, bag_dtr_pred_train)

In [None]:
# Test-split error of the bagging ensemble
bag_dtr_pred_test = bag_dtr_model.predict(X_test)
bag_dtr_test_score = evaluate(y_test, bag_dtr_pred_test)

In [None]:
# Bagging row of the model comparison table
bag_dtr_res = pd.DataFrame({
    'Model': ['Decision Trees Regressor - Bagging'],
    'Train MAE': bag_dtr_train_score['mae'],
    'Train RMSE': bag_dtr_train_score['rmse'],
    'Test MAE': bag_dtr_test_score['mae'],
    'Test RMSE': bag_dtr_test_score['rmse']
})

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# way to stack the result rows.
r2 = pd.concat([r1, bag_dtr_res], ignore_index=True)

In [None]:
r2

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Small random forest (3 trees, depth capped at 10) trained on all CPU cores
rfr_model = RandomForestRegressor(n_estimators=3, max_depth=10, n_jobs=-1, verbose=1)
rfr_model.fit(X_train, y_train)

# Error on the training split
rfr_pred_train = rfr_model.predict(X_train)
rfr_train_score = evaluate(tt=y_train, pred=rfr_pred_train)

In [None]:
# Test-split error of the random forest.
# The score must be computed from the test targets and test predictions; it
# previously re-evaluated the training predictions by mistake.
rfr_pred_test = rfr_model.predict(X_test)
rfr_test_score = evaluate(y_test, rfr_pred_test)

In [None]:
# Grid search over the random-forest depth and leaf-size limits
# (GridSearchCV is left on its default cross-validation and scoring)
param = {
    'max_depth': [1, 2, 3, 4, 5, 10],
    'min_samples_leaf': [2, 3, 4, 5, 10, 15, 20],
}

rfr_gs = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param)

In [None]:
rfr_gs.fit(X_train, y_train)

In [None]:
rfr_gs.best_estimator_

In [None]:
# Training-split error of the grid-searched random forest
rfr_gs_pred_train = rfr_gs.predict(X_train)
rfr_gs_train_score = evaluate(y_train, rfr_gs_pred_train)

In [None]:
# Test-split error of the grid-searched random forest
rfr_gs_pred_test = rfr_gs.predict(X_test)
rfr_gs_test_score = evaluate(y_test, rfr_gs_pred_test)

## XGBOOST

References:

- https://machinelearningmastery.com/xgboost-for-regression/
- https://machinelearningmastery.com/gentle-introduction-gradient-boosting-algorithm-machine-learning/

In [None]:
from xgboost import XGBRegressor
import xgboost as xgb

Xgboost regressor - The most commonly configured hyperparameters are the following:

- n_estimators: The number of trees in the ensemble, often increased until no further improvements are seen.
- max_depth: The maximum depth of each tree, often values are between 1 and 10.
- eta: The learning rate used to weight each model, often set to small values such as 0.3, 0.1, 0.01, or smaller.
- subsample: The fraction of samples (rows) used in each tree, set to a value between 0 and 1, often 1.0 to use all samples.
- colsample_bytree: The fraction of features (columns) used in each tree, set to a value between 0 and 1, often 1.0 to use all features.

In [None]:
# create an xgboost regression model
# Early stopping is monitored on a validation split carved out of the training
# data, so the test set stays untouched until the final evaluation (monitoring
# the test set would leak it into the choice of the number of trees).
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
eval_set = [(X_val, y_val)]

# early_stopping_rounds is an estimator parameter in xgboost >= 1.6; passing
# it to fit() is deprecated and no longer accepted by xgboost 2.x.
xgbr_model = XGBRegressor(n_estimators=1000, max_depth=5, eta=0.01, early_stopping_rounds=10)
xgbr_model.fit(X_fit, y_fit, eval_set=eval_set, verbose=True)

In [None]:
# Training-split error of the XGBoost regressor
xgbr_pred_train = xgbr_model.predict(X_train)
xgbr_train_score = evaluate(y_train, xgbr_pred_train)

In [None]:
# Test-split error of the XGBoost regressor
xgbr_pred_test = xgbr_model.predict(X_test)
xgbr_test_score = evaluate(y_test, xgbr_pred_test)

In [None]:
# Grid search over XGBoost hyper-parameters.
# 'min_samples_leaf' is a scikit-learn tree parameter that XGBoost does not
# use, so it only multiplied the grid size. XGBoost's leaf-size control is
# 'min_child_weight' (minimum sum of instance weight needed in a child).
param = {'max_depth' : [1,2,3,4,5,10],
         'min_child_weight' : [2, 3, 4, 5, 10, 15, 20],
         'eta': [0.01, 0.1, 0.3, 0.2]}

xgbr_gs = GridSearchCV(XGBRegressor(), param)

In [None]:
# Fit the grid search using cross-validation on the training split only.
# No early-stopping eval_set is forwarded: monitoring a fixed external set
# inside every CV fit would leak that set into model selection, and the
# early_stopping_rounds fit() keyword is not accepted by xgboost 2.x.
xgbr_gs.fit(X_train, y_train)

In [None]:
xgbr_gs.best_estimator_

In [None]:
# Training-split error of the grid-searched XGBoost regressor
xgbr_gs_pred_train = xgbr_gs.predict(X_train)
xgbr_gs_train_score = evaluate(y_train, xgbr_gs_pred_train)

In [None]:
# Test-split error of the grid-searched XGBoost regressor
xgbr_gs_pred_test = xgbr_gs.predict(X_test)
xgbr_gs_test_score = evaluate(y_test, xgbr_gs_pred_test)

#### [Testing some cross validation]

In [None]:
from numpy import absolute
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
# XGBoost regressor scored with repeated k-fold cross-validation
model = XGBRegressor(n_estimators=100, max_depth=4)
# 10 folds repeated 3 times with a fixed seed
cv = RepeatedKFold(n_repeats=3, n_splits=10, random_state=1)
# The 'neg_root_mean_squared_error' scorer yields negated RMSE values,
# so take the absolute value to report positive errors
scores = absolute(
    cross_val_score(
        estimator=model,
        X=X_train,
        y=y_train,
        scoring='neg_root_mean_squared_error',
        cv=cv,
        n_jobs=-1,
    )
)

In [None]:
scores

In [None]:
scores.mean()

## SVM Regressor

In [None]:
from sklearn.svm import SVR

In [None]:
# RBF-kernel support vector regressor with a very large C and a narrow epsilon tube
svr_model = SVR(C=1000000, epsilon=0.001, kernel='rbf')
svr_model.fit(X_train, y_train)

# Error on the training split
svr_pred_train = svr_model.predict(X_train)
svr_train_score = evaluate(tt=y_train, pred=svr_pred_train)

In [None]:
# Test-split error of the support vector regressor
svr_pred_test = svr_model.predict(X_test)
svr_test_score = evaluate(y_test, svr_pred_test)