**MLP for Regression**

A linear regression model has the common form: $y=\alpha + \sum\limits_{i=1}^{n}\beta_i \cdot x_i$

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.utils import plot_model

## Load data

In [None]:
# Read the real-estate data set. The Windows path is a raw string so that the
# backslashes are taken literally instead of being parsed as (invalid) escape
# sequences, which emit a SyntaxWarning on current Python versions.
rawdata = pd.read_csv(r"C:\Code\KI\Moodle\RealEstate (1).csv")
# Treat ART as a categorical variable with an explicit category order
rawdata["ART"] = pd.Categorical(rawdata.ART, ['Flat', 'Attached', 'House'])
# Reorder the columns: column 6 moves to the front, followed by columns 0-5 and 7
# (the later cells read column 0 as the target)
rawdata = rawdata.iloc[:, np.r_[6, 0:6, 7:8]]
rawdata.head()

## Exploratory data analysis

### Correlation matrix

In [None]:
sns.set_theme(style="white")

# Compute the correlation matrix from the numeric columns only. The categorical
# column ART holds string labels; with pandas >= 2.0 the default
# numeric_only=False would try to convert them to floats and raise an error.
corr = rawdata.corr(numeric_only=True)
# Generate a mask for the upper triangle (the matrix is symmetric, so the
# upper half only repeats the lower half)
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio on the axes created above
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1.0, vmax=1.0, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

## Data preprocessing
### Missing values

In [None]:
# Count the missing values per column
#print(rawdata.info())
print(rawdata.isna().sum())

# Three common ways of handling missing values (enable the one that is needed):
# (1) Drop every column that contains missing values
# rawdata = rawdata.dropna(axis=1)

# (2) Drop every row that contains missing values
# rawdata = rawdata.dropna(axis=0)

# (3) Imputation - replace the missing values, here with the column median
#rawdata['PRICE'] = rawdata['PRICE'].fillna(rawdata['PRICE'].median())

### Outliers

In [None]:
# Detect outliers
# Each boxplot gets its own figure; without explicit axes both plots are drawn
# onto the same current axes and overlap each other.
# Pandas boxplot
fig_all, ax_all = plt.subplots()
rawdata.boxplot(grid=False, ax=ax_all)
#rawdata.boxplot(column=['PRICE'], grid=False)
# Seaborn boxplot
sns.set_style("whitegrid")
fig_price, ax_price = plt.subplots()
sns.boxplot(data=rawdata['PRICE'], ax=ax_price)

# Dealing with outliers
# Interquartile range (IQR)
Q1 = rawdata['PRICE'].quantile(q=0.25, interpolation='midpoint') # Q1 = np.percentile(rawdata['PRICE'], q=25, method='midpoint')
Q3 = rawdata['PRICE'].quantile(q=0.75, interpolation='midpoint') # Q3 = np.percentile(rawdata['PRICE'], q=75, method='midpoint')
IQR = Q3 - Q1
print('Interquartile range: ', round(IQR, 2))
# Upper bound: values above Q3 + 1.5 * IQR count as outliers
upper_bound = Q3 + 1.5 * IQR
upper = np.array(rawdata['PRICE'] > upper_bound)
print('Sum of upper values: ', upper.sum())
# Lower bound: values below Q1 - 1.5 * IQR count as outliers
lower_bound = Q1 - 1.5 * IQR
lower = np.array(rawdata['PRICE'] < lower_bound)
print('Sum of lower values: ', lower.sum())
# Setting outliers to boundaries. clip() caps both tails in one step and avoids
# assigning float bounds element-wise into a possibly integer-typed column.
rawdata['PRICE'] = rawdata['PRICE'].clip(lower=lower_bound, upper=upper_bound)

### Dummification

In [None]:
# One-hot encode ART; drop_first=True omits the first category level so that the
# remaining dummy columns are not perfectly collinear.
# dtype=float keeps the whole frame numeric: pandas >= 2.0 creates boolean
# dummies by default, and mixing bool with float columns makes
# rawdata.to_numpy() return an object array.
rawdata = pd.get_dummies(rawdata, columns=["ART"], drop_first=True, dtype=float)
rawdata.head()

### Partition into train and test sets

In [None]:
# Shuffle the rows and hold out a quarter of them as the test partition
# (0.25 is the share scikit-learn uses when no size is given)
train, test = train_test_split(rawdata.to_numpy(), test_size=0.25, shuffle=True)

### Scale data sets

In [None]:
# Split both partitions into features (every column but the first) and
# target (first column, kept two-dimensional as a single-column matrix)
train_X, test_X = train[:, 1:], test[:, 1:]
train_y, test_y = train[:, :1], test[:, :1]

# Learn the scaling parameters from the training set only; X and y get their own
# scaler so that predictions can be transformed back to the original target scale
scaler_X = MinMaxScaler().fit(train_X)
scaler_y = MinMaxScaler().fit(train_y)
train_scaled_X = scaler_X.transform(train_X)
train_scaled_y = scaler_y.transform(train_y)
# The test set is scaled with the parameters learned from the training set
test_scaled_X = scaler_X.transform(test_X)
test_scaled_y = scaler_y.transform(test_y)

## Neural Network Model
### Number of input and output units

In [None]:
def nunits(a):
    """Number of units (features or outcomes)

    Arguments
    ---------
    a : array-like
        A ndarray or any object that can be converted to one (list, tuple,
        DataFrame, ...). Units are expected along the last axis.

    Returns
    -------
        Number of units : int
            The size of the last axis; scalars and one-dimensional input
            count as a single unit.

    """
    # Accept any array-like, not only objects that already expose .shape
    a = np.asarray(a)
    # A scalar or a flat vector describes exactly one feature/outcome
    if a.ndim < 2:
        return 1
    return a.shape[-1]

# Width of the input layer (features) and of the output layer (outcomes)
num_X, num_y = map(nunits, (train_scaled_X, train_scaled_y))
print("Number of features: {0} and number of outcomes: {1}".format(num_X, num_y))

### Model

In [None]:
# Input layer: one node per feature.
# The tensor is named `inputs` so that the builtin input() is not shadowed.
inputs = Input(shape=(num_X,))
# Hidden layers: four fully connected ReLU layers of decreasing width
hidden = Dense(units=128, activation='relu')(inputs)
hidden = Dense(units=64, activation='relu')(hidden)
hidden = Dense(units=32, activation='relu')(hidden)
hidden = Dense(units=16, activation='relu')(hidden)
# Output layer: linear activation, as usual for a regression target
output = Dense(units=num_y, activation='linear')(hidden)

# Entire model
model = Model(inputs=inputs, outputs=output, name='Regression')

# Summarize layers. summary() prints the table itself and returns None,
# so wrapping it in print() would add a stray "None" line.
model.summary()

# Plot model
plot_model(model)

In [None]:
# Training setup: Adam optimizer minimizing the mean squared error,
# with the RMSE reported as an additional metric
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

# Train on the scaled training data, updating the weights after every single sample
model.fit(train_scaled_X, train_scaled_y, batch_size=1, epochs=50)

## Predictions
### Computations

In [None]:
# from deepann.metrics import rmse
def rmse(actuals, preds):
    """Root mean squared error (RMSE)

    Arguments
    ---------
        actuals : vector-like, e.g. list, tuple, array
            A numeric vector with actual values.
        preds : vector-like, e.g. list, tuple, array
            A numeric vector with predicted values.

    Returns
    -------
       Root mean squared error : numpy.float
    """
    # Coerce to float vectors. Flattening matters: a column vector of shape
    # (n, 1) minus a flat vector of shape (n,) would otherwise broadcast to an
    # (n, n) matrix and silently yield a wrong error value.
    actuals = np.asarray(actuals, dtype=float).ravel()
    preds = np.asarray(preds, dtype=float).ravel()
    error = actuals - preds
    mse = np.mean(error**2)
    return np.sqrt(mse)

# Predict on the training data (in-sample) and on the test data (out-of-sample)
train_yhat, test_yhat = model.predict(train_scaled_X), model.predict(test_scaled_X)
# The network was trained on the scaled target, so map the predictions back
# to the original target scale
train_rescaled_yhat = scaler_y.inverse_transform(train_yhat)
test_rescaled_yhat = scaler_y.inverse_transform(test_yhat)
# RMSE on the original scale, rounded to two decimals
rmse_train = round(rmse(actuals=train_y, preds=train_rescaled_yhat), 2)
rmse_test = round(rmse(actuals=test_y, preds=test_rescaled_yhat), 2)
print("RMSE: Train = {0}, test = {1}".format(rmse_train, rmse_test))

### Visualizations

In [None]:
# Actual vs. predicted target values, side by side for both partitions
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
# In-sample plot
axes[0].plot(train_y, label='Actual', linestyle='solid', color='darkblue')
axes[0].plot(train_rescaled_yhat, label='Predicted', linestyle='solid', color='darkred')
axes[0].set_title("In-sample")
axes[0].set_ylabel('PRICE')
axes[0].legend()
# Out-of-sample plot
axes[1].plot(test_y, label='Actual', linestyle='solid', color='darkgreen')
axes[1].plot(test_rescaled_yhat, label='Predicted', linestyle='solid', color='orange')
axes[1].set_title("Out-of-sample")
# Label the y axis here as well, so both panels are annotated consistently
axes[1].set_ylabel('PRICE')
axes[1].legend()
# Keep the y labels of the two panels from overlapping
fig.tight_layout()
# plt.show() instead of fig.show(): the latter warns on non-GUI (inline)
# backends and does not block when the code is run as a script
plt.show()

## Regression analysis

Preferred Python packages for regression analysis are [NumPy](https://numpy.org/), [scikit-learn](https://scikit-learn.org/) and [statsmodels](https://www.statsmodels.org/).

In [None]:
# Ordinary linear regression on the unscaled training data
regmodel = LinearRegression()
regmodel.fit(X=train_X, y=train_y)

# Goodness of fit and estimated parameters, rounded to two decimals
r_sq = round(regmodel.score(train_X, train_y), 2)
print(f"Coefficient of determination: {r_sq}")
print(f"Intercept: {[round(item, 2) for item in regmodel.intercept_.reshape(-1)]}")
# Pair every feature name (all columns after the first one) with its coefficient
coefficients = {
    name: round(coef, 2)
    for name, coef in zip(rawdata.columns.to_list()[1:], regmodel.coef_.reshape(-1))
}
print(f"Coefficients: {coefficients}")

# In-sample predictions and their error
yhat = regmodel.predict(train_X)
print(f"RMSE: {round(rmse(train_y, yhat), 2)}")