# Artificial Neural Network

This notebook aims to build an Artificial Neural Network (NN) that beats an Ordinary Least Squares (OLS) regression.

The comparison metric is the Mean Absolute Percentage Error (MAPE).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
import scipy.sparse
from scipy.spatial.distance import cdist
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from pandas.api.types import is_numeric_dtype
from datetime import datetime
from MLP import MLP

In [2]:
# Textual marker that stands for a missing value in the CSV file.
na = '<NA>'

# Read the data set, discard its first column and turn every '<NA>' marker
# into a real NaN so that pandas treats it as missing.
df = pd.read_csv(r'../data_file/selected_data.csv')
df = df.drop(columns = [df.columns[0]]).replace(na, np.nan)

# Bind every remaining column label to a module-level name; the helper
# functions below refer to the columns through these names. The unpacking
# requires the frame to have exactly these 13 columns in this order.
(
    obj_type,
    rooms,
    surface,
    zip_code_2_digits,
    zip_code_3_digits,
    canton,
    year_built,
    year_renovated,
    distance_to_station,
    lon,
    lat,
    price_square_metres,
    price,
) = df.columns

## Helper Functions

- frame_to_numeric converts each string-formatted column to a numeric type where possible

In [3]:
def frame_to_numeric(frame):
    """Convert every column of ``frame`` to a numeric dtype where possible.

    Each column is passed to ``pd.to_numeric``. A column that cannot be parsed
    as numbers is left exactly as it was.

    Parameters
    ----------
    frame : pandas.DataFrame
        The frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, returned for convenient chaining.
    """
    for column in frame.columns:
        try:
            converted = pd.to_numeric(frame[column])
        except (ValueError, TypeError):
            # Not a numeric column (or not convertible): keep it untouched.
            continue
        # Replace the whole column instead of writing through ``.loc[:, column]``:
        # a ``.loc`` write keeps the old (object) dtype on current pandas
        # versions, so the column would never actually become numeric.
        frame[column] = converted
    return frame

# Apply frame_to_numeric to the loaded data so that every column that can be
# parsed as numbers is numeric from here on.
df = frame_to_numeric(df)

- add_nearest_rooms_by_surface & add_surface_mean_by_nearest_rooms aim to fill missing surface and rooms values in order to have a larger dataset to train the LRM

In [4]:
def rooms_surface_mean(df):
    """Return the mean surface for every distinct rooms value.

    Only rows in which both the rooms and the surface column are present are
    used. The result is indexed by the rooms value.
    """
    complete_pairs = df.loc[:, [rooms, surface]].dropna()
    numeric_pairs = frame_to_numeric(complete_pairs)
    grouped_by_rooms = numeric_pairs.groupby(rooms)
    return grouped_by_rooms.mean()

def add_rooms(row, df_rooms_mean):
    """Look up the entry of ``df_rooms_mean`` labelled by the row's rooms value.

    The value stored in the rooms field of ``row`` is used as a label for
    ``df_rooms_mean.loc``; whatever that lookup yields is returned unchanged.
    """
    lookup_label = row[rooms]
    return df_rooms_mean.loc[lookup_label]

def add_nearest_rooms_by_surface(df):
    """Fill missing rooms values with the rooms class whose mean surface is nearest.

    For every row without a rooms value, the distance between the row's surface
    and the mean surface of each rooms class is computed; the nearest class is
    used to fill the gap.

    The index of ``df`` is reset, so the returned frame always carries a fresh
    0..n-1 index. The frame passed in is not modified.

    Returns the frame with the rooms column completed.
    """
    df = df.reset_index(drop = True)
    # mean surface per rooms class (index: rooms value, single column: surface)
    df_rooms_mean = rooms_surface_mean(df)
    # rows whose rooms value is missing; only their surface column is kept
    # NOTE(review): a row that also lacks its surface would feed NaN into cdist -- confirm such rows cannot occur
    df_rooms_missing = df[df.loc[:, rooms].isna()].loc[:, [surface]]
    dist = cdist(df_rooms_missing, df_rooms_mean) #computes the distance between each pair of surface
    idx = np.argsort(dist) #gets sorted index (most left = lowest distance index)
    rooms_class = idx[:,0] #gets the room class (index)
    # at this point the rooms column holds the POSITION of the nearest class, not its rooms value
    df_rooms_missing.loc[:, rooms] = rooms_class
    # after reset_index, position i of the class table is also its label i, and rooms is a regular column
    df_rooms_mean = df_rooms_mean.reset_index()
    # translate each stored position through the class table
    # NOTE(review): add_rooms returns the whole matching row (rooms and surface), not only the rooms value --
    # verify that this assignment stores the intended rooms value
    df_rooms_missing.loc[:, rooms] = df_rooms_missing.apply(lambda row: add_rooms(row, df_rooms_mean), axis = 1)
    df.loc[df_rooms_missing.index, rooms] = df_rooms_missing.loc[:, rooms] #appends the missing rooms to the real data-frame
    return df

In [5]:
def add_surface(row, df_rooms_mean):
    """Look up the entry of ``df_rooms_mean`` labelled by the row's surface value.

    The value stored in the surface field of ``row`` is used as a label for
    ``df_rooms_mean.loc``; whatever that lookup yields is returned unchanged.
    """
    lookup_label = row[surface]
    return df_rooms_mean.loc[lookup_label]

def add_surface_mean_by_nearest_rooms(df):
    """Fill missing surface values using the rooms class nearest to the row's rooms value.

    For every row without a surface value, the distance between the row's rooms
    value and the rooms value of each class is computed; the nearest class is
    then looked up in the table of per-class mean surfaces.

    The index of ``df`` is reset, so the returned frame always carries a fresh
    0..n-1 index. The frame passed in is not modified.

    Returns the frame with the surface column completed.
    """
    df = df.reset_index(drop = True)
    # class table with a 0..m-1 index and the columns rooms and (mean) surface
    df_rooms_mean = rooms_surface_mean(df).reset_index()
    # rows whose surface value is missing; only their rooms column is kept
    # NOTE(review): a row that also lacks its rooms value would feed NaN into cdist -- confirm such rows cannot occur
    df_surface_missing = df[df.loc[:, surface].isna()].loc[:, [rooms]]
    dist = cdist(df_surface_missing, df_rooms_mean.loc[:, [rooms]]) #computes the distance between each pair of rooms
    idx = np.argsort(dist) #gets sorted index (most left = lowest distance index)
    surface_class = idx[:,0] #gets the surface class (index)
    # at this point the surface column holds the POSITION of the nearest class, not a surface
    df_surface_missing.loc[:, surface] = surface_class
    # translate each stored position through the class table
    # NOTE(review): add_surface returns the whole matching row (rooms and surface), not only the mean surface --
    # verify that this assignment stores the intended surface value
    df_surface_missing.loc[:, surface] = df_surface_missing.apply(lambda row: add_surface(row, df_rooms_mean), axis = 1)
    # write the completed values back into the full frame
    df.loc[df_surface_missing.index, surface] = df_surface_missing.loc[:, surface]
    return df

- handle_missing_numeric_feature fills the missing values of a numeric feature with the feature's median and adds an indicator feature which has the value 1 if the value was missing and 0 if it was not

In [6]:
def handle_missing_numeric_feature(df, feature, na):
    """Impute the missing values of a numeric feature and flag the imputed rows.

    Missing values of ``feature`` are replaced by the median of its present
    values. A new column named ``feature + '_' + na`` is inserted directly
    after ``feature``; it holds 1.0 where the value was missing and 0.0
    otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature``. It is modified in place.
    feature : str
        Name of the numeric column to complete.
    na : str
        Suffix used to build the name of the indicator column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the imputed feature and the new indicator.

    Raises
    ------
    ValueError
        If the indicator column already exists (raised by ``DataFrame.insert``).
    """
    missing_mask = df[feature].isna()
    # Compute the median before filling so that it only reflects real values.
    feature_median = df[feature].median()

    flag_position = df.columns.get_loc(feature) + 1
    df.insert(flag_position, feature + '_' + na, missing_mask.astype(float).to_numpy())

    # A positional boolean mask is used instead of index labels: with repeated
    # index labels a label-based write would also overwrite valid values that
    # share a label with a missing one.
    if missing_mask.any():
        df.loc[missing_mask.to_numpy(), feature] = feature_median
    return df

def missing_numerical_features_controller(df, features, na):
    """Run :func:`handle_missing_numeric_feature` once for every name in ``features``.

    The frame produced by one call is handed to the next one; the frame
    produced by the last call is returned (``df`` itself when ``features`` is
    empty).
    """
    completed = df
    for feature_name in features:
        completed = handle_missing_numeric_feature(completed, feature_name, na)
    return completed

- one_hot_encoding transforms categorical features to numeric ones with the one-hot encoding method because the NN model only works with numerical features

In [7]:
def one_hot_encoding(df, na, feature_name, new_encoding = False):
    """One-hot encode ``feature_name`` and replace it by one indicator column per category.

    Every category is turned into a column named ``feature_name + '_' + value``.
    Missing values form their own category named ``feature_name + '_' + na``.

    The encoded matrix is cached in ``./enc/<feature_name>.npz`` and the
    category names in ``./cat/categories.pkl``. The cache is reused when it
    exists and its shape fits the frame. Set ``new_encoding = True`` whenever
    the rows are not the same, in the same order, as when the cache was
    written -- otherwise a stale encoding is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature_name``. It is not modified.
    na : str
        Label used for the category of missing values.
    feature_name : str
        Name of the categorical column to encode.
    new_encoding : bool, default False
        Discard any cached encoding of this feature and encode again.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        A copy of ``df`` in which ``feature_name`` is replaced by the indicator
        columns, and the sorted array of the generated category names.
    """
    df = df.copy()
    delimiter = feature_name + '_'
    enc_dir = './enc'
    cat_dir = './cat'
    enc_path = enc_dir + '/' + feature_name + '.npz'
    cat_path = cat_dir + '/categories.pkl'
    encoding = new_encoding
    sparse_matrix = None
    category = None

    # A missing cache directory means there is nothing to reuse.
    for directory in (enc_dir, cat_dir):
        if not os.path.exists(directory):
            encoding = True
            os.makedirs(directory)

    # Load the dictionary with the categories of all features (best effort:
    # an absent or unreadable file simply means "no cached categories").
    try:
        all_cat = load_obj(cat_path)
    except Exception:
        all_cat = {}
    if not isinstance(all_cat, dict):
        all_cat = {}

    # Drop the cached encoding of this feature when a fresh one is requested.
    if new_encoding:
        try:
            os.remove(enc_path)
        except OSError:
            pass  # nothing cached for this feature yet
        all_cat.pop(feature_name, None)

    # Try to reuse the cached sparse matrix and its category names.
    if not encoding:
        try:
            sparse_matrix = scipy.sparse.load_npz(enc_path)
        except Exception:
            sparse_matrix = None  # no usable cache file
        category = all_cat.get(feature_name)
        # The cache is only usable when it has one row per frame row and one
        # column per cached category name.
        if sparse_matrix is not None and (
            category is None
            or sparse_matrix.shape[0] != df.shape[0]
            or sparse_matrix.shape[1] != len(category)
        ):
            sparse_matrix = None

    # Perform the actual encoding if necessary.
    if (sparse_matrix is None) or (category is None):
        values = df[feature_name].astype(object)
        # Missing values become the plain ``na`` label, so that the prefixed
        # category is ``feature_name + '_' + na`` as documented (previously the
        # feature name ended up in the column name twice).
        values = values.where(values.notna(), na).astype(str)
        selected_frame = (delimiter + values).to_frame()
        enc = OneHotEncoder()
        sparse_matrix = enc.fit_transform(selected_frame)
        category = enc.categories_[0]
        all_cat[feature_name] = category
        scipy.sparse.save_npz(enc_path, sparse_matrix)
        save_obj(all_cat, cat_path)

    # Add the encoded matrix to the frame. The encoded frame must carry the
    # index of ``df``: the column assignment below aligns on index labels, so a
    # fresh 0..n-1 index would produce NaNs for any frame with another index.
    encoded_array = sparse_matrix.toarray()
    df_enc = pd.DataFrame(data = encoded_array, columns = category, index = df.index)
    category_sorted = np.sort(category)
    df_enc = df_enc[category_sorted]
    df = df.drop(columns = [feature_name])
    df[category_sorted] = df_enc
    return df, category_sorted

def save_obj(obj, path):
    """Pickle ``obj`` into the file ``path``.

    The suffix '.pkl' is appended to ``path`` unless ``path`` is at least five
    characters long and already ends with it. The highest available pickle
    protocol is used.
    """
    already_suffixed = len(path) >= 5 and path[-4:] == '.pkl'
    target = path if already_suffixed else path + '.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    """Return the object pickled in the file ``path``.

    The suffix '.pkl' is appended to ``path`` unless ``path`` is at least five
    characters long and already ends with it.
    """
    already_suffixed = len(path) >= 5 and path[-4:] == '.pkl'
    source = path if already_suffixed else path + '.pkl'
    with open(source, 'rb') as handle:
        return pickle.load(handle)

def get_prepared_df(df, na, target_variable = None, categorical_features = [], numerical_features = [], additional_features = [], new_encoding = False):
    """Build the model input frame from the selected features.

    The numerical, categorical and additional features are selected (in that
    order), the numerical ones are normalized and every categorical one is
    one-hot encoded. When ``target_variable`` is given, that column of ``df``
    is attached at the end.

    Set ``new_encoding`` = True if the data are not in the same order as
    before, otherwise a stale cached encoding is returned.

    Returns the prepared frame and a dict that maps every categorical feature
    to the categories generated for it.
    """
    wanted_columns = numerical_features + categorical_features + additional_features
    prepared = normalize_df(df.loc[:, wanted_columns], numerical_features)

    categories = {}
    for name in categorical_features:
        prepared, generated = one_hot_encoding(prepared, na, name, new_encoding)
        categories[name] = generated

    if target_variable is not None:
        prepared[target_variable] = df.loc[:, target_variable]
    return prepared, categories

def normalize_df(df, features):
    """Standardize the given numerical columns to zero mean and unit variance.

    Every column listed in ``features`` is replaced by
    ``(x - mean(x)) / std(x)``, using the population standard deviation
    (``np.std`` default). A column without any variation is only centred (it
    becomes all zeros) instead of being divided by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns. It is modified in place.
    features : list of str
        Names of the numerical columns to standardize; may be empty.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the standardized columns.
    """
    features = list(features)
    if not features:
        return df  # nothing to normalize
    # Cast to float explicitly: the in-place arithmetic used before raised a
    # casting error as soon as all selected columns were integers.
    X = df.loc[:, features].to_numpy(dtype = float)
    X = X - np.mean(X, axis = 0)
    std = np.std(X, axis = 0)
    std[std == 0] = 1.0  # constant column: leave the centred zeros untouched
    df[features] = X / std
    return df

def mape(y_true, y_pred):
    """Calculate the mean absolute percentage error (MAPE) of a continuous prediction.

    Parameters
    ----------
    y_true : array-like
        Observed values (list, numpy array or pandas Series).
    y_pred : array-like
        Predicted values, compared with ``y_true`` position by position.

    Returns
    -------
    float
        ``mean(|(y_true - y_pred) / y_true|) * 100``. Entries of ``y_true``
        that are zero make the result infinite or NaN.
    """
    # Convert to plain float arrays first: this makes lists usable and compares
    # pandas objects by position instead of aligning them on index labels
    # (label alignment yields NaN whenever the two indexes differ).
    y_true = np.asarray(y_true, dtype = float)
    y_pred = np.asarray(y_pred, dtype = float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def load_mlp_res(path):
    """Return the table of stored MLP results.

    When ``path`` exists, the object pickled there is loaded and returned.
    Otherwise an empty frame with the result columns is returned.
    """
    if os.path.exists(path):
        return load_obj(path)
    result_columns = ['features', 'layers', 'learning_rate', 'alpha', 'batch_size', 'residuals', 'MAPE']
    return pd.DataFrame(columns = result_columns)

def select_df(df, na, selected_features, missing_values_included = False, reset_idx = True):
    """Select the rows and columns that are used for modelling.

    With ``missing_values_included = False`` (default) only
    ``selected_features`` are kept and every row with a missing value is
    dropped. The year_renovated column is the exception: its gaps do not
    remove a row; they are imputed and flagged by
    :func:`handle_missing_numeric_feature`. In that case year_renovated and
    its flag column end up as the last columns.

    With ``missing_values_included = True`` the missing rooms and surface
    values are filled by the nearest-neighbour helpers instead.
    NOTE(review): that branch neither restricts the frame to
    ``selected_features`` nor drops or imputes other missing values --
    confirm that this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. It is not modified.
    na : str
        Suffix of the flag column added for imputed values.
    selected_features : list of str
        Columns to keep.
    missing_values_included : bool, default False
        See above.
    reset_idx : bool, default True
        Reset the index of the result to 0..n-1 (default branch only; the
        nearest-neighbour helpers always reset it).

    Returns
    -------
    (pandas.DataFrame, list of str)
        The selected frame and the names of the columns that were added to it,
        in column order.
    """
    df = df.copy()
    cols_in = set(df.columns)
    if missing_values_included:
        df = add_nearest_rooms_by_surface(df)
        df = add_surface_mean_by_nearest_rooms(df)
    else:
        df = df.loc[:, selected_features]
        if year_renovated in df.columns:
            # Keep the rows that are complete in every column except
            # year_renovated. A boolean row mask is used so that this works
            # for any index; treating index labels as positions (``.iloc``)
            # is only correct for a default 0..n-1 index.
            other_columns = [column for column in df.columns if column != year_renovated]
            complete_rows = df.loc[:, other_columns].notna().all(axis = 1).to_numpy()
            df = df.loc[complete_rows, other_columns + [year_renovated]].copy()
            df = handle_missing_numeric_feature(df, year_renovated, na)
        else:
            df = df.dropna()
        if reset_idx:
            df = df.reset_index(drop = True)
    # Columns that did not exist in the input, reported in column order.
    add_features = [column for column in df.columns if column not in cols_in]
    return df, add_features

## Neural Network

In [8]:
def launch_NN(X_train, y_train, X_test, y_test, features, layers = (80,), learning_rate = 0.01, alpha = 0.001, batch_size = 'none', max_iter = 1000, plot_error = True):
    """Train an MLP, evaluate it on the test split and record the result.

    The outcome of the run is appended to the result table stored in
    './mlp_res.pkl' and the fitted network is stored in './NN_1.pkl'.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices of the training and the test split.
    y_train, y_test : pandas.DataFrame
        Target frames; their first column is used as the target vector.
    features : list
        Names of the features used, stored with the result for reference.
    layers, learning_rate, alpha, batch_size, max_iter, plot_error
        Settings handed to ``MLP``. The defaults are the values that were
        previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        The result table including the row of this run.
    """
    mlp = MLP(layers, learning_rate = learning_rate, alpha = alpha, batch_size = batch_size, max_iter = max_iter, plot_error = plot_error)
    X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
    # The targets arrive as one-column frames; take that column as a 1-d array.
    y_train, y_test = y_train.to_numpy().T[0], y_test.to_numpy().T[0]
    mlp.fit(X_train, y_train)
    predicted = mlp.predict(X_test)

    m = mape(y_test, predicted)
    residuals = y_test - predicted

    df_mlp_path = './mlp_res.pkl'
    result = [features, layers, learning_rate, alpha, batch_size, residuals, m]
    df_mlp = load_mlp_res(df_mlp_path)
    # ``DataFrame.append`` was removed in pandas 2.0; build a one-row frame and
    # concatenate it instead.
    new_row = pd.Series(result, index = df_mlp.columns).to_frame().T
    if df_mlp.empty:
        df_mlp = new_row
    else:
        df_mlp = pd.concat([df_mlp, new_row], ignore_index = True)
    save_obj(df_mlp, df_mlp_path)

    mlp.store('./NN_1.pkl')
    return df_mlp
 

### MLP Validation

In [9]:
def init(df, na, missing_values_included = False):
    """Run the whole pipeline: select and prepare the data, split it and train the MLP.

    The model uses the categorical features obj_type, canton and
    zip_code_2_digits and the numerical features surface, rooms,
    distance_to_station, year_renovated and year_built; the target is price.
    25 % of the rows are held out for testing (fixed random state 42).

    Returns the result table produced by :func:`launch_NN`.
    """
    # feature groups
    categorical = [obj_type, canton, zip_code_2_digits]
    complete_numeric = [surface, rooms, distance_to_station]
    incomplete_numeric = [year_renovated, year_built]
    target = [price]
    numeric = complete_numeric + incomplete_numeric
    features = categorical + complete_numeric + incomplete_numeric
    selected_features = features + target

    # data preparation
    selected, extra_features = select_df(df, na, selected_features, missing_values_included)
    prepared, _categories = get_prepared_df(selected, na, target[0], categorical, numeric, extra_features, new_encoding = True)
    df_y = prepared.loc[:, target]
    df_X = prepared.drop(target, axis = 1)

    # split, train and evaluate
    X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size = 0.25, random_state = 42)
    return launch_NN(X_train, y_train, X_test, y_test, features)
    
# Run the pipeline with the default missing_values_included = False; the
# result table returned by init is the value displayed by this cell.
init(df, na)

MLP(layers=(80,), alpha=0.001, learning_rate=0.01)


FigureWidget({
    'data': [{'type': 'scatter', 'uid': 'b9ed746c-0691-44bc-ba7d-1d21414f0e0d', 'y': []}],
    …

num its = 990 		 normgrad = 2303.1697010710923 		 cost = 1288384485.9444222
fit completed!
number of iterations:  999
max iterations:  1000
norm(grad):  1321.2644056111449
cost:  1194599565.1315658


Unnamed: 0,features,layers,learning_rate,alpha,batch_size,residuals,MAPE
0,"[obj_type, canton, zip_code_2_digits, surface,...","(80,)",0.01,0.0,none,"[-244.48175387548167, -307.262058828574, -116....",11.849378
1,"[obj_type, canton, zip_code_2_digits, surface,...","(80,)",0.01,0.0,none,"[-203.2536971220245, 6.011481906470408, -210.0...",11.435508
2,"[obj_type, canton, zip_code_2_digits, surface,...","(80,)",0.01,0.0,none,"[-216.5675113669447, -159.86110521480646, -101...",11.308059
3,"[obj_type, canton, zip_code_2_digits, surface,...","(80,)",0.01,0.001,none,"[-179.822233826912, -36.175313266719286, -339....",11.768765
4,"[obj_type, canton, zip_code_2_digits, surface,...","(80,)",0.01,0.001,none,"[-251.0008088320269, -175.47334544745172, -296...",11.352812
5,"[obj_type, canton, zip_code_2_digits, surface,...","(80,)",0.01,0.001,none,"[-212.62448131932172, 58.46136521125368, -109....",10.606158
