# Linear Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy.spatial.distance import cdist
from sklearn.preprocessing import OneHotEncoder
from pandas.api.types import is_numeric_dtype

In [2]:
# Marker string that stands for a missing value in the CSV file.
na = '<NA>'

df = pd.read_csv(r'../data_file/selected_data.csv')
# Cells equal to the marker are blanked out (they become NaN) ...
df = df.where(df != na)
# ... and the first column of the file is discarded.
df = df.drop(columns = [df.columns[0]])

# Bind every remaining column label to a variable of its own. The unpacking
# requires exactly these eleven columns, in this order.
(
    obj_type,
    rooms,
    surface,
    zip_code_2_digits,
    zip_code_3_digits,
    canton,
    year_built,
    year_renovated,
    distance_to_station,
    price_square_metres,
    price,
) = df.columns

# Regression target: a single-column frame holding the price.
target_frame = df[[price]]

  result = method(y)


#### Helper Functions

In [3]:
def frame_to_numeric(frame):
    """Convert every column of a data frame to a numeric dtype where possible.

    Each column is passed through ``pd.to_numeric``. A column containing at
    least one value that cannot be parsed as a number is left unchanged.

    The frame is modified in place and is also returned.

    Parameters:
    - frame: the data frame to convert

    Returns:
    - the same data frame object, with the convertible columns made numeric
    """
    for column in frame.columns:
        try:
            converted = pd.to_numeric(frame[column])
        except (ValueError, TypeError):
            # The column holds non-numeric content: keep it as it is.
            continue
        # Replace the whole column so it really takes the numeric dtype;
        # writing through ``.loc[:, column]`` can set the values in place and
        # keep the previous (object) dtype.
        frame[column] = converted
    return frame

# Make every convertible column of the loaded data numeric before imputing values.
df = frame_to_numeric(df)

In [4]:
def rooms_surface_mean(df):
    """Return the mean surface for every distinct rooms value.

    Only rows that have both a rooms and a surface value take part in the
    computation. The result is indexed by the rooms value.
    """
    complete_rows = df.loc[:, [rooms, surface]].dropna()
    numeric_rows = frame_to_numeric(complete_rows)
    grouped_by_rooms = numeric_rows.groupby(rooms)
    return grouped_by_rooms.mean()

def add_rooms(row, df_rooms_mean):
    """Return the entry of ``df_rooms_mean`` labelled by the rooms value of ``row``."""
    label = row[rooms]
    return df_rooms_mean.loc[label]

def add_nearest_rooms_by_surface(df):
    """Fill in missing rooms values from the surface of the affected rows.

    For every row without a rooms value, the rooms class whose mean surface
    (see ``rooms_surface_mean``) is closest to the surface of that row is
    chosen and written into the rooms column.

    Parameters:
    - df: data frame containing the rooms and surface columns

    Returns:
    - a re-indexed copy of the data frame with the rooms column completed
    """
    # Work on a fresh 0..n-1 index; the old index is discarded.
    df = df.reset_index(drop = True)
    # Mean surface per rooms value, indexed by the rooms value.
    df_rooms_mean = rooms_surface_mean(df)
    # Rows without a rooms value, reduced to their surface column.
    df_rooms_missing = df[df.loc[:, rooms].isna()].loc[:, [surface]]
    # Distance between every missing row's surface and every class's mean surface.
    # NOTE(review): rows that also lack a surface presumably yield NaN distances here - confirm how they should be handled.
    dist = cdist(df_rooms_missing, df_rooms_mean) #one row per missing object, one column per rooms class
    idx = np.argsort(dist) #class positions sorted by distance (leftmost = nearest)
    rooms_class = idx[:,0] #position of the nearest rooms class
    # Temporarily store the class position in the rooms column.
    df_rooms_missing.loc[:, rooms] = rooms_class
    # Turn the rooms index into a regular column so the class position can be looked up by label.
    df_rooms_mean = df_rooms_mean.reset_index()
    # Translate the class position into the entry of df_rooms_mean found at that position.
    # NOTE(review): add_rooms returns a whole row of df_rooms_mean, so this assignment
    # presumably relies on pandas picking the matching rooms column - confirm.
    df_rooms_missing.loc[:, rooms] = df_rooms_missing.apply(lambda row: add_rooms(row, df_rooms_mean), axis = 1)
    df.loc[df_rooms_missing.index, rooms] = df_rooms_missing.loc[:, rooms] #writes the completed rooms back into the full data frame
    return df

# Complete the missing rooms values using the surface of the affected rows.
df = add_nearest_rooms_by_surface(df)

In [5]:
def add_surface(row, df_rooms_mean):
    """Return the entry of ``df_rooms_mean`` labelled by the surface value of ``row``."""
    label = row[surface]
    return df_rooms_mean.loc[label]

def add_surface_mean_by_nearest_rooms(df):
    """Fill in missing surface values from the rooms value of the affected rows.

    For every row without a surface value, the rooms class closest to the
    rooms value of that row is chosen and the mean surface of that class
    (see ``rooms_surface_mean``) is written into the surface column.

    Parameters:
    - df: data frame containing the rooms and surface columns

    Returns:
    - a re-indexed copy of the data frame with the surface column completed
    """
    # Work on a fresh 0..n-1 index; the old index is discarded.
    df = df.reset_index(drop = True)
    # Mean surface per rooms value, with rooms as a regular column and a positional index.
    df_rooms_mean = rooms_surface_mean(df).reset_index()
    # Rows without a surface value, reduced to their rooms column.
    df_surface_missing = df[df.loc[:, surface].isna()].loc[:, [rooms]]
    # Distance between every missing row's rooms value and every known rooms class.
    dist = cdist(df_surface_missing, df_rooms_mean.loc[:, [rooms]]) #one row per missing object, one column per rooms class
    idx = np.argsort(dist) #class positions sorted by distance (leftmost = nearest)
    surface_class = idx[:,0] #position of the nearest rooms class
    # Temporarily store the class position in the surface column.
    df_surface_missing.loc[:, surface] = surface_class
    # Translate the class position into the entry of df_rooms_mean found at that position.
    # NOTE(review): add_surface returns a whole row of df_rooms_mean, so this assignment
    # presumably relies on pandas picking the matching surface column - confirm.
    df_surface_missing.loc[:, surface] = df_surface_missing.apply(lambda row: add_surface(row, df_rooms_mean), axis = 1)
    # Write the completed surface values back into the full data frame.
    df.loc[df_surface_missing.index, surface] = df_surface_missing.loc[:, surface]
    return df


# Complete the missing surface values using the rooms value of the affected rows.
df = add_surface_mean_by_nearest_rooms(df)

In [6]:
def one_hot_encoding(df, feature_name):
    """Perform a complete one-hot encoding of one feature of the data frame.

    Every unique value of the feature becomes an indicator column of its own:
    - values of a numeric feature are turned into strings prefixed with '_'
    - missing values form a category of their own named feature_name+'_'+na

    The original feature column is removed from the result. The input data
    frame itself is not modified.

    Parameters:
    - df: data frame containing the feature
    - feature_name: label of the column to encode

    Returns:
    - encoded data frame
    - categories of the variable as a sorted array
    """
    df = df.copy()
    column = df[feature_name]
    missing = column.isna()

    # Decide on the prefix from the original dtype, i.e. before the missing
    # marker is filled in: filling first would turn a numeric column into a
    # mix of numbers and strings, which the encoder cannot handle.
    if is_numeric_dtype(column):
        labels = '_' + column.astype(str)
    else:
        labels = column
    if missing.any():
        labels = labels.where(~missing, feature_name + '_' + na)

    enc = OneHotEncoder()
    encoded_array = enc.fit_transform(labels.to_frame()).toarray()
    categories = enc.categories_[0]
    categories_sorted = np.sort(categories)

    # Reuse the index of df: the column assignment below aligns on the index,
    # so a fresh 0..n-1 index would produce NaN for any other input index.
    df_enc = pd.DataFrame(encoded_array, columns = categories, index = df.index)
    df = df.drop(columns = [feature_name])
    df[categories_sorted] = df_enc[categories_sorted]
    return df, categories_sorted

def one_hot_encoding_controller(df, features):
    """One-hot encode several features, one after another, in the given order.

    Returns:
    - the data frame with all given features encoded
    - a dictionary mapping each feature to the categories it was encoded into
    """
    encoded_categories = {}
    for name in features:
        df, labels = one_hot_encoding(df, name)
        encoded_categories[name] = labels
    return df, encoded_categories

In [13]:
def get_selected_df(df, categories = None, features = None):
    """Select columns of an encoded frame by their pre-encoding feature names.

    A feature that is a key of ``categories`` is expanded to the columns listed
    for it; any other feature is taken as a column label itself.

    Parameters:
    - df: the (possibly one-hot encoded) data frame
    - categories: mapping of an encoded feature to its column labels
      (defaults to no encoded features)
    - features: feature names to select, in the wanted column order
      (defaults to no features)

    Returns:
    - data frame holding only the selected columns
    """
    # None stands for "empty" so that no mutable default object is shared between calls.
    categories = {} if categories is None else categories
    features = [] if features is None else features

    columns = []
    for feature in features:
        if feature in categories:
            columns.extend(categories[feature])
        else:
            columns.append(feature)
    return df.loc[:, columns]

def get_prepared_df(df, encoding_features = None, other_features = None):
    """One-hot encode the given features and select the model input columns.

    Parameters:
    - df: source data frame
    - encoding_features: features to one-hot encode (defaults to none)
    - other_features: features to keep as they are (defaults to none)

    Returns:
    - data frame with the columns of ``other_features`` followed by the
      encoded columns of ``encoding_features``
    - dictionary of the encoded categories per encoded feature
    """
    # None stands for "empty" so that no mutable default object is shared between calls.
    encoding_features = [] if encoding_features is None else list(encoding_features)
    other_features = [] if other_features is None else list(other_features)

    df_encoded, categories = one_hot_encoding_controller(df, features = encoding_features)
    all_features = other_features + encoding_features
    df_selected = get_selected_df(df_encoded, categories, features = all_features)
    return df_selected, categories
    
# Model input: surface and rooms as they are, plus the one-hot encoded canton and both zip code prefixes.
df_prepared, categories = get_prepared_df(df, 
                                          encoding_features = [canton, zip_code_2_digits, zip_code_3_digits], 
                                          other_features = [surface, rooms])

## Linear Regression

In [14]:
def launch_regression(x, y):
    """Fit a linear regression of ``y`` on ``x`` and return its score on the same data."""
    model = LinearRegression().fit(x, y)
    return model.score(x, y)
    
# Fit on the prepared features and report the score obtained on that same data.
launch_regression(x = df_prepared, y = target_frame)

0.6905754688752932