In [1]:
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
import os
import pandas as pd
import matplotlib.pyplot as plt 

In [52]:
y_column_name = "Ständige Wohnbevölkerung Bevölkerungs-dichte1 in Pers./km2"

data = pd.read_excel('C:\\Users\\jonas.hodel\\Downloads\\HSLU\\merged_typologien.xlsx')

indexNames = data[data['Gemeinde'] == 'Kanton Luzern'].index
data.drop(indexNames , axis=0, inplace=True)
data = data.reset_index()
data.drop('index',  axis=1, inplace=True)

data = pd.get_dummies(data, columns=['Gemeindetypologien', 'Gemeinde'])


train = data[data['Jahr'] < 2017]
verification = data[(data['Jahr'] >= 2017) & (data['Jahr'] <= 2018)]
test = data[data['Jahr'] > 2018]

y_train = train[y_column_name]
y_verification = verification[y_column_name]
y_test = test[y_column_name]

x_train = train.drop(y_column_name, axis=1)
x_verification = verification.drop(y_column_name, axis=1)
x_test = test.drop(y_column_name, axis=1)

jahre = list(range(1991,2019))

x_columns = x_train.columns

scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)
x_verification = scaler.transform(x_verification)
x_test = scaler.transform(x_test)

x_train = pd.DataFrame(x_train, columns=x_columns)
x_verification = pd.DataFrame(x_verification, columns=x_columns)
x_test = pd.DataFrame(x_test, columns=x_columns)

In [38]:
gemeinden = data.columns.tolist()[8:]
gemeinden = gemeinden[-1:]
gemeinden_typologien = data.columns.tolist()[3:8]

In [39]:
def evaluate(folder, y, y_pred):
    with open(f'{folder}/metric.txt', 'w') as f:
        f.write(f"R2: {r2_score(y, y_pred)} \n")
        f.write(f"MAPE: {mean_absolute_percentage_error(y, y_pred)}")

In [40]:
def clean_output(folder): 
    if not os.path.exists(folder):
        os.makedirs(folder)
    else: 
        for filename in os.listdir(folder):
            file_path = os.path.join(folder, filename)
            try:
                if os.path.isfile(file_path):
                    os.remove(file_path)
            except Exception as e:
                print(f"Error deleting file: {file_path} - {e}")

In [56]:
models = [
    ('LinearRegression', LinearRegression()), 
    ('DecisionTreeRegressor', DecisionTreeRegressor()), 
    ('RandomForestRegressor', RandomForestRegressor()), 
    ('SVR 100', SVR(C=100)), 
    ('SVR 98', SVR(C=98)), 
    ('SVR 95', SVR(C=95)), 
    ('SVR 90', SVR(C=90)), 
    ('GradientBoostingRegressor', GradientBoostingRegressor())
]

# models = [
#     ('LinearRegression', LinearRegression())
# ]

for name, model in models:
    print("Trying model:", name)

    directory = f'./{name}'

    clean_output(directory)

    model.fit(x_train, y_train)
    y_pred = model.predict(x_verification)

    evaluate(directory, y_verification, y_pred)

    plot_gemeinde(directory, 
        [x_train, x_verification], 
        [pd.Series(y_train, name='y'), pd.Series(y_verification, name='y')], 
        [pd.Series(model.predict(x_train), name='pred'), pd.Series(y_pred, name='pred')]
    )
    
    plot_gemeinde_typologie(directory, x_verification, y_verification, y_pred)

Trying model: LinearRegression
Trying model: DecisionTreeRegressor
Trying model: RandomForestRegressor
Trying model: SVR 100
Trying model: SVR 98
Trying model: SVR 95
Trying model: SVR 90
Trying model: GradientBoostingRegressor


In [55]:
def plot_gemeinde(folder, x_list, y_list, y_pred_list):
    x = pd.concat(x_list, axis=0).reset_index()
    y = pd.concat(y_list, axis=0).reset_index()
    y_pred = pd.concat(y_pred_list, axis=0).reset_index()

    df = pd.concat([pd.DataFrame(x), y, y_pred], axis=1)

    for i in range(len(gemeinden)):

        gemeinde_data = df[df[gemeinden[i]] > 0]

        gemeinde_data_y = gemeinde_data['y']
       
        gemeinde_pred = gemeinde_data['pred']

        plt.figure()
        plt.scatter(jahre, gemeinde_data_y)
        plt.plot(jahre, gemeinde_pred, color="black")
        plt.scatter(jahre[-3:], gemeinde_pred[-3:], color='red')
        plt.scatter(jahre[-5:-3], gemeinde_pred[-5:-3], color='green')
        plt.savefig(f'{folder}/{gemeinden[i]}.png')
        plt.close()


In [9]:
def plot_gemeinde_typologie(folder, x, y, y_pred):
    
    df = pd.concat([pd.DataFrame(x), pd.Series(y, name="y").reset_index(), pd.Series(y_pred, name="pred").reset_index()], axis=1)

    df['Delta'] = df['pred'] - df['y']   

    df['Delta_Percent'] = 100
    df['Delta_Percent'] = (abs(df['Delta']) / df['pred'] ) * 100

    plt.figure()

    violin_parts = plt.violinplot(
        [
            df[df['Gemeindetypologien_Aggloguertel'] > 0]['Delta_Percent'],
            df[df['Gemeindetypologien_Agglokern'] > 0]['Delta_Percent'],
            df[df['Gemeindetypologien_Kern'] > 0]['Delta_Percent'],
            df[df['Gemeindetypologien_Land'] > 0]['Delta_Percent'],
            df[df['Gemeindetypologien_Stadt'] > 0]['Delta_Percent']

        ],
        showmeans=False,
        showmedians=True)

    plt.xticks([1, 2, 3, 4, 5], ['Aggloguertel', 'Agglokern', 'Kern', 'Land', 'Stadt'])

    plt.title("Absolute Percentage error of predictions")
    plt.ylabel('Absolute Percentage Error')
    plt.xlabel('Gemeindekategorien')

    for pc in violin_parts['bodies']:
        pc.set_facecolor('red')
        pc.set_color('red')
        pc.set_edgecolor('black')

    for partname in ('cbars','cmins','cmaxes','cmedians'):
        vp = violin_parts[partname]
        vp.set_edgecolor('red')
        vp.set_linewidth(1)

    plt.savefig(f'{folder}/topologien.png')
    plt.close()


In [62]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

model = LinearRegression()

poly_column_names = x_columns.copy().to_numpy()

for degree in range(1, 5):   
    print("Trying model with degree:", degree)
    directory = f'./PolynomialLinearRegression{degree}'

    clean_output(directory)

    poly = PolynomialFeatures(degree=degree, include_bias=True)

    

    x_train_poly_features = pd.DataFrame(poly.fit_transform(x_train))
    x_verification_poly_features = pd.DataFrame(poly.fit_transform(x_verification))

    poly_column_names = np.concatenate((poly_column_names, [f'poly{degree}{x}' for x in range(x_train_poly_features.shape[1] - len(poly_column_names))]), axis=0)

    x_train_poly_features.columns = poly_column_names
    x_verification_poly_features.columns = poly_column_names


    model.fit(x_train_poly_features, y_train)
    y_pred = model.predict(x_verification_poly_features)

    evaluate(directory, y_verification, y_pred)
    
    plot_gemeinde(directory, 
        [x_train_poly_features, x_verification_poly_features], 
        [pd.Series(y_train, name='y'), pd.Series(y_verification, name='y')], 
        [pd.Series(model.predict(x_train_poly_features), name='pred'), pd.Series(y_pred, name='pred')]
    )

    # plot_gemeinde_typologie(directory, pd.DataFrame(x_train_poly_features, columns=x_columns_poly), y_verification, y_pred)
    

Trying model with degree: 1
87
88
Trying model with degree: 2
88
3916
Trying model with degree: 3
3916
117480
Trying model with degree: 4
117480
2672670


MemoryError: Unable to allocate 41.4 GiB for an array with shape (2080, 2672670) and data type float64