Imports

In [141]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import re
from pymongo import MongoClient
import mysql.connector
from mysql.connector import Error
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from dotenv import load_dotenv



Define methods for connection with databases

In [142]:
load_dotenv()
host = os.getenv("DBHost")
mongoPort = os.getenv("mongoPort")
mySQLPort = os.getenv("mySQLPort")
user = os.getenv("DBUser")
password = os.getenv("DBPass")

def DBTGetMongoConnection(database, collection):
    client = MongoClient('mongodb://'+user+":"+password+"@"+host+":"+str(mongoPort))
    
    db = client[database]
    if collection not in db.list_collection_names():
        return LookupError
    collection = db[collection]
    return collection

def DBTGetMySQLConnection(database):
    try:
        connection = mysql.connector.connect(
            host=host,
            port=mySQLPort,
            database = database,
            user=user,
            password=password
        )

        if connection.is_connected():
            return connection

    except Error as e:
        print("Error while connecting to MySQL", e)

SQL Helpers

In [143]:
# Gets all items from the selected table


def DBTGetAllItemsInSQLTable(cursor, table):
    query = f"SELECT * FROM {table}"
    cursor.execute(query)
    entries = cursor.fetchall()
    columnNames = [desc[0] for desc in cursor.description]
    return pd.DataFrame(entries,columns=columnNames)

# ReEncoder to fix some encoding errors in the database causing mismatches with the kadaster dataset
def DBTReEncodeAddresses(dataframe: pd.DataFrame, columnName):
    dataframe[columnName] = dataframe[columnName].astype(str).apply(lambda x: x.encode('latin-1',"ignore").decode('utf8'))
    return dataframe

def DBTConvertEnergyLabels(dataframe: pd.DataFrame, columnName):
    alphabet = 'abcdefghijklmnopqrstuvwxyz'    
    def get_score(label):
        if label is not None:
            letter = label.lower().replace("+", "")
            score = alphabet.index(letter) + 1 - label.count("+")
            return score
        return 100
    
    dataframe[columnName] = dataframe[columnName].apply(get_score)
    
    return dataframe
        

def DBTConcatMySQLAddresses(dataframe: pd.DataFrame):
    dataframe["adres"] = (
        dataframe["straat"].astype(str) 
        +" "
        +dataframe["huisnummer"].astype(str)
        +dataframe["huisletter"].apply(lambda x: x if x is not None else "").apply(str.upper)
        +dataframe["huisnummertoevoeging"].apply(lambda x: " "+ x if x is not None else "")
    )
    return dataframe

Gather data from MySQL

In [144]:
SQLConnection = DBTGetMySQLConnection("dbDEDSv2")
gebouwData = DBTGetAllItemsInSQLTable(SQLConnection.cursor(),"Gebouw")
prijsData = DBTGetAllItemsInSQLTable(SQLConnection.cursor(),"PrijsBepaling")
# Use the reEncoder to fix some encoding errors in the "straat" column and format the loose address parts into one column called "adres"
# LocatieData = DBTConcatMySQLAddresses(DBTReEncodeAddresses(locatieData,"straat"))
prijsData = pd.merge(prijsData, gebouwData, left_on="gebouw_id", right_on="id", how="left")
prijsData = DBTConvertEnergyLabels(prijsData, "energielabel")
prijsData

Unnamed: 0,id_x,prijs,datum,gebouw_id,id_y,Oppervlakte,energielabel,bouwjaar,locatie_id,woningType_id
0,1,1700000,2023-05-24,230166,230166,189,-2,1919,230173,15.0
1,2,950000,2023-05-24,68564,68564,172,1,1997,68572,15.0
2,3,650000,2023-05-24,77169,77169,149,3,1935,77176,2.0
3,4,419000,2023-05-24,41414,41414,114,1,2003,41421,2.0
4,5,1675000,2023-05-24,254290,254290,268,3,1873,254297,15.0
...,...,...,...,...,...,...,...,...,...,...
16460,16461,142000,2018-01-01,16432,16432,86,100,1938,16432,
16461,16462,129000,2017-01-01,16432,16432,86,100,1938,16432,
16462,16463,124000,2016-01-01,16432,16432,86,100,1938,16432,
16463,16464,122000,2015-01-01,16432,16432,86,100,1938,16432,


Format data for fitting

In [145]:
prijsData['datum'] = prijsData['datum'].str.replace("-","")
prijsData.convert_dtypes()
prijsData = prijsData.drop(columns=['id_x','id_y','prijs','prijs2'],axis=1).dropna()
prijsData['prijs2'] = prijsData['prijs'].astype(str)
x = prijsData
y = prijsData['prijs2']



X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

KeyError: 'prijs2'

Fit the algortihm

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

In [None]:
predictions = model.predict(x)