In [36]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import utils.remove_cols as remove_cols
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler

# IMPORT DATA

In [37]:
# Read the source CSV into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer="immoscout_cleaned_lat_lon_fixed_v7.csv")

  df = pd.read_csv("immoscout_cleaned_lat_lon_fixed_v7.csv")


# CLEAN / IMPUTE

# TRANSFORM

In [38]:
# Build two lookup tables for the "type" column:
#   types_nrs  : integer code -> type label
#   types_name : type label   -> integer code
# Codes follow the order in which the labels first appear in the column.
types_nrs = {code: label for code, label in enumerate(df["type"].unique())}
types_name = dict(zip(types_nrs.values(), types_nrs.keys()))
print(types_nrs, types_name)
# Replace every type label by its integer code.
df["type"] = df["type"].map(types_name)

# Keep only the int64 / float64 columns.
df = df.select_dtypes(include=["int64", "float64"])
# Load the column-selection sheet and convert it to a {column: flag} mapping.
# NOTE(review): the exact clean-up done by the remove_cols helpers is not
# visible here -- presumably it normalises the names in "feature"; confirm.
df_remove = remove_cols.remove_chars_space(
    pd.read_excel("fixtures/remove_cols.xlsx"), "feature"
)
dct = remove_cols.get_dct_from_df(df_remove)
# The sheet must list exactly as many entries as there are numeric columns.
assert len(df.columns) == len(dct)
# Keep only the columns whose flag equals 1.
df = df.loc[:, [column for column, flag in dct.items() if flag == 1]]

# Sanity checks: expected number of columns, and at least one row left.
assert df.shape[1] == 30
assert len(df) != 0

{0: 'penthouse', 1: 'terrace-house', 2: 'detached-house', 3: 'flat', 4: 'stepped-house', 5: 'farmhouse', 6: 'semi-detached-house', 7: 'stepped-apartment', 8: 'duplex-maisonette', 9: 'attic-flat', 10: 'loft', 11: 'chalet', 12: 'villa', 13: 'attic-room', 14: 'secondary-suite', 15: 'castle', 16: 'detached-secondary-suite', 17: 'studio', 18: 'furnished-residential-property', 19: 'rustico', 20: 'single-room'} {'penthouse': 0, 'terrace-house': 1, 'detached-house': 2, 'flat': 3, 'stepped-house': 4, 'farmhouse': 5, 'semi-detached-house': 6, 'stepped-apartment': 7, 'duplex-maisonette': 8, 'attic-flat': 9, 'loft': 10, 'chalet': 11, 'villa': 12, 'attic-room': 13, 'secondary-suite': 14, 'castle': 15, 'detached-secondary-suite': 16, 'studio': 17, 'furnished-residential-property': 18, 'rustico': 19, 'single-room': 20}


# PREPARE DATA

In [39]:
# Drop rows containing NaN because the model can't handle them.
df = df.dropna() # 1016 / 13378 = 7.6% of data is lost
# Target y: the encoded "type" column as an (n_samples, 1) column vector.
y = df["type"].values[:, np.newaxis]
# Feature matrix X: every column except "type".
X = df.drop(["type"], axis=1).values
assert isinstance(y, np.ndarray)
assert isinstance(X, np.ndarray)
# Split BEFORE scaling so that test rows never influence the scaler's
# mean/variance (avoids train/test leakage). The fixed random_state makes
# the split reproducible under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Fit the scaler on the training rows only, then apply the same transform
# to every split. X itself stays standardized, as it was before this change,
# in case later cells read it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X = scaler.transform(X)

# FIT, PREDICT, SCORE

In [40]:
# Display the shapes of the train and test feature matrices as a tuple.
tuple(split.shape for split in (X_train, X_test))

((9889, 29), (2473, 29))

In [41]:
import torch

In [42]:
# Convert the train/test splits to float32 tensors.
# torch.as_tensor accepts both numpy arrays and tensors, so this cell can be
# re-run safely; the previous `.astype(...)` call failed on a second run
# because the names were already bound to tensors, which have no `.astype`.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
X_test  = torch.as_tensor(X_test, dtype=torch.float32)
y_train = torch.as_tensor(y_train, dtype=torch.float32)
y_test  = torch.as_tensor(y_test, dtype=torch.float32)
X_train.shape

torch.Size([9889, 29])

In [43]:
# Create Class AutoEncoder
class AutoEncoder(torch.nn.Module):
    """Fully connected autoencoder with a 128-unit code.

    Architecture: n_features -> 128 -> 128 (code) -> 128 -> n_features,
    with ReLU after every layer except the last one.

    Keyword Args:
        n_features (int): number of input columns; also the width of the
            reconstruction returned by ``forward``.

    Raises:
        KeyError: if ``n_features`` is not passed.
    """

    def __init__(self, **kwargs):
        super().__init__()
        # in_features is the number of columns per sample (not the full
        # [n_samples, n_features] shape of the batch).
        n_features = kwargs["n_features"]
        self.encoder_hidden_layer = torch.nn.Linear(
            in_features=n_features, out_features=128
        )
        self.encoder_output_layer = torch.nn.Linear(
            in_features=128, out_features=128
        )
        self.decoder_hidden_layer = torch.nn.Linear(
            in_features=128, out_features=128
        )
        self.decoder_output_layer = torch.nn.Linear(
            in_features=128, out_features=n_features
        )

    def forward(self, features):
        """Encode ``features`` and return their reconstruction.

        Args:
            features: float tensor whose last dimension is ``n_features``.

        Returns:
            Tensor with the same shape as ``features``.
        """
        activation = self.encoder_hidden_layer(features)
        activation = torch.relu(activation)
        code = self.encoder_output_layer(activation)
        code = torch.relu(code)
        activation = self.decoder_hidden_layer(code)
        activation = torch.relu(activation)
        # No activation on the output layer: a trailing ReLU would clamp the
        # reconstruction to >= 0, so negative input values (e.g. standardized
        # features) could never be reproduced.
        reconstructed = self.decoder_output_layer(activation)
        return reconstructed

In [45]:
# Seed torch so the random weight initialisation is reproducible across runs.
torch.manual_seed(42)
# Size the network from the data instead of hard-coding the column count, so
# the cell keeps working if the feature selection changes.
model = AutoEncoder(n_features=X_train.shape[1])
# Adam optimizer over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
# Mean squared error as the training criterion.
criterion = torch.nn.MSELoss()

In [None]:
for epoch in range(1000):
    # Reset the gradients stored on the parameters before the new pass.
    optimizer.zero_grad()
    # forward() requires the feature batch; calling model() without an
    # argument raises TypeError. The training features are passed as input.
    outputs = model(X_train)