In [32]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot as plt

In [33]:
data = pd.read_excel('data/flats_to_rent_wue_preprocessed_0407.xlsx')

In [34]:
data.replace('""', np.nan, inplace=True)

In [35]:
data['ZipCode']= data['ZipCode'].astype(str)
data["LivingSpace"] = pd.to_numeric(data["LivingSpace"], errors="coerce")
data["Rooms"] = pd.to_numeric(data["Rooms"], errors="coerce")
data['LivingSpace']= data['LivingSpace'].astype(float)
data['Rooms']= data['Rooms'].astype(float)

data.drop('Unnamed: 0', axis=1, inplace=True)


In [36]:
data.dropna(inplace=True)

In [37]:
data = data.reindex()
data = data.reset_index(drop=True)

y = data['Object_price']
X = data.drop('Object_price', axis=1)

In [38]:
train_X, test_X, train_y, test_y = train_test_split(X, y, 
                                                  train_size=0.8, 
                                                  random_state = 0)
train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, 
                                                  train_size=0.8, 
                                                  random_state = 0)

num_cols = [col for col in train_X.columns if train_X[col].dtype == 'float64']
cat_cols = [col for col in train_X.columns if train_X[col].dtype == 'object']
bin_cols = [col for col in train_X.columns if train_X[col].dtype == 'int64']

In [39]:
print(cat_cols)

['ZipCode', 'EstateType', 'DistributionType']


In [40]:
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_categorical_xgb = encoder.fit_transform(train_X[cat_cols]).toarray()
X_valid_categorical_xgb = encoder.transform(val_X[cat_cols]).toarray()
X_test_categorical_xgb = encoder.transform(test_X[cat_cols]).toarray()
X_train_processed = np.concatenate([train_X[num_cols], X_train_categorical_xgb], axis=1)
X_valid_processed = np.concatenate([val_X[num_cols], X_valid_categorical_xgb], axis=1)
X_test_processed = np.concatenate([test_X[num_cols], X_test_categorical_xgb], axis=1)
X_train_processed = np.concatenate([train_X[bin_cols], X_train_processed], axis=1)
X_valid_processed = np.concatenate([val_X[bin_cols], X_valid_processed], axis=1)
X_test_processed = np.concatenate([test_X[bin_cols], X_test_processed], axis=1)

In [41]:
model = xgb.XGBRegressor(eval_metric=['rmse', 'mae'], early_stopping_rounds=30)
model.fit(X=X_train_processed, 
          y=train_y,
          eval_set=[(X_valid_processed, val_y)],  # Liste mit Validierungsdaten und Zielvariablen
          verbose=True
)

preds= model.predict(X_test_processed)

score_House = mean_absolute_error(test_y, preds)
print("MAE Flat Rent price: {}".format(score_House))


[0]	validation_0-rmse:840.51828	validation_0-mae:721.40019
[1]	validation_0-rmse:654.43103	validation_0-mae:547.48827
[2]	validation_0-rmse:523.88695	validation_0-mae:416.83377
[3]	validation_0-rmse:456.05197	validation_0-mae:338.67250
[4]	validation_0-rmse:397.80292	validation_0-mae:275.64428
[5]	validation_0-rmse:360.42472	validation_0-mae:243.43076
[6]	validation_0-rmse:337.78332	validation_0-mae:227.43111
[7]	validation_0-rmse:316.21329	validation_0-mae:211.04030
[8]	validation_0-rmse:309.88608	validation_0-mae:204.71773
[9]	validation_0-rmse:307.05221	validation_0-mae:202.36452
[10]	validation_0-rmse:312.47071	validation_0-mae:208.32134
[11]	validation_0-rmse:307.04199	validation_0-mae:204.40209
[12]	validation_0-rmse:310.10258	validation_0-mae:208.17123
[13]	validation_0-rmse:307.65755	validation_0-mae:208.55877
[14]	validation_0-rmse:305.80781	validation_0-mae:207.57024
[15]	validation_0-rmse:305.27005	validation_0-mae:207.77153
[16]	validation_0-rmse:304.69467	validation_0-mae:

In [42]:
model2 = lgb.LGBMRegressor(metric=['rmse', 'mae'], early_stopping_rounds=30)
model2.fit(X=X_train_processed, 
          y=train_y,
          eval_set=[(X_valid_processed, val_y)], 
          verbose=True
)

preds= model2.predict(X_test_processed)
score = mean_absolute_error(test_y, preds)

print("MAE: {}".format(score))

[1]	valid_0's rmse: 490.359	valid_0's l1: 336.881
[2]	valid_0's rmse: 469.541	valid_0's l1: 317.102
[3]	valid_0's rmse: 454.514	valid_0's l1: 304.496
[4]	valid_0's rmse: 443.833	valid_0's l1: 298.483
[5]	valid_0's rmse: 436.122	valid_0's l1: 296.878
[6]	valid_0's rmse: 436.064	valid_0's l1: 293.63
[7]	valid_0's rmse: 428.69	valid_0's l1: 290.688
[8]	valid_0's rmse: 423.351	valid_0's l1: 280.909
[9]	valid_0's rmse: 418.479	valid_0's l1: 282.773
[10]	valid_0's rmse: 414.512	valid_0's l1: 276.47
[11]	valid_0's rmse: 412.12	valid_0's l1: 278.738
[12]	valid_0's rmse: 413.312	valid_0's l1: 274.401
[13]	valid_0's rmse: 413.725	valid_0's l1: 270.05
[14]	valid_0's rmse: 411.559	valid_0's l1: 271.238
[15]	valid_0's rmse: 407.049	valid_0's l1: 267.837
[16]	valid_0's rmse: 404.501	valid_0's l1: 264.295
[17]	valid_0's rmse: 406.046	valid_0's l1: 259.266
[18]	valid_0's rmse: 405.155	valid_0's l1: 262.896
[19]	valid_0's rmse: 401.888	valid_0's l1: 260.076
[20]	valid_0's rmse: 401.274	valid_0's l1: 26



In [52]:
prediction1 = model.predict(test_X.iloc[[0]])
print("Prediction: {}".format(prediction1))

ValueError: Feature shape mismatch, expected: 104, got 85