In [1]:
import pandas as pd
import numpy as np


In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer

# Data input and transformation

In [130]:
df = pd.read_csv("data/German_rentals.csv")

TypeError: parser_f() got an unexpected keyword argument 'index'

In [4]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

newdf = df.select_dtypes(include=numerics)

In [5]:
num_cols_base = ["baseRent","serviceCharge",'totalRent',"heatingCosts","floor","numberOfFloors","noRooms","livingSpace", "yearConstructed", 
           "noParkSpaces"]

In [6]:
bool_cols_base = ["newlyConst","balcony","hasKitchen","lift","garden"]

In [7]:
cat_cols_base = ["regio1","regio2","regio3",
            "condition","typeOfFlat","interiorQual",
           "heatingType","petsAllowed"]

In [8]:
all_cols = num_cols_base + cat_cols_base + bool_cols_base

In [9]:
df.columns

Index(['regio1', 'serviceCharge', 'heatingType', 'telekomTvOffer',
       'telekomHybridUploadSpeed', 'newlyConst', 'balcony', 'picturecount',
       'pricetrend', 'telekomUploadSpeed', 'totalRent', 'yearConstructed',
       'scoutId', 'noParkSpaces', 'firingTypes', 'hasKitchen', 'geo_bln',
       'cellar', 'yearConstructedRange', 'baseRent', 'houseNumber',
       'livingSpace', 'geo_krs', 'condition', 'interiorQual', 'petsAllowed',
       'street', 'streetPlain', 'lift', 'baseRentRange', 'typeOfFlat',
       'geo_plz', 'noRooms', 'thermalChar', 'floor', 'numberOfFloors',
       'noRoomsRange', 'garden', 'livingSpaceRange', 'regio2', 'regio3',
       'description', 'facilities', 'heatingCosts', 'energyEfficiencyClass',
       'lastRefurbish', 'electricityBasePrice', 'electricityKwhPrice', 'date'],
      dtype='object')

In [10]:
columns_dict = {'totalRent':"total_rent",
                "heatingCosts":"heating_costs",
                'numberOfFloors':"floors_num",
                'noRooms':"rooms",
                'livingSpace':"area", 
                'baseRent':"base_rent",
                'yearConstructed':"build_year", 
                'serviceCharge':"service_charge", 
                'noParkSpaces':"parking_spaces",
                'newlyConst':"newly_constructed", 
                'hasKitchen':"kitchen",
                'regio1':"region",
                'regio2': 'subregion',
                'regio3':"city",
                'typeOfFlat':"type",
                'interiorQual':"interior_quality",
                'petsAllowed':"pets_allowed",
                'heatingType':"heating_type",
              }

In [11]:
df = df[all_cols]

In [12]:
df.rename(columns = columns_dict, inplace=True)

# Basic transformations

## Total Rent

In [13]:
df.isna().sum()

base_rent                 0
service_charge         6909
total_rent            40517
heating_costs        183332
floor                 51309
floors_num            97732
rooms                     0
area                      0
build_year            57045
parking_spaces       175798
region                    0
subregion                 0
city                      0
condition             68489
type                  36614
interior_quality     112665
heating_type          44856
pets_allowed         114573
newly_constructed         0
balcony                   0
kitchen                   0
lift                      0
garden                    0
dtype: int64

In [14]:
def calculate_total_rent(row):
    if np.isnan(row.total_rent):
        row = row.fillna(0)
        total_rent = row.base_rent + row.service_charge + row.heating_costs
        
    else:
        total_rent = row.total_rent
    return(total_rent)
        


In [15]:
df["total_rent"] = df.apply(lambda x:calculate_total_rent(x), axis=1)

In [16]:
df.select_dtypes(include='O').columns

Index(['region', 'subregion', 'city', 'condition', 'type', 'interior_quality',
       'heating_type', 'pets_allowed'],
      dtype='object')

## Handle Booleans

In [17]:
bool_cols = ['newly_constructed', "balcony","kitchen",'lift', 'garden',"pets_allowed"]
num_cols = ['base_rent', 'service_charge', 'total_rent', 'heating_costs', 'floor',
       'floors_num', 'rooms', 'area', 'build_year', 'parking_spaces',
       ]
cat_cols = ['region', 'subregion', 'city', 'condition', 'type', 'interior_quality',
       'heating_type']

In [18]:
def try_dict(key, dictionary):
    key=str(key)
    try:
        output = dictionary[key]
    except:
        output = np.NaN
    return(output)

In [19]:
pets_allowed_dict = {'no':0,
                     'negotiable':1,
                     'yes':1}

In [20]:
bools_dict = {"True":1,
             "False":0}

In [21]:
for column in bool_cols[:-1]:
    df[column] = df[column].apply(lambda x: try_dict(x,bools_dict ))

In [22]:
df.pets_allowed = df.pets_allowed.apply(lambda x: try_dict(x, pets_allowed_dict))

In [23]:
df.type.unique()

array(['ground_floor', 'apartment', 'other', nan, 'roof_storey',
       'raised_ground_floor', 'terraced_flat', 'half_basement',
       'maisonette', 'penthouse', 'loft'], dtype=object)

## Impute missing

In [24]:
df.isna().sum()

base_rent                 0
service_charge         6909
total_rent                0
heating_costs        183332
floor                 51309
floors_num            97732
rooms                     0
area                      0
build_year            57045
parking_spaces       175798
region                    0
subregion                 0
city                      0
condition             68489
type                  36614
interior_quality     112665
heating_type          44856
pets_allowed         114573
newly_constructed         0
balcony                   0
kitchen                   0
lift                      0
garden                    0
dtype: int64

In [25]:
df.parking_spaces.fillna(0,inplace=True)

df.pets_allowed.fillna(0,inplace=True)

In [26]:
imputer_freq_cols = ["floor","floors_num","build_year", "condition", "type","heating_type", "interior_quality" ]

In [27]:
imputer_0_cols = ["parking_spaces", "pets_allowed" ]

In [28]:
imp_0 = SimpleImputer(missing_values=np.nan, strategy="constant",fill_value=0)
imp_freq = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [29]:
%%time
df[imputer_0_cols] = imp_0.fit_transform(df[imputer_0_cols])

Wall time: 95 ms


In [30]:
%%time
df[imputer_freq_cols] = imp_freq.fit_transform(df[imputer_freq_cols ])

Wall time: 5min 52s


In [31]:
df

Unnamed: 0,base_rent,service_charge,total_rent,heating_costs,floor,floors_num,rooms,area,build_year,parking_spaces,...,condition,type,interior_quality,heating_type,pets_allowed,newly_constructed,balcony,kitchen,lift,garden
0,595.0,245.00,840.00,,1.0,3.0,4.0,86.00,1965.0,1.0,...,well_kept,ground_floor,normal,central_heating,0.0,0,0,0,0,1
1,800.0,134.00,934.00,,1.0,3.0,3.0,89.00,1871.0,2.0,...,refurbished,ground_floor,normal,self_contained_central_heating,0.0,0,1,0,0,0
2,965.0,255.00,1300.00,,3.0,4.0,3.0,83.80,2019.0,1.0,...,first_time_use,apartment,sophisticated,floor_heating,0.0,1,1,0,1,0
3,343.0,58.15,488.38,87.23,3.0,3.0,3.0,58.15,1964.0,0.0,...,well_kept,other,normal,district_heating,0.0,0,1,0,0,0
4,765.0,138.00,903.00,,1.0,3.0,3.0,84.97,1950.0,0.0,...,refurbished,apartment,normal,self_contained_central_heating,0.0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268845,820.0,90.00,910.00,,1.0,3.0,3.0,90.00,2016.0,1.0,...,mint_condition,roof_storey,sophisticated,heat_pump,0.0,0,1,0,0,0
268846,930.0,220.00,1150.00,,1.0,1.0,3.5,115.00,1983.0,1.0,...,well_kept,apartment,sophisticated,gas_heating,1.0,0,1,1,0,0
268847,650.0,220.00,930.00,,1.0,2.0,4.0,95.00,1965.0,1.0,...,well_kept,apartment,normal,central_heating,1.0,0,1,0,0,1
268848,840.0,175.00,1015.00,40.00,1.0,5.0,2.0,70.00,2019.0,1.0,...,first_time_use,apartment,sophisticated,heat_pump,0.0,1,1,0,1,0


In [32]:
df.to_csv("German_rentals_transofrmed.csv", index=False)

## Label encoding condition and interior 

In [33]:
df.groupby("condition", as_index=False).total_rent.mean().sort_values(by="total_rent")

Unnamed: 0,condition,total_rent
6,negotiable,643.193152
7,refurbished,690.805382
5,need_of_renovation,694.127821
4,modernized,734.610005
2,fully_renovated,752.330419
9,well_kept,819.081251
8,ripe_for_demolition,912.625
1,first_time_use_after_refurbishment,962.081258
0,first_time_use,1308.926355
3,mint_condition,1901.703602


In [34]:
df.columns

Index(['base_rent', 'service_charge', 'total_rent', 'heating_costs', 'floor',
       'floors_num', 'rooms', 'area', 'build_year', 'parking_spaces', 'region',
       'subregion', 'city', 'condition', 'type', 'interior_quality',
       'heating_type', 'pets_allowed', 'newly_constructed', 'balcony',
       'kitchen', 'lift', 'garden'],
      dtype='object')

In [35]:
interior_dict = {'normal':1,
                 'sophisticated':2,
                 'simple':0, 
                 'luxury':3}

condition_dict = {
                  'first_time_use':5, 
                   'mint_condition':5,    
                   'first_time_use_after_refurbishment':4, 
                   'fully_renovated':4, 
                   'well_kept':3,
                   'modernized':3, 
                   'refurbished':2,
                   'negotiable':1,
                   'need_of_renovation':0, 
                   'ripe_for_demolition':0}


In [36]:
df["condition"] = df.condition.apply(lambda x:condition_dict[x])

In [37]:
df["interior_quality"] = df.interior_quality.apply(lambda x:interior_dict[x])

In [38]:
df.groupby("heating_type").area.count()

heating_type
central_heating                   173833
combined_heat_and_power_plant       1978
district_heating                   24808
electric_heating                     901
floor_heating                      17697
gas_heating                        19955
heat_pump                           2737
night_storage_heater                1341
oil_heating                         5042
self_contained_central_heating     19087
solar_heating                        166
stove_heating                        344
wood_pellet_heating                  961
Name: area, dtype: int64

In [39]:
heating_dict = {'central_heating':'central_heating', 
                'gas_heating':"gas", 
                'self_contained_central_heating':'central_heating',
                'floor_heating':'central_heating',
                'oil_heating':"oil", 
                'district_heating': 'district_heating',
               'night_storage_heater':'central_heating', 
                'heat_pump':"renewables",
               'combined_heat_and_power_plant':'central_heating', 
                'electric_heating':'electric',
               'stove_heating':"solid_fuel", 
                'wood_pellet_heating':"solid_fuel", 
                'solar_heating':"renewables"}

In [40]:
df.heating_type = df.heating_type.apply(lambda x:heating_dict[x])

# One hot encoding

In [41]:
from sklearn.base import BaseEstimator, TransformerMixin

In [42]:
class OneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cat_columns):
        self.cat_columns = cat_columns
    
        

    def fit(self, X, y=None):
        self.one_hot_cols = pd.get_dummies(X[self.cat_columns]).columns


        
        return self
    
    def transform(self, X, y=None):
        
        X_one_hot = pd.get_dummies(X[self.cat_columns])
        

        X_tr = pd.concat([X, X_one_hot], axis=1)
        
        
            
        return X_tr

In [43]:
cat_cols.pop(1)
cat_cols.pop(1)

'city'

In [44]:
df.shape

(268850, 23)

In [45]:
cat_cols

['region', 'condition', 'type', 'interior_quality', 'heating_type']

In [46]:
one_hot_cols = ['region','type']

In [47]:
one_hot_enc = OneHotEncoder(one_hot_cols)

In [48]:
df = one_hot_enc.fit_transform(df)

## Rank encoder

In [49]:
region_map = df.groupby("subregion", as_index=False).total_rent.mean()

In [50]:
region_map["subregion_rank"] = region_map.total_rent.rank(ascending=False)

In [51]:
region_map.sort_values(by="total_rent")

Unnamed: 0,subregion,total_rent,subregion_rank
165,Hoyerswerda,415.105878,419.0
377,Uecker_Randow_Kreis,421.928599,418.0
130,Görlitz_Kreis,425.551884,417.0
292,Plauen,429.965155,416.0
406,Wittenberg_Kreis,434.255409,415.0
...,...,...,...
310,Rhein_Neckar_Kreis,2350.380969,5.0
100,Esslingen_Kreis,2675.739593,4.0
7,Altmarkkreis_Salzwedel,2819.210163,3.0
250,Neustadt_an_der_Weinstraße,109577.285517,2.0


In [52]:
df = df.merge(region_map[["subregion", "subregion_rank"]], on="subregion")

In [53]:
df.to_csv("data/German_rentals_model_input.csv")

In [54]:
df.heating_type.unique()

array(['central_heating', 'gas', 'oil', 'district_heating', 'renewables',
       'electric', 'solid_fuel'], dtype=object)

In [84]:
df

Unnamed: 0,base_rent,service_charge,total_rent,heating_costs,floor,floors_num,rooms,area,build_year,parking_spaces,...,type_ground_floor,type_half_basement,type_loft,type_maisonette,type_other,type_penthouse,type_raised_ground_floor,type_roof_storey,type_terraced_flat,subregion_rank
0,595.00,245.0,840.00,,1.0,3.0,4.0,86.00,1965.0,1.0,...,1,0,0,0,0,0,0,0,0,234.0
1,972.60,215.0,1320.65,43.05,0.0,4.0,3.0,87.00,2018.0,1.0,...,1,0,0,0,0,0,0,0,0,234.0
2,396.80,97.0,493.80,,1.0,3.0,2.0,62.00,1958.0,0.0,...,1,0,0,0,0,0,0,0,0,234.0
3,310.00,100.0,460.00,50.00,0.0,2.0,2.0,55.00,1930.0,0.0,...,1,0,0,0,0,0,0,0,0,234.0
4,301.00,60.0,397.00,36.00,3.0,3.0,1.5,30.07,1979.0,1.0,...,0,0,0,0,0,0,0,0,0,234.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268845,430.00,60.0,570.00,80.00,2.0,2.0,3.0,86.00,1939.0,1.0,...,0,0,0,0,0,0,0,1,0,262.0
268846,760.00,50.0,810.00,,1.0,1.0,3.0,90.00,2019.0,1.0,...,0,0,0,0,0,0,0,0,0,262.0
268847,760.00,50.0,880.00,50.00,1.0,1.0,3.0,90.00,2019.0,1.0,...,0,0,0,0,0,0,0,0,0,262.0
268848,365.00,120.0,485.00,,1.0,2.0,2.0,66.00,2019.0,1.0,...,0,0,0,0,0,0,0,1,0,262.0


In [55]:
df.condition.unique()

array([3, 5, 1, 4, 2, 0], dtype=int64)

## Selecting model data

In [56]:
drop_cols = ['base_rent', 'service_charge', 'total_rent', 'heating_costs',"subregion","city"] + cat_cols

In [112]:
df = df.query("total_rent>100 and total_rent<2000")

In [131]:
df.to_csv("data/German_rentals_model_input.csv", index=False)

In [113]:
X = df.drop(columns = drop_cols )

In [114]:
X.shape

(259200, 39)

In [129]:
df.columns

Index(['base_rent', 'service_charge', 'total_rent', 'heating_costs', 'floor',
       'floors_num', 'rooms', 'area', 'build_year', 'parking_spaces', 'region',
       'subregion', 'city', 'condition', 'type', 'interior_quality',
       'heating_type', 'pets_allowed', 'newly_constructed', 'balcony',
       'kitchen', 'lift', 'garden', 'region_Baden_Württemberg',
       'region_Bayern', 'region_Berlin', 'region_Brandenburg', 'region_Bremen',
       'region_Hamburg', 'region_Hessen', 'region_Mecklenburg_Vorpommern',
       'region_Niedersachsen', 'region_Nordrhein_Westfalen',
       'region_Rheinland_Pfalz', 'region_Saarland', 'region_Sachsen',
       'region_Sachsen_Anhalt', 'region_Schleswig_Holstein',
       'region_Thüringen', 'type_apartment', 'type_ground_floor',
       'type_half_basement', 'type_loft', 'type_maisonette', 'type_other',
       'type_penthouse', 'type_raised_ground_floor', 'type_roof_storey',
       'type_terraced_flat', 'subregion_rank'],
      dtype='object')

In [116]:
y = df.total_rent

In [117]:
#y = np.asarray(y).reshape(-1,1)

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [119]:
X_test.reset_index(drop=True,inplace=True)
X_train.reset_index(drop=True,inplace=True)

In [120]:
y_train

136836    1010.00
6655       505.00
194360     425.00
256085     450.00
257887     550.00
           ...   
268828     607.00
109999     337.00
139253     960.00
154384     517.59
128832     510.00
Name: total_rent, Length: 207360, dtype: float64

# Initial model

In [121]:
rfr = RandomForestRegressor(verbose=True, n_estimators=50)

In [122]:
rfr.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.5min finished


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=50, n_jobs=None, oob_score=False,
                      random_state=None, verbose=True, warm_start=False)

In [123]:
rfr.score(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    5.1s finished


0.9816467501764679

In [124]:
rfr.score(X_test.values, y_test.values)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.2s finished


0.8785751489546832

In [125]:
df.total_rent

0          840.00
1         1320.65
2          493.80
3          460.00
4          397.00
           ...   
268845     570.00
268846     810.00
268847     880.00
268848     485.00
268849     506.36
Name: total_rent, Length: 259200, dtype: float64

In [126]:
from sklearn.metrics import r2_score

In [100]:
y_pred = rfr.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.3s finished


In [101]:
r2_score(y_pred, y_test)

-0.0004267064414489319

In [102]:
rfr

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=50, n_jobs=None, oob_score=False,
                      random_state=None, verbose=True, warm_start=False)

In [103]:
params = {"bootstrap":True, "ccp_alpha":1.0, "max_depth":5, "max_features":5}

In [104]:
rfr.set_params(**params)

RandomForestRegressor(bootstrap=True, ccp_alpha=1.0, criterion='mse',
                      max_depth=5, max_features=5, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=50, n_jobs=None, oob_score=False,
                      random_state=None, verbose=True, warm_start=False)

# Hyper parameter optimization

In [76]:
def train_evaluate_CatBoost(X_train, y_train, X_test, y_test, params):
    model =  model.set_params(**params)
    model.fit(X_train, y_train, eval_set = (X_test.values, y_test), cat_features = cat_cols,)
    score = calculate_rmse(model, X_test, y_test)
    print(score)
    return score