In [14]:
from modules.datakit import *
import statsmodels.formula.api as smf
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error

__Read data__

In [17]:
# Load the pre-processed modelling dataset.
path = 'data_processed/modeling_data.csv'
df = pd.read_csv(path)

# Keep listings added on or before 2025-02-04. The comparison is made against a
# string literal and the column is not parsed as a date when the CSV is read.
# NOTE(review): this is only a correct date filter if `added_dt` holds plain
# ISO `YYYY-MM-DD` strings (no time component) - confirm.
# `.copy()` gives `df` its own data so that later in-place changes do not act
# on a slice of the raw frame (same pattern as the train / out-of-time split).
df = df[df.added_dt.le('2025-02-04')].copy()

# Impute missing values
# NOTE(review): the return value is discarded, so `input_missing_values`
# presumably fills `df` in place - confirm against modules.datakit.
input_missing_values(df)

# One-hot encode the categorical features.

# Districts: one indicator column per district, named after the district
# itself (empty prefix and separator).
district_dummies = pd.get_dummies(df['district'], prefix='', prefix_sep='', dtype=int)

# Remaining categoricals: indicator columns prefixed `heating_`, `bt_`, `ac_`.
categorical_prefixes = {'heating': 'heating', 'building_type': 'bt', 'apartment_class': 'ac'}
df_encoded = pd.get_dummies(
    df,
    columns=list(categorical_prefixes),
    prefix=categorical_prefixes,
    prefix_sep='_',
    dtype=int,
)

# Swap the raw `district` column for its indicator columns.
df = pd.concat(
    [df_encoded.drop(columns='district'), district_dummies],
    axis='columns',
)

# Total rental cost: fold the additional fees into `rent`, then remove the
# separate fee column.
df['rent'] = df['rent'].add(df['additional_fees'])
df = df.drop(columns='additional_fees')

# Data adjustments

# Model target: natural log of the total rent.
df['log_rent'] = np.log(df['rent'])

# Overwrite avg_price values of 200 or more with 0. This runs before the
# indicator below is built, so those rows also end up with avg_price_ge_100 == 0.
# NOTE(review): presumably 200+ marks implausible values - confirm that 0 is
# the intended replacement.
df.loc[df['avg_price'].ge(200), 'avg_price'] = 0

# Binary (0/1) indicators referenced by the interaction terms of the
# regression formula.
df['distance_to_center_le_1km'] = df['distance_to_center'].le(1).astype(int)
df['avg_price_ge_100'] = df['avg_price'].ge(100).astype(int)
df['area_le_60'] = df['area'].le(60).astype(int)
# NOTE(review): the column name says "gt" but the test is `>= 100`, so an area
# of exactly 100 is flagged. Behaviour and name are kept as-is because the
# formula refers to this column; confirm which of the two was intended.
df['area_gt_100m2'] = df['area'].ge(100).astype(int)

# Time-based split: listings dated up to 2025-01-25 are used for training,
# listings dated 2025-01-26 or later form the out-of-time sample.
# NOTE(review): the cut-offs are compared as strings; if `added_dt` carried a
# time component, rows from 2025-01-25 could fall into neither set - confirm
# the column holds plain `YYYY-MM-DD` values.
added_dt = df['added_dt']
train_df = df.loc[added_dt <= '2025-01-25'].copy()
out_of_time_sample = df.loc[added_dt >= '2025-01-26'].copy()

__Build models__

In [20]:
# Log-linear rent model specification. The fragments are concatenated without
# a separator, so each fragment except the last ends with its own trailing '+'.
formula_fragments = [
    # main effects: apartment and building attributes
    'log_rent ~ area + room_number + terrace + separate_kitchen +',
    'utility_room + building_age + air_conditioning + distance_to_subway +',
    # main effects: location, price level, building type and apartment class
    'distance_to_center + avg_price + bt_tenement + ac_basic +',
    'ac_premium + ac_superior + Targowek + Wilanow +',
    # interactions with the binary indicator columns
    'ac_premium:distance_to_center_le_1km + ac_superior:distance_to_center_le_1km +',
    'area:avg_price_ge_100 + area:avg_price_ge_100:area_le_60 +',
    'building_age:area_gt_100m2',
]
formula = ''.join(formula_fragments)

# Fit ordinary least squares on the training sample only.
ols_spec = smf.ols(formula=formula, data=train_df)
model = ols_spec.fit()

# Last expression of the cell: renders the regression summary.
model.summary()

0,1,2,3
Dep. Variable:,log_rent,R-squared:,0.853
Model:,OLS,Adj. R-squared:,0.852
Method:,Least Squares,F-statistic:,990.2
Date:,"Thu, 22 May 2025",Prob (F-statistic):,0.0
Time:,21:20:32,Log-Likelihood:,1616.6
No. Observations:,3610,AIC:,-3189.0
Df Residuals:,3588,BIC:,-3053.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,7.7328,0.028,276.811,0.000,7.678,7.788
area,0.0057,0.000,28.521,0.000,0.005,0.006
room_number,0.1351,0.005,25.677,0.000,0.125,0.145
terrace,0.0432,0.008,5.482,0.000,0.028,0.059
separate_kitchen,-0.0362,0.007,-5.552,0.000,-0.049,-0.023
utility_room,0.0091,0.009,1.015,0.310,-0.008,0.027
building_age,-0.0014,0.000,-10.472,0.000,-0.002,-0.001
air_conditioning,0.0700,0.007,10.533,0.000,0.057,0.083
distance_to_subway,-0.0120,0.002,-6.747,0.000,-0.016,-0.009

0,1,2,3
Omnibus:,386.888,Durbin-Watson:,1.914
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2847.868
Skew:,-0.216,Prob(JB):,0.0
Kurtosis:,7.33,Cond. No.,2070.0


__Out-of-time performance__

In [22]:
# Observed rent on the out-of-time sample, in original (non-log) units.
y_oot = out_of_time_sample['rent']

# The model predicts log_rent; exponentiate to bring predictions back to the
# rent scale so they are comparable with `y_oot`.
log_pred_oot = model.predict(out_of_time_sample)
y_pred_oot = np.exp(log_pred_oot)

In [24]:
# Score the back-transformed predictions against the observed rent.
mae_oot = mean_absolute_error(y_true=y_oot, y_pred=y_pred_oot)
r2_oot = r2_score(y_true=y_oot, y_pred=y_pred_oot)
mape_oot = mean_absolute_percentage_error(y_true=y_oot, y_pred=y_pred_oot)

# Report each metric on its own line, preceded by a header.
print("Out-of-Time Sample Scores:")
for label, score in (("MAE:", mae_oot), ("R2:", r2_oot), ("MAPE:", mape_oot)):
    print(label, score)

Out-of-Time Sample Scores:
MAE: 620.6620603016784
R2: 0.83085038497843
MAPE: 0.12552074413888092
