In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error
project_root = os.path.abspath("..") 
if project_root not in sys.path:
    sys.path.append(project_root)
from src.preprocessing import clean_data
from src.baseline_model import train_baseline_model, evaluate_model



# Detect current working directory and move up to project root if needed
project_root = os.path.abspath("..") 
if project_root not in sys.path:
    sys.path.append(project_root)

df = pd.read_csv("../data/train.csv")
print("✅ Loaded dataset:", df.shape)

df_clean = clean_data(df)

✅ Loaded dataset: (346, 50)
Dataframe before cleaning:  (346, 50)
Cleaning Data....
Only regarding livingSpace > 10 m² & < 1000m² Shape:  (343, 50)
Missing Values serviceCharge 19
Missing values serviceCharge after filling: 0
Removing columns with to many NaNs
Dataframe nach dem Cleanen:  (343, 30)


In [4]:
results = []

for col in df_clean:
    if col == 'totalRent': continue

    df_tmp = df_clean.dropna(subset=[col, "totalRent"])
    X = df_tmp[[col]].to_numpy()
    y = df_tmp['totalRent'].to_numpy()
    model_full = LinearRegression().fit(X, y)
    r2_full = r2_score(y, model_full.predict(X))
    results.append((col, r2_full))

results.sort(key=lambda x: x[1], reverse=True)
print(f"{'Feature':<25} | {'R² Score':<10}")
print("-" * 40)
for col, score in results:
    print(f"{col:<25} | {score:.4f}")


Feature                   | R² Score  
----------------------------------------
baseRent                  | 0.9783
baseRentRange             | 0.8053
livingSpace               | 0.7847
americanArea              | 0.7847
livingSpaceRange          | 0.7477
serviceCharge             | 0.5736
noRooms                   | 0.5729
noRoomsRange              | 0.5506
picturecount              | 0.1471
condition_num             | 0.1053
balcony_num               | 0.0913
yearConstructedRange      | 0.0494
thermalChar               | 0.0428
newlyConst_num            | 0.0394
geo_plz                   | 0.0321
cellar_num                | 0.0213
lift_num                  | 0.0197
garden_num                | 0.0191
hasKitchen_num            | 0.0179
lastRefurbish             | 0.0149
numberOfFloors            | 0.0112
scoutId                   | 0.0107
yearConstructed           | 0.0075
floor                     | 0.0065
heatingType_num           | 0.0052
interiorQual_num          | 0.0036
telekomUpl

Iteratives Hinzufügen von Features in Regressionsmodell
Es wird jeweils das Feature genutzt welches im den größten neuen Mehrwert bringt

In [5]:
feature_cols_test = ['livingSpace','picturecount']
target_col = 'totalRent'

df_tmp_test = df_clean.dropna(subset=feature_cols_test + [target_col])
X_train_test = df_tmp_test[feature_cols_test].values
y_train_test = df_tmp_test[target_col].values

model_multi_test = LinearRegression()
model_multi_test.fit(X_train_test, y_train_test)

y_train_pred_test = model_multi_test.predict(X_train_test)
train_rmse_test = np.sqrt(root_mean_squared_error(y_train_test, y_train_pred_test))
train_r2_test = r2_score(y_train_test, y_train_pred_test)

print(f"R²: {train_r2_test}")
print(f"RMSE: {train_rmse_test}")


R²: 0.7855157549577536
RMSE: 15.381831798535426


In [28]:
def make_feature_col(feature_cols):
    
    possible_features = []
    for col_name, score in results:
        if col_name in ['serviceCharge','baseRent'] + feature_cols:
            #print(f"{col_name} already in feature_cols or forbidden")
            continue
        else:       
            feature_cols_tmp = feature_cols + [col_name]
            df_tmp = df_clean.dropna(subset=feature_cols_tmp + [target_col])
            X_train = df_tmp[feature_cols_tmp].values
            y_train = df_tmp[target_col].values

            model_multi = LinearRegression()
            model_multi.fit(X_train, y_train)

            y_train_pred = model_multi.predict(X_train)
            train_rmse = np.sqrt(root_mean_squared_error(y_train, y_train_pred))
            train_r2 = r2_score(y_train, y_train_pred)
            possible_features.append((col_name, train_r2, train_rmse))

    possible_features.sort(key=lambda x: x[1], reverse=True)
    #print(f"{'Feature':<25} | {'R² Score':<10} | {'RMSE Score' :<4}")
    #print("-" * 50)
    #for col_name, score_r2, score_rmse in possible_features[:10]:
    #    print(f"{col_name:<25} | {score_r2:.4f}     | {score_rmse:.4f}")
    return possible_features[0]   
    


In [32]:
feature_cols = ['livingSpace']
for i in range (25):
    new_feature = make_feature_col(feature_cols)
    feature_cols.append(new_feature[0])
    print(f"Added feature {new_feature[0]}, all R^2: {new_feature[1]}, all RMSE: {new_feature[2]}")

for col in feature_cols:
    print({col})    

Added feature baseRentRange, all R^2: 0.857954176667856, all RMSE: 13.876065013155944
Added feature lift_num, all R^2: 0.8657489599324728, all RMSE: 13.681654642451512
Added feature geo_plz, all R^2: 0.8712855683778247, all RMSE: 13.53835879656765
Added feature lastRefurbish, all R^2: 0.8748098365848186, all RMSE: 13.444719951043654
Added feature interiorQual_num, all R^2: 0.8779834903174464, all RMSE: 13.358689584910215
Added feature noRoomsRange, all R^2: 0.8807997833332172, all RMSE: 13.280929461911766
Added feature scoutId, all R^2: 0.8832871604982209, all RMSE: 13.211096584630342
Added feature newlyConst_num, all R^2: 0.8850267907225208, all RMSE: 13.161590440412672
Added feature heatingType_num, all R^2: 0.8861803776260588, all RMSE: 13.128451193385121
Added feature thermalChar, all R^2: 0.8870283897967419, all RMSE: 13.103929219146686
Added feature cellar_num, all R^2: 0.8875536574189886, all RMSE: 13.088670731666273
Added feature noParkSpaces, all R^2: 0.8881106901180315, all R