In [1]:
%load_ext autoreload
%autoreload 2

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, sys
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error
from src.preprocessing import clean_data
from src.baseline_model import train_baseline_model, evaluate_model



# Detect current working directory and move up to project root if needed
project_root = os.path.abspath("..") 
if project_root not in sys.path:
    sys.path.append(project_root)

df = pd.read_csv("../data/train.csv")
print("✅ Loaded dataset:", df.shape)

df_clean = clean_data(df)

✅ Loaded dataset: (346, 50)
Dataframe before cleaning:  (346, 50)
Cleaning Data....
Only regarding livingSpace > 10 m² & < 1000m² Shape:  (343, 50)
Missing Values serviceCharge 19
Missing values serviceCharge after filling: 0
Removing columns with to many NaNs
Dataframe nach dem Cleanen:  (343, 30)


In [40]:
results = []

for col in df_clean:
    if col == 'totalRent': continue

    df_tmp = df_clean.dropna(subset=[col, "totalRent"])
    X = df_tmp[[col]].to_numpy()
    y = df_tmp['totalRent'].to_numpy()
    model_full = LinearRegression().fit(X, y)
    r2_full = r2_score(y, model_full.predict(X))
    results.append((col, r2_full))

results.sort(key=lambda x: x[1], reverse=True)
print(f"{'Feature':<25} | {'R² Score':<10}")
print("-" * 40)
for col, score in results:
    print(f"{col:<25} | {score:.4f}")


Feature                   | R² Score  
----------------------------------------
baseRent                  | 0.9783
baseRentRange             | 0.8053
livingSpace               | 0.7847
americanArea              | 0.7847
livingSpaceRange          | 0.7477
serviceCharge             | 0.5736
noRooms                   | 0.5729
noRoomsRange              | 0.5506
picturecount              | 0.1471
condition_num             | 0.1053
balcony_num               | 0.0913
yearConstructedRange      | 0.0494
thermalChar               | 0.0428
newlyConst_num            | 0.0394
geo_plz                   | 0.0321
cellar_num                | 0.0213
lift_num                  | 0.0197
garden_num                | 0.0191
hasKitchen_num            | 0.0179
lastRefurbish             | 0.0149
numberOfFloors            | 0.0112
scoutId                   | 0.0107
yearConstructed           | 0.0075
floor                     | 0.0065
heatingType_num           | 0.0052
interiorQual_num          | 0.0036
telekomUpl

Iteratives Hinzufügen von Features in Regressionsmodell
Es wird jeweils das Feature genutzt welches im den größten neuen Mehrwert bringt

In [36]:
feature_cols_test = ['livingSpace','picturecount']
target_col = 'totalRent'

df_tmp_test = df_clean.dropna(subset=feature_cols_test + [target_col])
X_train_test = df_clean[feature_cols_test].values
y_train_test = df_clean[target_col].values

model_multi_test = LinearRegression()
model_multi_test.fit(X_train_test, y_train_test)

y_train_pred_test = model_multi_test.predict(X_train_test)
train_rmse_test = np.sqrt(root_mean_squared_error(y_train_test, y_train_pred_test))
train_r2_test = r2_score(y_train_test, y_train_pred_test)

print(f"R²: {train_r2_test}")
print(f"RMSE: {train_rmse_test}")


R²: 0.7855157549577536
RMSE: 15.381831798535426


In [48]:
feature_cols = ['livingSpace']
possible_features = []
for col_name, score in results:
    if col_name in ['livingSpace','serviceCharge','baseRent']:
        print(f"{col_name} already in feature_cols or forbidden")
        continue

    feature_cols_tmp = feature_cols + [col_name]
    df_tmp = df_clean.dropna(subset=feature_cols_tmp + [target_col])
    X_train = df_clean[feature_cols_tmp].values
    y_train = df_clean[target_col].values

    model_multi = LinearRegression()
    model_multi.fit(X_train, y_train)

    y_train_pred = model_multi.predict(X_train)
    train_rmse = np.sqrt(root_mean_squared_error(y_train, y_train_pred))
    train_r2 = r2_score(y_train, y_train_pred)
    possible_features.append((col_name, train_r2, train_rmse))

possible_features.sort(key=lambda x: x[1], reverse=True)
print(f"{'Feature':<25} | {'R² Score':<10} | {'RMSE Score' :<4}")
print("-" * 50)
for col_name, score_r2, score_rmse in possible_features:
    print(f"{col_name:<25} | {score_r2:.4f}     | {score_rmse:.4f}")
    
    
    
    
    


baseRent already in feature_cols or forbidden
livingSpace already in feature_cols or forbidden
serviceCharge already in feature_cols or forbidden
Feature                   | R² Score   | RMSE Score
--------------------------------------------------
baseRentRange             | 0.8580     | 13.8761
condition_num             | 0.8173     | 14.7763
yearConstructedRange      | 0.8097     | 14.9290
lift_num                  | 0.8067     | 14.9867
newlyConst_num            | 0.8026     | 15.0652
thermalChar               | 0.8025     | 15.0670
yearConstructed           | 0.7982     | 15.1493
geo_plz                   | 0.7951     | 15.2064
lastRefurbish             | 0.7920     | 15.2637
scoutId                   | 0.7906     | 15.2904
noRoomsRange              | 0.7890     | 15.3193
balcony_num               | 0.7888     | 15.3222
noRooms                   | 0.7879     | 15.3395
hasKitchen_num            | 0.7872     | 15.3518
numberOfFloors            | 0.7864     | 15.3651
heatingType_num 