In [22]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold

# Lade dein Dataset
data = pd.read_csv('Housing.csv')

# Kodieren der kategorischen Merkmale in numerische Werte
label_encoder = LabelEncoder()
categorical_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']

for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Splitte das Dataset in Features (X) und Target (y)
X = data.drop('price', axis=1)  # Features (alle Spalten außer 'price')
y = data['price']  # Zielvariable (Preis)

# Definiere die Regressionsmodelle, die du testen möchtest
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(max_depth=5, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
}

# 5-fache Kreuzvalidierung
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Ergebnisse speichern
results = {model_name: {'mse': [], 'mae': [], 'r2': []} for model_name in models.keys()}

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        
        mse = mean_squared_error(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        r_squared = 1 - mse / np.var(y_test)
        
        results[model_name]['mse'].append(mse)
        results[model_name]['mae'].append(mae)
        results[model_name]['r2'].append(r_squared)

# Durchschnittliche Fehlermaße über die 5 Folds berechnen
for model_name in models.keys():
    avg_mse = np.mean(results[model_name]['mse'])
    avg_mae = np.mean(results[model_name]['mae'])
    avg_r2 = np.mean(results[model_name]['r2'])
    
    print(f"{model_name} - Mean Absolute Error: {avg_mae}")
    print(f"{model_name} - Mean Squared Error: {avg_mse}")
    print(f"{model_name} - R^2: {avg_r2}\n")


Linear Regression - Mean Absolute Error: 806130.940497786
Linear Regression - Mean Squared Error: 1200394629952.1387
Linear Regression - R^2: 0.6318615631007681

Decision Tree - Mean Absolute Error: 1000125.1070078366
Decision Tree - Mean Squared Error: 2008150667497.0679
Decision Tree - R^2: 0.3573342600434061

Random Forest - Mean Absolute Error: 821905.4477716297
Random Forest - Mean Squared Error: 1312101748375.1194
Random Forest - R^2: 0.6083633997508636

Gradient Boosting - Mean Absolute Error: 826092.7293674946
Gradient Boosting - Mean Squared Error: 1364167975030.7717
Gradient Boosting - R^2: 0.585185711602817



In [25]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBRegressor

# Lade dein Dataset
data = pd.read_csv('Housing.csv')

# Kodieren der kategorischen Merkmale in numerische Werte
label_encoder = LabelEncoder()
categorical_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']

for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Erstelle eine Preis-Kategorie für gleichmäßige Verteilung in den Folds
data['price_category'] = pd.qcut(data['price'], q=5, labels=False)

# Splitte das Dataset in Features (X) und Target (y)
X = data.drop(['price', 'price_category'], axis=1)  # Features (alle Spalten außer 'price' und 'price_category')
y = data['price']  # Zielvariable (Preis)

# Wende den MinMaxScaler an
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)  # Umwandlung zurück in DataFrame

# Definiere die Regressionsmodelle, die du testen möchtest
lr = LinearRegression()
lasso = Lasso(alpha=1.0)  # L1-Regularisierung
ridge = Ridge(alpha=1.0)  # L2-Regularisierung
dt = DecisionTreeRegressor(max_depth=4, random_state=42)
rf = RandomForestRegressor(n_estimators=100, max_depth=4, random_state=42)
gb = GradientBoostingRegressor(n_estimators=100, max_depth=4, random_state=42)
xgb = XGBRegressor(n_estimators=100, max_depth=4, random_state=42)

models = {
    'Linear Regression': lr,
    'Lasso Regression': lasso,
    'Ridge Regression': ridge,
    'Decision Tree': dt,
    'Random Forest': rf,
    'Gradient Boosting': gb,
    'XGBoost': xgb,
}

# Erstelle den Voting Regressor
voting_regressor = VotingRegressor(estimators=[
    ('Linear Regression', lr),
    ('Lasso Regression', lasso),
    ('Ridge Regression', ridge),
    ('Decision Tree', dt),
    ('Random Forest', rf),
    ('Gradient Boosting', gb),
    ('XGBoost', xgb)
])

# 5-fache stratifizierte Kreuzvalidierung
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Ergebnisse speichern
results = {model_name: {'mse': [], 'mae': [], 'r2': []} for model_name in models.keys()}

for train_index, test_index in kf.split(X, data['price_category']):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        
        mse = mean_squared_error(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        r_squared = 1 - mse / np.var(y_test)
        
        results[model_name]['mse'].append(mse)
        results[model_name]['mae'].append(mae)
        results[model_name]['r2'].append(r_squared)

# Durchschnittliche Fehlermaße über die 5 Folds berechnen
for model_name in models.keys():
    avg_mse = np.mean(results[model_name]['mse'])
    avg_mae = np.mean(results[model_name]['mae'])
    avg_r2 = np.mean(results[model_name]['r2'])
    
    print(f"{model_name} - Mean Absolute Error: {avg_mae}")
    print(f"{model_name} - Mean Squared Error: {avg_mse}")
    print(f"{model_name} - R^2: {avg_r2}\n")


Linear Regression - Mean Absolute Error: 808096.4059761824
Linear Regression - Mean Squared Error: 1212669771709.4429
Linear Regression - R^2: 0.6477055410558171

Lasso Regression - Mean Absolute Error: 808095.3086202976
Lasso Regression - Mean Squared Error: 1212670138510.1226
Lasso Regression - R^2: 0.6477055505104865

Ridge Regression - Mean Absolute Error: 805920.2556641481
Ridge Regression - Mean Squared Error: 1212675827805.236
Ridge Regression - R^2: 0.6481755557505029

Decision Tree - Mean Absolute Error: 1032638.5670892993
Decision Tree - Mean Squared Error: 2112409826045.3188
Decision Tree - R^2: 0.38296932218121615

Random Forest - Mean Absolute Error: 877362.5366530225
Random Forest - Mean Squared Error: 1504393171156.2156
Random Forest - R^2: 0.5691086780831913

Gradient Boosting - Mean Absolute Error: 813776.2998545977
Gradient Boosting - Mean Squared Error: 1366947096900.139
Gradient Boosting - R^2: 0.602941708376014

XGBoost - Mean Absolute Error: 851684.425
XGBoost - M

In [29]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import LeaveOneOut
from xgboost import XGBRegressor

# Lade dein Dataset
data = pd.read_csv('Housing.csv')

# Kodieren der kategorischen Merkmale in numerische Werte
label_encoder = LabelEncoder()
categorical_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']

for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# Splitte das Dataset in Features (X) und Target (y)
X = data.drop(['price'], axis=1)  # Features (alle Spalten außer 'price')
y = data['price']  # Zielvariable (Preis)

# Wende den MinMaxScaler an
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)  # Umwandlung zurück in DataFrame

# Definiere die Regressionsmodelle, die du testen möchtest
lr = LinearRegression()
lasso = Lasso(alpha=1.0)  # L1-Regularisierung
ridge = Ridge(alpha=1.0)  # L2-Regularisierung
dt = DecisionTreeRegressor(max_depth=5, random_state=42)
rf = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
gb = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
xgb = XGBRegressor(n_estimators=200, max_depth=5, random_state=42)

models = {
    'Linear Regression': lr,
    'Lasso Regression': lasso,
    'Ridge Regression': ridge,
    'Decision Tree': dt,
    'Random Forest': rf,
    'Gradient Boosting': gb,
    'XGBoost': xgb,
}

# Erstelle den Voting Regressor
voting_regressor = VotingRegressor(estimators=[
    ('Linear Regression', lr),
    ('Lasso Regression', lasso),
    ('Ridge Regression', ridge),
    ('Decision Tree', dt),
    ('Random Forest', rf),
    ('Gradient Boosting', gb),
    ('XGBoost', xgb)
])

# Leave-One-Out-Cross-Validation (LOOCV)
loo = LeaveOneOut()

# Ergebnisse speichern
results = {model_name: {'mse': [], 'mae': [], 'y_true': [], 'y_pred': []} for model_name in models.keys()}

for train_index, test_index in loo.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        
        mse = mean_squared_error(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        
        results[model_name]['mse'].append(mse)
        results[model_name]['mae'].append(mae)
        results[model_name]['y_true'].append(y_test.values[0])
        results[model_name]['y_pred'].append(predictions[0])

# Durchschnittliche Fehlermaße über die LOOCV-Folds berechnen
for model_name in models.keys():
    avg_mse = np.mean(results[model_name]['mse'])
    avg_mae = np.mean(results[model_name]['mae'])
    overall_r2 = r2_score(results[model_name]['y_true'], results[model_name]['y_pred'])
    
    print(f"{model_name} - Mean Absolute Error: {avg_mae}")
    print(f"{model_name} - Mean Squared Error: {avg_mse}")
    print(f"{model_name} - R^2: {overall_r2}\n")


Linear Regression - Mean Absolute Error: 797403.6997686285
Linear Regression - Mean Squared Error: 1183730297226.8325
Linear Regression - R^2: 0.6610286589435679

Lasso Regression - Mean Absolute Error: 797402.5796475348
Lasso Regression - Mean Squared Error: 1183730395453.0757
Lasso Regression - R^2: 0.6610286308156392

Ridge Regression - Mean Absolute Error: 796105.3804100065
Ridge Regression - Mean Squared Error: 1184196397107.799
Ridge Regression - R^2: 0.6608951872379879

Decision Tree - Mean Absolute Error: 1104328.495696463
Decision Tree - Mean Squared Error: 2481925356076.1963
Decision Tree - R^2: 0.28927934993126514

Random Forest - Mean Absolute Error: 837203.650462113
Random Forest - Mean Squared Error: 1388401822169.2297
Random Forest - R^2: 0.6024192092671312

Gradient Boosting - Mean Absolute Error: 803751.3722534117
Gradient Boosting - Mean Squared Error: 1341249133114.1177
Gradient Boosting - R^2: 0.615921786907387

XGBoost - Mean Absolute Error: 846269.0211009175
XGBoo