## Leistungsnachweis

In [33]:
# Libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import joblib
import gradio as gr


# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Load data

# Read the data to a pandas data frame
df = pd.read_csv('apartments_data_enriched_lat_lon_combined.csv', sep=',', encoding='utf-8')

# Data cleaning

# Remove missing values
df = df.dropna()

# Remove duplicates
df = df.drop_duplicates()

# Remove some 'extreme' values
df = df.loc[(df['price'] >= 750) & 
            (df['price'] <= 8000)]

df.columns


Index(['bfs_number', 'rooms', 'area', 'price', 'postalcode', 'address', 'town',
       'description_raw', 'bfs_name', 'pop', 'pop_dens', 'frg_pct', 'emp',
       'tax_income', 'lat', 'lon', 'x', 'y'],
      dtype='object')

### Model Performance

In [34]:
def model_performance(features, df, random_forest_model = RandomForestRegressor(random_state=42)):
    df = df.sample(frac=1, random_state=42)
    X, y = df[features], df['price']
    scores = cross_val_score(random_forest_model, X, y, scoring="neg_root_mean_squared_error", cv=5)
    print('CV results RMSE:', np.round(scores))
    print('Mean RMSE:', np.mean(np.round(scores, 0)))

features = ['rooms', 'area', 'postalcode','pop', 'pop_dens', 'tax_income']
model_performance(features, df)

CV results RMSE: [-602. -579. -629. -629. -574.]
Mean RMSE: -602.6


### Feature Engineering

In [35]:
# Feature engineering

df['density_factor'] = np.log1p(df['pop_dens']) * df['rooms']

features = ['rooms', 'area', 'postalcode','pop', 'pop_dens', 'tax_income','density_factor']
model_performance(features, df)

print(df.columns)

CV results RMSE: [-618. -576. -663. -629. -559.]
Mean RMSE: -609.0
Index(['bfs_number', 'rooms', 'area', 'price', 'postalcode', 'address', 'town',
       'description_raw', 'bfs_name', 'pop', 'pop_dens', 'frg_pct', 'emp',
       'tax_income', 'lat', 'lon', 'x', 'y', 'density_factor'],
      dtype='object')


### Train Model

In [36]:
features = ['rooms', 'area', 'postalcode','pop', 'pop_dens', 'tax_income','density_factor']
X = df[features]
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save model
joblib.dump(model, "apartment_price_model.pkl")

['apartment_price_model.pkl']

### Gradio Web Interface

In [37]:
def predict_price(rooms, area, postalcode, pop, pop_dens, tax_income, density_factor):
    input_data = pd.DataFrame([[rooms, area, postalcode, pop, pop_dens, tax_income, density_factor]],
                              columns=features)
    price = model.predict(input_data)[0]
    return f"Geschätzter Preis: {price:.2f} CHF"

In [38]:
# Interface
app = gr.Interface(
    fn=predict_price,
    inputs=[
        gr.Number(label="Anzahl Zimmer"),
        gr.Number(label="Fläche (qm)"),
        gr.Number(label="Postleitzahl"),
    ],
    outputs="text",
    title="Apartment Preis Schätzer",
    description="Geben Sie die Anzahl der Zimmer, die Fläche und die Postleitzahl ein. Bevölkerungsdaten werden automatisch abgerufen und density_factor berechnet.",
    examples = [[2, 122, 8050],
            [1.5, 30, 8008]]
)


# Launch the Gradio app
app.launch(share=True)


!jupyter nbconvert --to script feature_engineering.ipynb
    



* Running on local URL:  http://127.0.0.1:7867
* Running on public URL: https://cea133685c98ccf378.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


[NbConvertApp] Converting notebook feature_engineering.ipynb to script
[NbConvertApp] Writing 3106 bytes to feature_engineering.py


In [39]:
import pickle

model_filename = "feature_engineering.pkl"
with open(model_filename, mode="bw") as f:
    pickle.dump(model, file=f)