<a href="https://colab.research.google.com/github/MCannas/BusinessIntelligence/blob/master/1_8_3_Kalifornien_Hauspreise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os # u.a. zur Entwicklugn plattformübergreifender Systempfade
import pandas as pd # Datenmanagement
import numpy as np # Hilfsfunktionen für mathematische Operationen

# Datenvisualisierung
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.model_selection import StratifiedShuffleSplit, train_test_split # Datensplits
from sklearn.linear_model import LinearRegression # Machine Learning
from sklearn import metrics # Modellevaluierung

## eigene Funktionen
def filter_df_by_proximity(df, proximity):
    """Return the rows of ``df`` whose ``ocean_proximity`` equals ``proximity``.

    Args:
        df: DataFrame with an ``ocean_proximity`` column.
        proximity: Category value to keep (e.g. ``"ISLAND"``).

    Returns:
        The matching rows, keeping their original index labels.
    """
    is_match = df["ocean_proximity"].eq(proximity)
    return df.loc[is_match]

def engineer_features(df):
    """Return a copy of ``df`` extended by two derived ratio columns.

    The input frame is left untouched, so re-running a cell that calls this
    function never changes data that an earlier cell already displayed.

    Args:
        df: DataFrame with the columns ``total_bedrooms``, ``total_rooms``,
            ``population`` and ``households``.

    Returns:
        A new DataFrame with the additional columns

        * ``ratio_bedrooms`` = ``total_bedrooms / total_rooms``
        * ``people_per_household`` = ``population / households``

        appended after the existing columns (or overwritten in place if they
        already exist).
    """
    return df.assign(
        ratio_bedrooms=df["total_bedrooms"] / df["total_rooms"],
        people_per_household=df["population"] / df["households"],
    )

In [None]:
import os
import tarfile
import urllib.request
import shutil
import requests

# Remote locations and local file names used by the download steps.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.getcwd()  # default target directory of fetch_housing_data
CALIFORNIA_URL = "https://raw.githubusercontent.com/christianwarmuth/openhpi-kipraxis/main/images/california.png"
CALIFORNIA_PATH = "california.png"
FILE_PATH = "housing.csv"
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

# Download the California image and store it as CALIFORNIA_PATH.
# - timeout: without one, a stalled connection would hang the notebook forever.
# - raise_for_status: fail loudly instead of writing an HTTP error page into
#   the .png file.
# - the context manager releases the connection even if writing fails.
with requests.get(CALIFORNIA_URL, stream=True, timeout=30) as response:
    response.raise_for_status()
    # Let urllib3 undo any gzip/deflate transfer encoding while streaming.
    response.raw.decode_content = True
    with open(CALIFORNIA_PATH, 'wb') as out_file:
        shutil.copyfileobj(response.raw, out_file)
del response

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the gzipped tar archive to download.
        housing_path: Directory that receives ``housing.tgz`` and the
            extracted files; it is created if it does not exist yet.

    Raises:
        urllib.error.URLError: If the download fails.
        tarfile.TarError: If the downloaded file is not a readable archive
            or a member is rejected by the extraction filter.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: the "data" filter refuses
            # members that would be written outside of housing_path.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older Python versions without extraction filters.
            housing_tgz.extractall(path=housing_path)

# Download and unpack the data set, then read the CSV from the directory the
# archive was extracted into.
fetch_housing_data()

df = pd.read_csv(os.path.join(HOUSING_PATH, FILE_PATH))

# Drop every row with a missing attribute and renumber the rows from 0.
df = df.dropna().reset_index(drop=True)

description = df.describe()

# Bin edges for the stratification: 0, the quartiles of median_house_value
# (truncated to integers) and infinity.
quartiles = description["median_house_value"][["25%", "50%", "75%"]].astype(int)
bins = [0] + list(quartiles) + [np.inf]

df["house_cat"] = pd.cut(
    df["median_house_value"],
    bins=bins,
    labels=["0 - 25%", "25 - 50%", "50 - 75%", "75 - 100%"]
)

# A single 90/10 split, stratified by the house value category.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
train_index, test_index = next(split.split(df, df["house_cat"]))

# The splitter yields row positions, not index labels, so select with .iloc;
# this stays correct even if the index is not a plain 0..n-1 range.
df_train = df.iloc[train_index]
df_test = df.iloc[test_index]

# The helper column was only needed for the stratification.
df_train = df_train.drop("house_cat", axis=1)
df_test = df_test.drop("house_cat", axis=1)

# Remove all rows whose ocean_proximity is "ISLAND".
df_train = df_train.drop(filter_df_by_proximity(df_train, "ISLAND").index)
df_test = df_test.drop(filter_df_by_proximity(df_test, "ISLAND").index)

# Add the derived ratio features to both subsets.
df_train = engineer_features(df_train)
df_test = engineer_features(df_test)

In [None]:
# Show the first rows of the prepared training set, including the two
# engineered columns ratio_bedrooms and people_per_household.
df_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,ratio_bedrooms,people_per_household
11242,-117.93,33.74,15.0,1206.0,282.0,677.0,270.0,3.9219,142600.0,<1H OCEAN,0.233831,2.507407
987,-121.72,37.7,17.0,1671.0,352.0,729.0,252.0,6.1023,450000.0,INLAND,0.210652,2.892857
8052,-118.11,33.83,36.0,1462.0,233.0,664.0,220.0,5.1171,225300.0,<1H OCEAN,0.159371,3.018182
6699,-118.15,34.11,52.0,1746.0,330.0,704.0,306.0,3.7895,364800.0,<1H OCEAN,0.189003,2.300654
253,-122.21,37.77,52.0,745.0,153.0,473.0,149.0,2.6765,88800.0,NEAR BAY,0.205369,3.174497


In [None]:
# One-hot encoding of the categorical columns.
df_train_ml = pd.get_dummies(df_train)
# Encode the test set and align it to the training columns: the features are
# later passed to the model as a plain array, so both sets must have the same
# columns in the same order. A category that is missing in the test set
# becomes an all-zero column; a column unknown to the training set is dropped.
df_test_ml = pd.get_dummies(df_test).reindex(
    columns=df_train_ml.columns, fill_value=0
)

In [None]:
# Show the first rows after one-hot encoding: ocean_proximity is replaced by
# one ocean_proximity_<category> column per category.
df_train_ml.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ratio_bedrooms,people_per_household,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
11242,-117.93,33.74,15.0,1206.0,282.0,677.0,270.0,3.9219,142600.0,0.233831,2.507407,True,False,False,False
987,-121.72,37.7,17.0,1671.0,352.0,729.0,252.0,6.1023,450000.0,0.210652,2.892857,False,True,False,False
8052,-118.11,33.83,36.0,1462.0,233.0,664.0,220.0,5.1171,225300.0,0.159371,3.018182,True,False,False,False
6699,-118.15,34.11,52.0,1746.0,330.0,704.0,306.0,3.7895,364800.0,0.189003,2.300654,True,False,False,False
253,-122.21,37.77,52.0,745.0,153.0,473.0,149.0,2.6765,88800.0,0.205369,3.174497,False,False,True,False


In [None]:
def get_features_and_targets(df):
    """Split a DataFrame into a feature matrix and a target vector.

    Args:
        df: DataFrame that contains a ``median_house_value`` column; all other
            columns are used as features.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: ``X`` holds every column except
        ``median_house_value`` (one row per sample), ``y`` holds the
        ``median_house_value`` values as a 1-D array. An empty frame yields
        empty arrays.
    """
    X = df.drop(columns=["median_house_value"]).to_numpy()
    # Direct conversion instead of stacking the values one by one; this is
    # vectorised and also works for a frame without rows.
    y = df["median_house_value"].to_numpy()
    return X, y

In [None]:
# Split the one-hot encoded training and test frames into feature matrices
# (X_*) and median_house_value target vectors (y_*).
X_train, y_train = get_features_and_targets(df_train_ml)
X_test, y_test = get_features_and_targets(df_test_ml)

In [None]:
# Fit a linear regression model on the training features and targets.
# NOTE: despite the name `clf`, this is a regressor, not a classifier.
clf = LinearRegression()
clf.fit(X_train, y_train)