In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from IPython.core.interactiveshell import InteractiveShell
# Make every bare expression statement in a cell display its value,
# rather than only the last expression of the cell.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Read housing.csv from the current working directory into the notebook-wide frame `df`,
# then show the dtype pandas inferred for each column.
df = pd.read_csv('housing.csv')
df.dtypes

In [None]:
# Drop every row that has at least one missing value; this mutates `df` in place.
df.dropna(inplace=True)
# Replace the ocean_proximity values with the integer class codes produced by LabelEncoder.
# NOTE(review): presumably this column is a nominal category; integer codes give it an
# arbitrary ordering that a linear model will treat as numeric — confirm this is intended.
df['ocean_proximity'] = LabelEncoder().fit_transform(df['ocean_proximity'])
df.head()

In [None]:
def Normalize(dataframe: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Min-max scale the given columns of ``dataframe`` to the range [0, 1].

    Each listed column is rescaled independently as
    ``(value - column_min) / (column_max - column_min)``.

    Args:
        dataframe: Frame to scale. It is modified in place.
        columns: Names of the numeric columns to rescale. An empty list is a no-op.

    Returns:
        The same ``dataframe`` object, with the listed columns rescaled.
    """
    if len(columns) == 0:
        return dataframe

    selected = dataframe[columns]
    lowest = selected.min()
    span = selected.max() - lowest
    # A constant column has a zero span, and 0 / 0 would turn the whole column
    # into NaN. Dividing by 1 instead maps such a column to 0.0.
    span[span == 0] = 1
    dataframe[columns] = (selected - lowest) / span
    return dataframe

def Discretize(dataframe: pd.DataFrame, columns: list, bins: int = 10) -> pd.DataFrame:
    """Replace each listed column with the index of the bin its values fall into.

    Binning is done per column with ``pd.cut``, so with an integer ``bins`` the
    column's own value range is divided into that many equal-width intervals.

    Args:
        dataframe: Frame to discretize. It is modified in place.
        columns: Names of the columns to bin. An empty list leaves the frame unchanged.
        bins: Forwarded to ``pd.cut``; defaults to 10 bins.

    Returns:
        The same ``dataframe`` object, with the listed columns replaced by bin indices.
    """
    for col in columns:
        # labels=False makes pd.cut return integer bin indices instead of interval labels.
        dataframe[col] = pd.cut(dataframe[col], bins=bins, labels=False)
    return dataframe

# Discretization experiments. The two commented-out calls below are alternative
# configurations (all columns, or only two of them); the active call passes an
# empty column list, so no column is discretized.
# df = Discretize(df, [
#     'longitude', 'latitude', 'housing_median_age', 'total_rooms',
#     'total_bedrooms', 'population', 'households', 'median_income',
#     'ocean_proximity','median_house_value'
# ])
# df = Discretize(df, ['housing_median_age', 'median_income'])
df = Discretize(df, [])

# Min-max scale the listed columns, including the target 'median_house_value',
# so the MSE values reported later are in scaled units.
# NOTE(review): the min/max are computed on the full frame before it is split into
# train and test, so test rows influence the scaling — confirm this is acceptable.
df = Normalize(df, [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'total_bedrooms', 'population', 'households', 'median_income',
    'ocean_proximity','median_house_value'
])

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state keeps the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
def trainAtest(
        train_df: pd.DataFrame, test_df: pd.DataFrame,
        target_column: str, train_size: float = 0.8,
) -> np.float64:
    """Fit a linear regression on part of ``train_df`` and return its MSE on ``test_df``.

    Args:
        train_df: Training rows, including the target column.
        test_df: Evaluation rows, with the same columns as ``train_df``.
        target_column: Name of the column to predict; every other column is a feature.
        train_size: Portion of ``train_df`` used for fitting, forwarded to
            ``train_test_split``. A float of exactly 1.0 uses every row.

    Returns:
        Mean squared error of the fitted model's predictions on ``test_df``.
    """
    if isinstance(train_size, float) and train_size == 1.0:
        # train_test_split rejects a float train_size of 1.0, so the full
        # training frame is used directly instead of being subsampled.
        part_df = train_df
    else:
        # Fixed random_state so the same subset is drawn on every run.
        part_df, _ = train_test_split(train_df, train_size=train_size, random_state=42)

    model = LinearRegression()
    model.fit(part_df.drop(columns=[target_column]), part_df[target_column])
    y_pred = model.predict(test_df.drop(columns=[target_column]))
    return mean_squared_error(test_df[target_column], y_pred)

# Print the test-set MSE obtained when fitting on increasing fractions of the training set.
for r in [0.1, 0.3, 0.5, 0.8]:
    mse = trainAtest(train_df, test_df, 'median_house_value', r)
    print(f'{r}: {mse:.4f}')