In [32]:
import pandas as pd
import numpy as np

In [33]:
# Load the raw housing dataset; the path is relative to the notebook's working directory.
df=pd.read_csv("housing.csv")

In [34]:
# Normalise column names: lower-case them and replace spaces with underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [35]:
# Keep only the rows whose ocean_proximity is '<1H OCEAN' or 'INLAND'.
filtered_df = df.loc[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]

In [36]:
# Preview the first rows of the filtered frame (original row labels are still in place).
filtered_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
701,-121.97,37.64,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0,<1H OCEAN
830,-121.99,37.61,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0,<1H OCEAN
859,-121.97,37.57,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0,<1H OCEAN
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0,<1H OCEAN
861,-121.98,37.58,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0,<1H OCEAN


In [37]:
# Restrict the frame to the numeric columns used below; the categorical
# ocean_proximity column is dropped, and median_house_value (the target) is kept last.
filtered_df = filtered_df.loc[:, [
    'latitude', 'longitude',
    'housing_median_age',
    'total_rooms', 'total_bedrooms',
    'population', 'households',
    'median_income',
    'median_house_value',
]]

In [38]:
# Preview the frame again to confirm the column selection and order.
filtered_df.head()

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
701,37.64,-121.97,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0
830,37.61,-121.99,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0
859,37.57,-121.97,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0
860,37.58,-121.96,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0
861,37.58,-121.98,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0


In [39]:
# Count missing values in each column.
filtered_df.isnull().sum()
# Q1 --> total_bedrooms is the only column with missing values


latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [40]:
# Median of the population column over the filtered rows.
filtered_df.population.median()
# Q2 --> 1195

1195.0

In [41]:
# Renumber rows 0..n-1 and discard the original row labels, which had gaps after filtering.
filtered_df = filtered_df.reset_index(drop=True)

In [42]:
# Display the frame to confirm the index now starts at 0 (pandas truncates the middle rows).
filtered_df

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,37.64,-121.97,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0
1,37.61,-121.99,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0
2,37.57,-121.97,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0
3,37.58,-121.96,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0
4,37.58,-121.98,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0
...,...,...,...,...,...,...,...,...,...
15682,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0
15683,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0
15684,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0
15685,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0


In [43]:
# Split sizes: 20% validation, 20% test, and whatever remains (roughly 60%)
# for training, so the three parts always add up to n.
n = len(filtered_df)
n_val = n_test = int(n * 0.2)
n_train = n - n_val - n_test

# Build a reproducible random ordering of the row positions (seed 42).
np.random.seed(42)
idx = np.arange(n)
np.random.shuffle(idx)

In [44]:
# Cut the shuffled positions into consecutive train / validation / test
# slices and renumber each part from 0 in the same step.
train_df = filtered_df.iloc[idx[:n_train]].reset_index(drop=True)
val_df = filtered_df.iloc[idx[n_train:n_train + n_val]].reset_index(drop=True)
test_df = filtered_df.iloc[idx[n_train + n_val:]].reset_index(drop=True)

In [45]:
# Targets: log1p-transformed median_house_value for each split.
y_train, y_val, y_test = (
    np.log1p(part['median_house_value'])
    for part in (train_df, val_df, test_df)
)

In [46]:
# Remove the target column from the feature frames so it cannot be used as a feature.
# Like the original `del`, this raises KeyError if the cell is re-run.
train_df = train_df.drop(columns=['median_house_value'])
val_df = val_df.drop(columns=['median_house_value'])
test_df = test_df.drop(columns=['median_house_value'])


In [47]:
# Impute missing values two ways so the strategies can be compared (Q3):
#   *_0    -> every NaN replaced by 0
#   *_mean -> NaN in total_bedrooms replaced by the TRAINING-set mean
# The mean is computed once, from the training split only, and reused for the
# validation split so no validation information leaks into the imputation.
bedrooms_mean = train_df.total_bedrooms.mean()

train_df_0 = train_df.fillna(0)
# Fill only total_bedrooms: a scalar fillna would also write the bedrooms
# mean into any other column that happened to contain NaN.
train_df_mean = train_df.fillna({'total_bedrooms': bedrooms_mean})

val_df_0 = val_df.fillna(0)
val_df_mean = val_df.fillna({'total_bedrooms': bedrooms_mean})

# The test split is only ever used zero-filled.
test_df = test_df.fillna(0)


In [48]:
# Mean of total_bedrooms in the training split after NaNs were replaced by 0.
train_df_0.total_bedrooms.mean()

537.1349197917773

In [49]:
# Mean of total_bedrooms in the training split after NaNs were replaced by the training mean.
train_df_mean.total_bedrooms.mean()

542.552956325786

In [50]:
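# The missing values were filled above in two ways: with 0 (the *_0 frames) and with the training mean of total_bedrooms (the *_mean frames).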


In [51]:
# Convert the imputed feature frames to NumPy arrays for the linear-algebra code below.
X_train_0 = train_df_0.values
X_train_mean = train_df_mean.values
# Validation matrices are built per imputation strategy in the next cell.
X_test = test_df.values

In [52]:
# Validation feature matrices, one per imputation strategy (0-fill and training-mean fill).
X_val_0 = val_df_0.values
X_val_mean = val_df_mean.values

In [53]:
def train_linear_regression(X, y):
    """Fit a linear regression with the normal equation.

    A column of ones is prepended to ``X`` so the first learned weight
    acts as the intercept, then ``(X^T X)^-1 X^T y`` is evaluated.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per observation.
    y : array-like
        Target values, one per row of ``X``.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the intercept and ``w`` holds the
        remaining weights, one per feature column.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X`` is singular and cannot be inverted.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T.dot(design)
    gram_inv = np.linalg.inv(gram)
    weights = gram_inv.dot(design.T).dot(y)

    intercept, coefficients = weights[0], weights[1:]
    return intercept, coefficients

In [54]:
def rmse(y, y_pred):
    """Return the root mean squared error between ``y`` and ``y_pred``.

    The mean is taken with the ``.mean()`` method of the squared-error
    object, so its semantics follow the input type (for a pandas Series,
    NaN entries are skipped by default).
    """
    residual = y - y_pred
    return np.sqrt((residual ** 2).mean())

In [55]:
# Q3, option 1: train on the zero-filled features and score on the zero-filled validation set.
w0, w = train_linear_regression(X_train_0, y_train)
y_pred = w0 + X_val_0.dot(w)
round(rmse(y_val, y_pred),2)

0.34

In [56]:
# Q3, option 2: same model, but with missing values filled by the training mean.
w0, w = train_linear_regression(X_train_mean, y_train)
y_pred = w0 + X_val_mean.dot(w)
round(rmse(y_val, y_pred),2)

0.34

In [57]:
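# Q3 --> both imputation strategies (fill with 0, fill with the training mean) give the same validation RMSE, 0.34, after rounding to 2 decimals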

In [58]:
def train_linear_regression_reg(X, y, r=0.001):
    """Fit a regularized linear regression with the normal equation.

    Works like the unregularized solver, except that ``r`` is added to
    every diagonal entry of ``X^T X`` (including the intercept's entry)
    before the matrix is inverted.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per observation.
    y : array-like
        Target values, one per row of ``X``.
    r : float, default 0.001
        Regularization strength added to the diagonal of ``X^T X``.

    Returns
    -------
    tuple
        ``(w0, w)`` where ``w0`` is the intercept and ``w`` holds the
        remaining weights, one per feature column.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T.dot(design)
    gram = gram + r * np.eye(gram.shape[0])

    weights = np.linalg.inv(gram).dot(design.T).dot(y)

    intercept, coefficients = weights[0], weights[1:]
    return intercept, coefficients

In [59]:
# Q4: sweep the regularization strength on the zero-filled data and print the
# validation RMSE (2 decimals) for each value.
# Result: the first options all print the same score, so the smallest r among them (0) is the answer.
rs = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]
for r in rs:
    w0, w = train_linear_regression_reg(X_train_0, y_train, r)
    y_pred = X_val_0.dot(w) + w0
    score = rmse(y_val, y_pred)
    print(round(score, 2))

0.34
0.34
0.34
0.34
0.34
0.34
0.34
0.35
0.35


In [60]:
# Q5: how much does the validation RMSE depend on the random split?
# For each seed 0-9: reshuffle, split, zero-fill, fit the unregularized
# model and score it on validation; then report the spread of the scores.
seeds = list(range(10))
scores = []
for seed in seeds:
    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)

    train_df = filtered_df.iloc[idx[:n_train]].reset_index(drop=True)
    val_df = filtered_df.iloc[idx[n_train:n_train + n_val]].reset_index(drop=True)

    # pop() removes the target column from the features and returns it in one step.
    y_train = np.log1p(train_df.pop('median_house_value'))
    y_val = np.log1p(val_df.pop('median_house_value'))

    train_df_0 = train_df.fillna(0)
    val_df = val_df.fillna(0)
    X_train_0, X_val = train_df_0.values, val_df.values

    w0, w = train_linear_regression(X_train_0, y_train)
    y_pred = X_val.dot(w) + w0
    scores.append(rmse(y_val, y_pred))

print(round(np.std(scores), 3))

# Q5 --> 0.005

0.005


In [61]:
# Q6: final evaluation. Re-split with seed 9, train the regularized model
# (r=0.001) on train + validation combined, and report RMSE on the test set.
np.random.seed(9)
r = 0.001
idx = np.arange(n)
np.random.shuffle(idx)

train_df = filtered_df.iloc[idx[:n_train]]
val_df = filtered_df.iloc[idx[n_train:n_train + n_val]]
test_df = filtered_df.iloc[idx[n_train + n_val:]]

# Targets are modelled on the log1p scale, as in the earlier cells.
y_train = np.log1p(train_df['median_house_value'])
y_val = np.log1p(val_df['median_house_value'])
y_test = np.log1p(test_df['median_house_value'])

# Remove the target from the features. drop() returns new frames, so nothing
# is deleted in place from the frames produced by iloc.
train_df = train_df.drop(columns=['median_house_value'])
val_df = val_df.drop(columns=['median_house_value'])
test_df = test_df.drop(columns=['median_house_value'])

df_full_train = pd.concat([train_df, val_df]).reset_index(drop=True)
y_full_train = np.concatenate([y_train, y_val])

# Impute missing values with 0 in BOTH the training and the test features.
# test_df is rebuilt from filtered_df above, so it still contains the NaNs in
# total_bedrooms. Without this fill the predictions for those rows are NaN and
# rmse() drops them silently (pandas .mean() skips NaN), so the reported score
# would be computed on only part of the test set.
df_full_train = df_full_train.fillna(0)
test_df = test_df.fillna(0)

X_full_train = df_full_train.values
X_test = test_df.values
assert not np.isnan(X_test).any(), "test features still contain NaN"

w0, w = train_linear_regression_reg(X_full_train, y_full_train, r)
y_pred = w0 + X_test.dot(w)
print(round(rmse(y_test, y_pred), 2))




0.33
