In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('housing.csv')

In [3]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [4]:
strings = list(df.dtypes[df.dtypes == 'object'].index)
strings

['ocean_proximity']

In [5]:
for col in strings: 
    df[col]= df[col].str.lower().str.replace(' ', '_')

In [6]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [7]:
for col in df.columns:
    print(col)
    print(df[col].nunique())
    print()

longitude
844

latitude
862

housing_median_age
52

total_rooms
5926

total_bedrooms
1923

population
3888

households
1815

median_income
12928

median_house_value
3842

ocean_proximity
5



# Data Preparation

In [8]:
filtered_data = df[df['ocean_proximity'].isin(['<1h_ocean', 'inland'])]
selected_columns = ['latitude', 'longitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']
filtered_data = filtered_data[selected_columns]
df = filtered_data.reset_index(drop=True)
filtered_data

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
701,37.64,-121.97,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0
830,37.61,-121.99,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0
859,37.57,-121.97,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0
860,37.58,-121.96,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0
861,37.58,-121.98,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0
...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0
20636,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0
20637,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0
20638,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0


# Q-1

In [9]:
missing = filtered_data.isnull().sum()
missing

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

# Q-2

In [10]:
population1 = filtered_data['population']
percentile = np.percentile(population1, 50)
print(f'Median %50 percentile: {percentile}')

Median %50 percentile: 1195.0


# Q-3

In [11]:
df.isnull().sum()

latitude                0
longitude               0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
dtype: int64

In [12]:
n = len(df)
n_val = int(n*0.2)
n_test = int(n*0.2)
n_train = n - n_val - n_test

In [13]:
n, n_val, n_test, n_train

(15687, 3137, 3137, 9413)

In [14]:
df_train = filtered_data.iloc[:n_train]
df_train

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
701,37.64,-121.97,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0
830,37.61,-121.99,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0
859,37.57,-121.97,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0
860,37.58,-121.96,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0
861,37.58,-121.98,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0
...,...,...,...,...,...,...,...,...,...
11480,33.72,-118.03,24.0,5203.0,957.0,2465.0,946.0,5.1630,261000.0
11481,33.72,-118.04,24.0,7141.0,1330.0,3418.0,1268.0,4.6649,237800.0
11482,33.73,-117.99,24.0,2104.0,421.0,1181.0,414.0,3.8365,250900.0
11483,33.73,-118.00,26.0,2236.0,280.0,809.0,282.0,6.7395,342800.0


In [15]:
df_val = filtered_data.iloc[n_train: n_train + n_val]
df_val

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
11485,33.72,-117.99,26.0,1787.0,275.0,801.0,270.0,5.5514,255700.0
11486,33.72,-117.99,17.0,2801.0,649.0,1473.0,535.0,4.2875,134800.0
11487,33.72,-117.99,14.0,2127.0,537.0,1338.0,475.0,3.6280,188500.0
11488,33.71,-118.01,18.0,6565.0,1357.0,3079.0,1248.0,4.7515,295600.0
11489,33.70,-118.01,24.0,3856.0,567.0,1741.0,588.0,7.2480,302700.0
...,...,...,...,...,...,...,...,...,...
16253,37.96,-121.27,52.0,583.0,114.0,310.0,93.0,2.5625,54200.0
16254,37.98,-121.26,41.0,1633.0,433.0,885.0,413.0,0.9782,54200.0
16255,37.98,-121.25,39.0,1765.0,414.0,1056.0,414.0,1.5329,48300.0
16256,37.97,-121.26,31.0,1189.0,295.0,891.0,292.0,2.5536,50500.0


In [16]:
df_test = filtered_data.iloc[n_train +n_val:]
df_test

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
16258,37.96,-121.26,43.0,940.0,208.0,690.0,181.0,2.3056,62300.0
16259,37.97,-121.26,41.0,2398.0,448.0,1143.0,444.0,3.0352,69800.0
16260,37.97,-121.25,41.0,855.0,189.0,716.0,206.0,2.0375,75000.0
16261,37.96,-121.26,35.0,1511.0,316.0,892.0,304.0,1.7898,63500.0
16262,37.95,-121.25,40.0,1703.0,362.0,1208.0,373.0,2.0817,55300.0
...,...,...,...,...,...,...,...,...,...
20635,39.48,-121.09,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0
20636,39.49,-121.21,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0
20637,39.43,-121.22,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0
20638,39.43,-121.32,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0


In [17]:
idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

In [18]:
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train: n_train + n_val]]
df_test = df.iloc[idx[n_train + n_val:]]

In [19]:
len(df_train), len(df_val), len(df_test)

(9413, 3137, 3137)

In [20]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [21]:
y_train = np.log1p(df_train.median_house_value.values)
y_val = np.log1p(df_val.median_house_value.values)
y_test = np.log1p(df_test.median_house_value.values)

In [22]:
len(y_train) , len(y_test),len(y_val)

(9413, 3137, 3137)

## Part I - Dealing with 0 

In [23]:
df_train_0 = df_train.copy(deep = True)


In [24]:
df_train_0['total_bedrooms'] = df_train['total_bedrooms'].fillna(0).values
df_val['total_bedrooms']= df_val['total_bedrooms'].fillna(0).values
df_test['total_bedrooms'] = df_test['total_bedrooms'].fillna(0).values

In [25]:
del df_train_0['median_house_value']
del df_val_0['median_house_value']
del df_test_0['median_house_value']

NameError: name 'df_val_0' is not defined

In [26]:
df_train_0.columns

Index(['latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')

In [27]:
base =['latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'] 

In [28]:
X_train = df_train_0[base].values

In [29]:
y_train

array([11.16196275, 12.32163544, 12.4718963 , ..., 13.12236738,
       12.11888489, 12.52924613])

In [30]:
def train_linear_regression(X,y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [31]:
w0, w = train_linear_regression(X_train, y_train)

In [32]:
y_pred = w0 + X_train.dot(w)
y_pred

array([11.48901968, 12.14856943, 12.45852739, ..., 12.91332356,
       11.93076692, 12.11614341])

In [33]:
def rmse(y, y_pred):
    error = y - y_pred
    se = error **2
    mse = se.mean()
    return np.sqrt(mse)

In [34]:
rmse(y_train, y_pred)

0.34027197158295336

# Validation Model

In [35]:
X_train 

array([[ 3.6230e+01, -1.1914e+02,  2.2000e+01, ...,  1.9270e+03,
         5.3000e+02,  2.5875e+00],
       [ 3.4120e+01, -1.1779e+02,  1.6000e+01, ...,  1.3190e+03,
         4.4600e+02,  4.8125e+00],
       [ 3.3680e+01, -1.1797e+02,  2.6000e+01, ...,  1.9300e+03,
         5.8500e+02,  5.7301e+00],
       ...,
       [ 3.4130e+01, -1.1808e+02,  4.6000e+01, ...,  3.7700e+02,
         1.4500e+02,  8.4546e+00],
       [ 3.4290e+01, -1.1846e+02,  2.4000e+01, ...,  3.1510e+03,
         8.1000e+02,  3.0526e+00],
       [ 3.4000e+01, -1.1841e+02,  1.8000e+01, ...,  8.8400e+02,
         4.5600e+02,  2.9338e+00]])

In [36]:
w0, w = train_linear_regression(X_train, y_train)

In [37]:
y_pred = w0 + X_train.dot(w)

In [38]:
def prepare_X(df):
    df_num = df[base]
    df_num = df_num.fillna(0)
    X = df_num.values
    
    return X

In [39]:
X_train = prepare_X(df_train_0)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X(df_val)

y_pred = w0 + X_val.dot(w)

score = rmse(y_val, y_pred)
round(score,2)

# accurancy 


#validation variable which is using validation model.
#1) I already find also train model variable. 

0.34

## Part II - Dealing with Mean

In [40]:
df_train.isnull().sum()

latitude               0
longitude              0
housing_median_age     0
total_rooms            0
total_bedrooms        94
population             0
households             0
median_income          0
median_house_value     0
dtype: int64

In [41]:
df_means = df_train['total_bedrooms'].mean()
df_means

542.552956325786

In [42]:
df_train_mean = df_train.copy()


In [43]:
df_train_mean['total_bedrooms'] = df_train['total_bedrooms'].fillna(df_means).values


In [44]:
df_train_mean['total_bedrooms'].isnull().sum()

0

In [45]:
base =['latitude', 'longitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'] 

In [46]:
X_train = df_train_mean[base].values

In [47]:
y_train

array([11.16196275, 12.32163544, 12.4718963 , ..., 13.12236738,
       12.11888489, 12.52924613])

In [48]:
def train_linear_regression(X,y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    
    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [49]:
w0, w = train_linear_regression(X_train, y_train)

In [50]:
y_pred = w0 + X_train.dot(w)
y_pred

array([11.48148474, 12.14204505, 12.45148382, ..., 12.9139426 ,
       11.93466055, 12.11281089])

In [51]:
def rmse(y, y_pred):
    error = y - y_pred
    se = error **2
    mse = se.mean()
    return np.sqrt(mse)

In [52]:
rmse(y_train, y_pred)

0.34003008222642594

# 0.34027197158295336

# Validation Model

In [53]:
X_train = df_train_mean[base].fillna(df_means).values

In [54]:
w0, w = train_linear_regression(X_train, y_train)
y_pred = w0 + X_train.dot(w)

In [55]:
def prepare_X(df):
    df_num = df[base]
    df_num = df_num.fillna(df_means)
    X = df_num.values
    
    return X

In [56]:
X_train = prepare_X(df_train_mean)
w0, w = train_linear_regression(X_train, y_train)

X_val = prepare_X(df_val)

y_pred = w0 + X_val.dot(w)
score = rmse(y_val, y_pred)
round(score,2)

0.34

# Q-4

In [57]:
def train_linear_regression_regulation(X, y, r = 0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    
    XTX =X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])
    XTX_inv = np.linalg.inv(XTX)
    
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [58]:
df_train_0

Unnamed: 0,latitude,longitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,36.23,-119.14,22.0,2935.0,523.0,1927.0,530.0,2.5875
1,34.12,-117.79,16.0,2426.0,426.0,1319.0,446.0,4.8125
2,33.68,-117.97,26.0,3653.0,568.0,1930.0,585.0,5.7301
3,34.10,-118.03,32.0,2668.0,609.0,1512.0,541.0,2.9422
4,37.34,-121.87,39.0,2479.0,541.0,1990.0,506.0,2.4306
...,...,...,...,...,...,...,...,...
9408,34.44,-119.15,33.0,2005.0,392.0,1043.0,351.0,5.3080
9409,36.13,-119.13,28.0,1673.0,385.0,1434.0,371.0,2.0586
9410,34.13,-118.08,46.0,1238.0,147.0,377.0,145.0,8.4546
9411,34.29,-118.46,24.0,3668.0,890.0,3151.0,810.0,3.0526


In [59]:
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    X_train = prepare_X(df_train_0)
    w0, w = train_linear_regression_regulation(X_train, y_train, r=r)
    
    X_val = prepare_X(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    
    print(r, w0, score)

0 -9.763249480961484 0.3408479034170524
1e-06 -9.763228834412654 0.3408479061769051
0.0001 -9.76118523895649 0.34084818005444284
0.001 -9.742646252554156 0.34085069218976816
0.01 -9.561056196474489 0.34087793004503447
0.1 -8.05888977211378 0.3412862041967035
1 -3.133154279369632 0.3448958327629285
5 -0.8410867977013567 0.3477398070477501
10 -0.43811723165904154 0.3483149833517867


In [60]:
# 0.3408479034170524 exact the same 0.3408479034170524 the answer is 0

SyntaxError: invalid syntax (3898247304.py, line 1)

# Q-5

In [61]:
df_copy = df.copy()

In [None]:
 df_copy['total_bedrooms'] = df_copy['total_bedrooms'].fillna(0)

In [None]:
df_copy.describe()

In [None]:
n = len(df_copy)

In [None]:
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

In [None]:
    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test

In [62]:
def prepare_data(df:pd.DataFrame, seed:int):
    df_copy = df.copy()
    df_copy['total_bedrooms'] = df_copy['total_bedrooms'].fillna(0)


    n = len(df_copy)

    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    n_val = int(n * 0.2)
    n_test = int(n * 0.2)
    n_train = n - n_val - n_test

    df_train1 = df_copy.iloc[idx[:n_train]]
    df_train1 = df_train1.reset_index(drop=True)

    df_val1 = df_copy.iloc[idx[n_train: n_train + n_val]]
    df_val1 = df_val1.reset_index(drop=True)

    df_test1 = df_copy.iloc[idx[n_train + n_val:]]
    df_test1 = df_test1.reset_index(drop=True)
    
    del df_train1['median_house_value']
    del df_val1['median_house_value']
    del df_test1['median_house_value']
    
    y_train = np.log1p(df_train.median_house_value.values)
    y_val = np.log1p(df_val.median_house_value.values)
    y_test = np.log1p(df_test.median_house_value.values)



    return df_train1, df_val1, df_test1, y_train, y_val, y_test

In [None]:
len(df_train1), len(df_val1), len(df_test1)

In [65]:
def train_and_pred(df:pd.DataFrame, seed:int):

    df_train, df_val, df_test, y_train, y_val, y_test = prepare_data(df, seed)
    df_train = df_train.fillna(0)
    df_val = df_val.fillna(0)
    df_test = df_test.fillna(0)
    
    X_train = prepare_X(df_train)
    w0, w = train_linear_regression_regulation(X_train,
                                               y_train,
                                                  r=0)
    
    X_val = prepare_X(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    
    print(r, w0, score)
    return score

In [82]:
rmse_scores = {}
for seed in range(10):
    rmse_scores[seed] = train_and_pred(df, seed)
    

10 12.265678439039542 0.57480617855295
10 11.524843433174919 0.5745045163229128
10 11.614880414459712 0.574280229069338
10 12.15517556335052 0.5742667268530498
10 12.343016989891748 0.5746549480622645
10 13.393807316425729 0.5741559797221913
10 12.068667149557246 0.5746757570417933
10 11.858259715679646 0.5746185212473118
10 12.140562041569243 0.5742971941239566
10 11.516081607786038 0.5743090170687162


In [85]:
std_deviation = np.std(list(rmse_scores.values()))
rounded_std_deviation = round(std_deviation, 3)

print(rounded_std_deviation)

0.0


In [77]:
df_values = pd.DataFrame(rmse_scores.items(), 
                         columns=['seed', 'score'])

In [79]:
np.std(df_values['score'])

0.00021043109892632822

# Q-6

In [90]:
df_full_train = pd.concat([df_train, df_val])

In [91]:
df_full_train = df_full_train.reset_index(drop=True)

In [92]:
X_full_train = prepare_X(df_full_train)

In [95]:
y_full_train = np.concatenate([y_train, y_val])

In [96]:
w0, w = train_linear_regression_regulation(X_full_train, y_full_train, r=0.0001)

In [97]:
X_test = prepare_X(df_test)
y_pred = w0 + X_test.dot(w)
score = rmse(y_test, y_pred)

score

0.33049973177691405