# Import Libraries

In [99]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Data Loading

In [23]:
# Load the housing data; the relative path is resolved against the current working directory.
df = pd.read_csv("California_Houses.csv")

In [27]:
# Show the (rows, columns) dimensions of the loaded frame.
print(df.shape)

(20640, 14)


In [29]:
# Preview the first rows to sanity-check column names and value ranges.
print(df.head())

   Median_House_Value  Median_Income  Median_Age  Tot_Rooms  Tot_Bedrooms  \
0            452600.0         8.3252          41        880           129   
1            358500.0         8.3014          21       7099          1106   
2            352100.0         7.2574          52       1467           190   
3            341300.0         5.6431          52       1274           235   
4            342200.0         3.8462          52       1627           280   

   Population  Households  Latitude  Longitude  Distance_to_coast  \
0         322         126     37.88    -122.23        9263.040773   
1        2401        1138     37.86    -122.22       10225.733072   
2         496         177     37.85    -122.24        8259.085109   
3         558         219     37.85    -122.25        7768.086571   
4         565         259     37.85    -122.25        7768.086571   

   Distance_to_LA  Distance_to_SanDiego  Distance_to_SanJose  \
0   556529.158342         735501.806984         67432.5170

In [31]:
# Summarise column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Median_House_Value        20640 non-null  float64
 1   Median_Income             20640 non-null  float64
 2   Median_Age                20640 non-null  int64  
 3   Tot_Rooms                 20640 non-null  int64  
 4   Tot_Bedrooms              20640 non-null  int64  
 5   Population                20640 non-null  int64  
 6   Households                20640 non-null  int64  
 7   Latitude                  20640 non-null  float64
 8   Longitude                 20640 non-null  float64
 9   Distance_to_coast         20640 non-null  float64
 10  Distance_to_LA            20640 non-null  float64
 11  Distance_to_SanDiego      20640 non-null  float64
 12  Distance_to_SanJose       20640 non-null  float64
 13  Distance_to_SanFrancisco  20640 non-null  float64
dtypes: flo

In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
print(df.describe())

       Median_House_Value  Median_Income    Median_Age     Tot_Rooms  \
count        20640.000000   20640.000000  20640.000000  20640.000000   
mean        206855.816909       3.870671     28.639486   2635.763081   
std         115395.615874       1.899822     12.585558   2181.615252   
min          14999.000000       0.499900      1.000000      2.000000   
25%         119600.000000       2.563400     18.000000   1447.750000   
50%         179700.000000       3.534800     29.000000   2127.000000   
75%         264725.000000       4.743250     37.000000   3148.000000   
max         500001.000000      15.000100     52.000000  39320.000000   

       Tot_Bedrooms    Population    Households      Latitude     Longitude  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean     537.898014   1425.476744    499.539680     35.631861   -119.569704   
std      421.247906   1132.462122    382.329753      2.135952      2.003532   
min        1.000000      3.000000  

In [37]:
# Report how many missing values each column contains.
print(df.isnull().sum())
# Replace any missing entry with its column mean. The result is assigned back
# instead of using inplace=True, which gives no performance benefit and
# prevents method chaining.
# NOTE(review): the means are taken over the whole dataset, i.e. before the
# train/validation/test split; if the data ever contains missing values,
# compute them on the training portion only to avoid leakage.
df = df.fillna(df.mean())

Median_House_Value          0
Median_Income               0
Median_Age                  0
Tot_Rooms                   0
Tot_Bedrooms                0
Population                  0
Households                  0
Latitude                    0
Longitude                   0
Distance_to_coast           0
Distance_to_LA              0
Distance_to_SanDiego        0
Distance_to_SanJose         0
Distance_to_SanFrancisco    0
dtype: int64


In [37]:
# NOTE(review): this cell repeats the preceding missing-value cell verbatim
# (same execution count); it is harmless to re-run but one copy can be removed.
# Report how many missing values each column contains.
print(df.isnull().sum())
# Replace any missing entry with its column mean, assigning the result back
# instead of using inplace=True.
df = df.fillna(df.mean())

Median_House_Value          0
Median_Income               0
Median_Age                  0
Tot_Rooms                   0
Tot_Bedrooms                0
Population                  0
Households                  0
Latitude                    0
Longitude                   0
Distance_to_coast           0
Distance_to_LA              0
Distance_to_SanDiego        0
Distance_to_SanJose         0
Distance_to_SanFrancisco    0
dtype: int64


# Data Splitting

In [101]:
# Separate the regression target from the feature matrix (both as NumPy arrays).
y = df["Median_House_Value"].to_numpy()
X = df.drop(columns="Median_House_Value").to_numpy()

# Hold out 30% of the rows first, then halve that hold-out into validation and
# test sets, which yields a 70/15/15 split overall.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print("Train size:", len(X_train))
print("Validation size:", len(X_val))
print("Test size:", len(X_test))

Train size: 14448
Validation size: 3096
Test size: 3096


In [74]:
# Display the (rows, columns) shape of each split.
tuple(part.shape for part in (X_train, X_val, X_test))

((14448, 13), (3096, 13), (3096, 13))

# Create Model From Scratch

### Implementation using the Normal Equation

In [207]:
# Prepend a column of ones so the first weight acts as the bias (intercept).
X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_val_b = np.c_[np.ones((X_val.shape[0], 1)), X_val]

# Calculate weights (w) from the normal equation (X^T X) w = X^T y.
# The system is solved directly instead of forming inv(X^T X): the features are
# unscaled at this point, and an explicit inverse of such a matrix is both
# slower and less accurate than a linear solve.
w = np.linalg.solve(X_train_b.T.dot(X_train_b), X_train_b.T.dot(y_train))

# Predict on the validation set.
y_val_pred = X_val_b.dot(w)
print(y_val_pred)

[117112.59499985 332653.97680117 406311.5956708  ... 254464.49377846
 142097.9688237  205083.4967788 ]


#### Feature Scaling

# Standardise the features with statistics taken from the training split only,
# then apply the same shift and scale to the validation split.
X_mean = X_train.mean(axis=0)
X_std = X_train.std(axis=0)
X_train_scaled = np.divide(X_train - X_mean, X_std)
X_val_scaled = np.divide(X_val - X_mean, X_std)

### Implementation using Gradient Descent

In [170]:
def gradient_descent(X, y, learning_rate=0.01, iterations=1000, random_state=None):
    """Fit linear-regression weights by batch gradient descent on the MSE.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix without a bias column; a column of ones is prepended
        internally.
    y : array-like holding n_samples target values
        Reshaped to a column vector internally.
    learning_rate : float, default 0.01
        Step size applied to the gradient at every iteration.
    iterations : int, default 1000
        Number of full-batch update steps.
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) for the random weight initialisation. With None
        the weights are drawn from NumPy's global random state, exactly as
        before this parameter existed; pass an int for reproducible runs.

    Returns
    -------
    numpy.ndarray of shape (n_features + 1, 1)
        Column vector of weights; row 0 is the bias.

    Raises
    ------
    ValueError
        If ``y`` is empty (the gradient's 1/n factor would be undefined).
    """
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    n = y.shape[0]
    if n == 0:
        raise ValueError("gradient_descent requires at least one sample")

    X_b = np.c_[np.ones((n, 1)), X]
    n_weights = X_b.shape[1]

    if random_state is None:
        # Keep the original behaviour: draw from the global NumPy RNG.
        w = np.random.randn(n_weights, 1)
    else:
        w = np.random.default_rng(random_state).standard_normal((n_weights, 1))

    for _ in range(iterations):
        # Gradient of mean((X_b @ w - y) ** 2) with respect to w.
        gradients = (2 / n) * X_b.T.dot(X_b.dot(w) - y)
        w -= learning_rate * gradients
    return w

# Fit on the standardised training data, then score the validation rows by
# prepending the bias column and multiplying by the learned weights.
w_gd = gradient_descent(X_train_scaled, y_train)
y_val_pred_gd = np.hstack([np.ones((X_val.shape[0], 1)), X_val_scaled]) @ w_gd
print(y_val_pred_gd)

[[124639.76014016]
 [338778.99633609]
 [402108.32176819]
 ...
 [233954.01149026]
 [159274.84100195]
 [211716.54776925]]


In [139]:
# Compare the mean and spread of the training targets with those of the
# gradient-descent validation predictions.
print(y_train.mean(), y_train.std())
print(y_val_pred_gd.mean(), y_val_pred_gd.std())


206923.9608942414 115745.23650984169
205464.92477700833 90868.05471370397


### L2 Regularization (Ridge Regression)

In [190]:
def ridge_regression(X, y, lampda=0.01):
    """Closed-form ridge (L2-regularised) linear regression.

    Solves ``(X_b^T X_b + lampda * n * P) w = X_b^T y`` where ``X_b`` is ``X``
    with a leading column of ones, ``n`` is the number of samples and ``P`` is
    the identity matrix with its first diagonal entry zeroed so that the bias
    is not penalised.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix without a bias column.
    y : array-like of length n_samples
        Target values.
    lampda : float, default 0.01
        Regularisation strength (lambda); it is multiplied by the sample count
        before being added to the diagonal.

    Returns
    -------
    numpy.ndarray
        Weights with the bias first; shape (n_features + 1,) for 1-D ``y``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the regularised system is singular.
    """
    n = len(y)
    X_b = np.c_[np.ones((n, 1)), X]
    penalty = np.eye(X_b.shape[1])
    penalty[0, 0] = 0  # Do not penalize the bias term.
    regularised_gram = X_b.T.dot(X_b) + lampda * n * penalty
    # Solve the linear system directly instead of forming an explicit inverse;
    # this gives the same weights with less rounding error.
    return np.linalg.solve(regularised_gram, X_b.T.dot(y))
# Fit ridge regression on the standardised training data and predict the
# validation targets (bias column prepended to the scaled features).
w_l2 = ridge_regression(X_train_scaled, y_train)
y_val_pred_l2 = np.hstack([np.ones((X_val.shape[0], 1)), X_val_scaled]) @ w_l2
print(y_val_pred_l2)

[124073.98268416 334353.86884925 401782.00832662 ... 241680.16159623
 152309.03705059 210073.3202035 ]


In [188]:
# Compare the mean and spread of the training targets with those of the
# ridge-regression validation predictions.
print(y_train.mean(), y_train.std())
print(y_val_pred_l2.mean(), y_val_pred_l2.std())

206923.9608942414 115745.23650984169
205388.11060144924 92120.80930941543


### Mean Square Error (MSE)

In [199]:
def mean_squared_error_manual(y_true, y_pred, halve=True):
    """Return the (by default halved) mean squared error between two arrays.

    Both inputs are flattened before subtraction. Without this, a 1-D
    ``y_true`` of length n and a column-vector ``y_pred`` of shape (n, 1)
    would broadcast to an (n, n) matrix and silently give a wrong score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; same number of elements as ``y_true`` (a single
        value is broadcast against all targets).
    halve : bool, default True
        When True the result is ``0.5 * mean((y_true - y_pred) ** 2)``, the
        half-MSE cost convention this function has always returned. Pass
        False to get the conventional, unhalved MSE.

    Returns
    -------
    float
        The halved or plain mean squared error.

    Raises
    ------
    ValueError
        If the inputs hold different numbers of elements and neither is a
        single value.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.size != y_pred.size and 1 not in (y_true.size, y_pred.size):
        raise ValueError(
            f"y_true and y_pred differ in size: {y_true.size} vs {y_pred.size}"
        )
    mse = np.mean((y_true - y_pred) ** 2)
    return 0.5 * mse if halve else mse

### Mean Absolute Error (MAE)

In [201]:
def mean_absolute_error_manual(y_true, y_pred):
    """Return the mean absolute error between two arrays.

    Both inputs are flattened before subtraction. Without this, a 1-D
    ``y_true`` of length n and a column-vector ``y_pred`` of shape (n, 1)
    would broadcast to an (n, n) matrix and silently give a wrong score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; same number of elements as ``y_true`` (a single
        value is broadcast against all targets).

    Returns
    -------
    float
        ``mean(|y_true - y_pred|)``.

    Raises
    ------
    ValueError
        If the inputs hold different numbers of elements and neither is a
        single value.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.size != y_pred.size and 1 not in (y_true.size, y_pred.size):
        raise ValueError(
            f"y_true and y_pred differ in size: {y_true.size} vs {y_pred.size}"
        )
    return np.mean(np.abs(y_true - y_pred))

In [211]:
# Score the normal-equation predictions against the validation targets.
mae_value = mean_absolute_error_manual(y_val, y_val_pred)
mse_value = mean_squared_error_manual(y_val, y_val_pred)

print(f"Manual MSE: {mse_value}", f"Manual MAE: {mae_value}", sep="\n")

Manual MSE: 2453605998.6716285
Manual MAE: 50790.06027315035
