# From Scratch

### Import Libraries and Load Data

In [1]:
import pandas as pd
import numpy as np

# Load the California Housing Dataset
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
housing_data = pd.read_csv(url)

### Basic Data Cleaning and Preprocessing

In [2]:
# Check for missing values
housing_data.info()
housing_data.isnull().sum()

# Fill missing values in "total_bedrooms" with the median value
housing_data["total_bedrooms"].fillna(housing_data["total_bedrooms"].median(), inplace=True)

# Encode categorical variable "ocean_proximity" using label encoding
categories = list(housing_data["ocean_proximity"].unique())
categories.sort()
mapping = {cat: i for i, cat in enumerate(categories)}
housing_data["ocean_proximity"] = housing_data["ocean_proximity"].replace(mapping)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  housing_data["total_bedrooms"].fillna(housing_data["total_bedrooms"].median(), inplace=True)
  housing_data["ocean_proximity"] = housing_data["ocean_proximity"].replace(mapping)


### Define Features (X) and Target Variable (y)

In [3]:
# Select features (independent variables)
features = list(housing_data.columns)
features.remove("median_house_value")  # Remove target variable

X = housing_data[features]
y = housing_data["median_house_value"]

# Convert X (features) to NumPy array for efficient calculations
X = np.array(X)
y = np.array(y)

### Data Standardization

In [4]:
# Calculate mean and standard deviation
X_mean = np.mean(X, axis=0)
X_std = np.std(X, axis=0)

# Standardize the data
X = (X - X_mean) / X_std

### Linear Regression

In [5]:
class LinearRegression:
    def __init__(self, learning_rate=0.01, n_iterations=200):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def mean_squared_error(self, y_true, y_pred):
        return np.mean((y_true - y_pred) ** 2)

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        # Gradient Descent
        for _ in range(self.n_iterations):
            y_predicted = np.dot(X, self.weights) + self.bias

            print("MSE: ", self.mean_squared_error(y, y_predicted))

            # Compute gradients
            dw = (1/n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1/n_samples) * np.sum(y_predicted - y)

            # Update parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        return np.dot(X, self.weights) + self.bias

model = LinearRegression()
model.fit(X, y)

MSE:  56104831989.87253
MSE:  55110393275.8855
MSE:  54135905333.35988
MSE:  53180953974.79889
MSE:  52245134567.87385
MSE:  51328051745.705765
MSE:  50429319130.49843
MSE:  49548559069.67764
MSE:  48685402383.752556
MSE:  47839488125.17134
MSE:  47010463347.49608
MSE:  46197982884.26969
MSE:  45401709136.99297
MSE:  44621311871.67066
MSE:  43856468023.42394
MSE:  43106861508.702446
MSE:  42372183044.66087
MSE:  41652129975.29664
MSE:  40946406103.971825
MSE:  40254721531.96986
MSE:  39576792502.76041
MSE:  38912341251.6688
MSE:  38261095860.66657
MSE:  37622790118.01907
MSE:  36997163382.543274
MSE:  36383960452.246
MSE:  35782931437.12716
MSE:  35193831635.947205
MSE:  34616421416.770805
MSE:  34050466101.110806
MSE:  33495735851.507793
MSE:  32952005562.391014
MSE:  32419054754.075775
MSE:  31896667469.761894
MSE:  31384632175.40539
MSE:  30882741662.34398
MSE:  30390792952.563637
MSE:  29908587206.500328
MSE:  29435929633.277115
MSE:  28972629403.282753
MSE:  28518499563.002926
MSE

### Prediction

In [6]:
print("True Value: ", y[-1])

pred = model.predict(X[-1])
print("Predicted Value: ", pred)

True Value:  89400.0
Predicted Value:  88129.5245777892


# Using Scikit-Learn

In [7]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

# Train the model with the data
model.fit(X, y)
# Make predictions
y_predicted = model.predict(X)

from sklearn.metrics import mean_squared_error

print("MSE: ", mean_squared_error(y, y_predicted))

MSE:  4852254699.072868
