In [None]:
import numpy as np

class Linear_Regression:
    def __init__(self,learning_rate=0.01,epochs=100):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None
        self.bias = None

    def fit(self,X,Y):
        X = np.asarray(X,dtype=np.float64) 
        Y = np.asarray(Y,dtype=np.float64)

        m,n = X.shape
        self.weights = np.zeros((n,1))
        self.bias = 0

        for epoch in range(self.epochs):
            y_pred = X @ self.weights + self.bias     #(m,1)

            dw = (1/m)*(X.T @ (y_pred - Y))
            db = (1/m)*np.sum(y_pred - Y)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            self.calculate_loss(Y,y_pred)

        return self    

    def calculate_loss(self,Y,y_pred):
        return np.mean((Y - y_pred)**2)

    def predict(self,X):
        X = np.asarray(X,dtype=np.float64)
        y_pred = X @ self.weights + self.bias
        return y_pred
    
np.random.seed(42)
X = np.random.randn(200,4)
Y = np.random.randn(200,1)

model = Linear_Regression()
model.fit(X,Y)

print(model.predict(X[:10]))
print("-------")
print(Y[:10])

Cell 0 — Title & Plan (optional but nice)

Briefly list: objective, loss (MSE), optimizer (batch GD), regularization (L2 optional), metrics (MSE/RMSE/R²), early stopping.

Say out loud in interview: “I’ll implement linear regression with MSE + L2, trained by batch gradient descent, vectorized with NumPy.”

Cell 1 — Imports, Reproducibility, Dtypes

Import NumPy only (and matplotlib if you want a loss plot later).

Set a random seed.

Decide float64 everywhere (stability).

Note: You will avoid Python loops in the core math—use @ (matmul).

Checks

Print NumPy version (optional).

Explain: “I’ll stick to vectorized ops to keep it O(nd) per epoch.”

Linear Regression

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

A] Data Preprocessing

In [3]:
df = pd.read_csv('data_train.csv')
df_test = pd.read_csv('data_test.csv')

In [4]:
df.drop(columns=['Unnamed: 0'], inplace=True) 
df.head()

Unnamed: 0,PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,797.0,-200.0,2.1,593.0,146.0,1212.0,72.0,984.0,494.0,10.8,49.7,0.6429
1,1282.0,-200.0,11.0,1013.0,354.0,545.0,141.0,1384.0,1287.0,17.4,50.6,0.9989
2,891.0,-200.0,7.6,882.0,342.0,885.0,149.0,950.0,894.0,7.8,33.9,0.3594
3,1285.0,-200.0,18.1,1243.0,481.0,599.0,173.0,1815.0,1582.0,26.4,41.9,1.4237
4,892.0,-200.0,7.3,869.0,71.0,953.0,77.0,1363.0,632.0,37.4,14.7,0.9295


In [5]:
df_test.drop(columns=['Unnamed: 0'], inplace=True)
df_test.head()

Unnamed: 0,NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,-200.0,8.0,898.0,122.0,933.0,105.0,1594.0,1098.0,17.0,51.7,0.9914
1,-200.0,19.4,1281.0,-200.0,774.0,-200.0,1952.0,1324.0,20.8,43.6,1.0614
2,-200.0,9.9,975.0,349.0,638.0,223.0,1243.0,1064.0,5.6,74.6,0.6826
3,-200.0,12.7,1075.0,103.0,749.0,98.0,1690.0,1022.0,31.7,21.5,0.9902
4,-200.0,2.9,647.0,131.0,1054.0,85.0,962.0,828.0,8.4,54.5,0.6022


In [6]:
df.shape

(6250, 12)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6250 entries, 0 to 6249
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PT08.S1(CO)    6173 non-null   float64
 1   NMHC(GT)       6173 non-null   float64
 2   C6H6(GT)       6173 non-null   float64
 3   PT08.S2(NMHC)  6173 non-null   float64
 4   NOx(GT)        6173 non-null   float64
 5   PT08.S3(NOx)   6173 non-null   float64
 6   NO2(GT)        6173 non-null   float64
 7   PT08.S4(NO2)   6173 non-null   float64
 8   PT08.S5(O3)    6173 non-null   float64
 9   T              6173 non-null   float64
 10  RH             6173 non-null   float64
 11  AH             6173 non-null   float64
dtypes: float64(12)
memory usage: 586.1 KB


In [8]:
df.isnull().sum()

PT08.S1(CO)      77
NMHC(GT)         77
C6H6(GT)         77
PT08.S2(NMHC)    77
NOx(GT)          77
PT08.S3(NOx)     77
NO2(GT)          77
PT08.S4(NO2)     77
PT08.S5(O3)      77
T                77
RH               77
AH               77
dtype: int64

In [10]:
df.dropna(inplace=True)
df_test.dropna(inplace=True)

In [11]:
df.columns

Index(['PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)',
       'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH',
       'AH'],
      dtype='object')

In [12]:
df_test.columns

Index(['NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)',
       'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH'],
      dtype='object')

In [13]:
df.count

<bound method DataFrame.count of       PT08.S1(CO)  NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)  NOx(GT)  PT08.S3(NOx)  \
0           797.0    -200.0       2.1          593.0    146.0        1212.0   
1          1282.0    -200.0      11.0         1013.0    354.0         545.0   
2           891.0    -200.0       7.6          882.0    342.0         885.0   
3          1285.0    -200.0      18.1         1243.0    481.0         599.0   
4           892.0    -200.0       7.3          869.0     71.0         953.0   
...           ...       ...       ...            ...      ...           ...   
6244        918.0    -200.0       4.5          737.0    220.0        1007.0   
6245        824.0    -200.0       2.2          602.0     54.0        1138.0   
6246       1003.0    -200.0       3.7          694.0    156.0         876.0   
6247        894.0    -200.0       4.0          709.0     46.0         997.0   
6248       1213.0    -200.0      21.6         1341.0    210.0         683.0   

      NO2(GT)  PT0

In [14]:
df_test.count

<bound method DataFrame.count of       NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)  NOx(GT)  PT08.S3(NOx)  NO2(GT)  \
0       -200.0       8.0          898.0    122.0         933.0    105.0   
1       -200.0      19.4         1281.0   -200.0         774.0   -200.0   
2       -200.0       9.9          975.0    349.0         638.0    223.0   
3       -200.0      12.7         1075.0    103.0         749.0     98.0   
4       -200.0       2.9          647.0    131.0        1054.0     85.0   
...        ...       ...            ...      ...           ...      ...   
3216    -200.0      12.5         1068.0    171.0         899.0    139.0   
3217    -200.0       9.6          964.0   -200.0         953.0   -200.0   
3218    -200.0       1.2          522.0     61.0        1242.0     55.0   
3219    -200.0       8.7          927.0   -200.0         750.0   -200.0   
3220    -200.0      10.7         1004.0     90.0         724.0    114.0   

      PT08.S4(NO2)  PT08.S5(O3)     T    RH      AH  
0           

Extracting the Features and Labels

In [24]:
X_train = df[['NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)',
                  'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
                  'T', 'RH', 'AH']]
Y_train = df['PT08.S1(CO)']                  

Normalizing the data from scratch.

In [25]:
# NumPy is used for fast array operations and vectorization

class StandardScalerFromScratch:
    def __init__(self, epsilon=1e-8):
        """
        Constructor for the StandardScalerFromScratch class.
        
        Parameters:
        ----------
        epsilon : float
            A small constant added to the denominator during scaling to prevent
            division by zero (important when a feature has zero variance).
            
        Attributes initialized:
        -----------------------
        self.mean_ : stores feature-wise means (computed during fit).
        self.std_  : stores feature-wise standard deviations (computed during fit).
        """
        self.mean_ = None
        self.std_ = None
        self.epsilon = epsilon   # helps avoid numerical instability
    
    def fit(self, X):
        """
        Compute the mean and standard deviation for each feature (column).
        These values will later be used to scale the data.
        
        Parameters:
        ----------
        X : np.ndarray
            Input training data of shape (n_samples, n_features).
            
        Returns:
        --------
        self : object
            Returns the scaler itself, enabling method chaining.
        """
        # Convert to NumPy array (ensures consistency even if input is list/pandas DataFrame)
        X = np.asarray(X, dtype=np.float64)
        
        # Error handling: input must be 2D (samples, features)
        if X.ndim != 2:
            raise ValueError("Input data must be 2D of shape (n_samples, n_features)")
        
        # Store column-wise mean and std (population std with ddof=0)
        self.mean_ = X.mean(axis=0)
        self.std_ = X.std(axis=0, ddof=0)
        
        return self   # Returning self allows calls like scaler.fit(X).transform(X)
    
    def transform(self, X):
        """
        Scale the dataset using the mean and std computed in fit().
        
        Parameters:
        ----------
        X : np.ndarray
            Input data of shape (n_samples, n_features).
            
        Returns:
        --------
        X_scaled : np.ndarray
            Standardized data where each feature has mean≈0 and std≈1.
        """
        # Check that fit() has been called before transform()
        if self.mean_ is None or self.std_ is None:
            raise ValueError("Scaler has not been fitted yet. Call `fit` first.")
        
        # Convert to NumPy array
        X = np.asarray(X, dtype=np.float64)
        
        # Error handling: dimensions must match fitted data
        if X.shape[1] != self.mean_.shape[0]:
            raise ValueError("Shape mismatch: input data must have the same number of features as training data")
        
        # Apply scaling: (X - mean) / (std + epsilon)
        return (X - self.mean_) / (self.std_ + self.epsilon)
    
    def fit_transform(self, X):
        """
        Convenience method: fits to data, then transforms it.
        
        Parameters:
        ----------
        X : np.ndarray
            Input data of shape (n_samples, n_features).
            
        Returns:
        --------
        X_scaled : np.ndarray
            Standardized version of input.
        """
        return self.fit(X).transform(X)  # Method chaining
    
    def inverse_transform(self, X_scaled):
        """
        Convert scaled data back to original representation.
        
        Parameters:
        ----------
        X_scaled : np.ndarray
            Standardized data of shape (n_samples, n_features).
            
        Returns:
        --------
        X_original : np.ndarray
            Data in the original scale (before normalization).
        """
        if self.mean_ is None or self.std_ is None:
            raise ValueError("Scaler has not been fitted yet. Call `fit` first.")
        
        X_scaled = np.asarray(X_scaled, dtype=np.float64)
        
        # Inverse operation: X = X_scaled * std + mean
        return X_scaled * (self.std_ + self.epsilon) + self.mean_

🚦 Key Things to Remember (Interview POV)

Why normalize?

To ensure features are on a similar scale, especially for algorithms using gradient descent or distance-based methods (k-NN, SVM, k-Means).

Prevents features with larger magnitudes from dominating updates.

fit vs transform vs fit_transform

fit() → calculates parameters (mean, std) from training data only.

transform() → applies the scaling using those parameters.

fit_transform() → shortcut = fit() + transform() on the same data.

Rule: Always .fit() on train and .transform() both train & test with same params (avoid data leakage).

Why epsilon?

Prevents division by zero if a column has constant values (std = 0).

OOP best practice:

Attributes (mean_, std_) are saved inside the object, so you can call transform() multiple times consistently.

Error handling ensures user doesn’t misuse the API.

Complexity

Fit: O(n·d)

Transform: O(n·d)

Memory: O(d) (stores mean & std only)

❓ Interviewer Question Bank (Exhaustive)
Conceptual

Why do we normalize data before training ML models?

Which models benefit the most from standardization? (hint: GD-based, distance-based).

Difference between standardization and min-max scaling.

Why should we only fit on training data and not test?

What happens if a feature has zero variance?

How does normalization affect gradient descent convergence?

When might you not want to standardize? (e.g., tree-based models).

Compare z-score normalization vs min-max scaling in terms of outlier sensitivity.

Implementation-focused

Explain what fit(), transform(), and fit_transform() do.

Why do we add epsilon in the denominator?

Show how you’d implement an inverse_transform().

What errors could occur if you call transform() before fit()?

How do you check for shape mismatches between train/test?

Edge Cases

What happens if your dataset has missing values (NaNs)?

If all values in a column are identical, what does standardization yield?

How do you handle categorical features during normalization?

If you scale test data with its own mean and std, what’s the issue?

Extensions

Can you extend this to include partial_fit() for streaming data?

How would you adapt this for sparse matrices?

Can you make this class handle both float32 and float64 for efficiency?

What is Data Leakage?

Data leakage happens when information from outside the training dataset (e.g., from the validation or test set, or from the future) sneaks into the model during training.
This causes your model to look artificially good during training/validation but fail badly on unseen real-world data.

📌 Types of Data Leakage
1. Preprocessing leakage

Example: You scale your entire dataset (fit scaler on train+test combined) before splitting into train/test.

The mean/std of test features “leaked” into training.

Your model indirectly “saw” test data distribution.

👉 Fix: Always fit preprocessing (e.g., normalization, imputation, encoding) only on training, then transform train/test with the same params.

2. Feature leakage

Example: A feature contains info that wouldn’t be available at prediction time.

Predicting hospital readmission using “days until readmission” as a feature.

Predicting churn using “account closure date” as a feature.

Predicting loan default while including “loan repayment status” in input.

👉 Fix: Only include features available before prediction time.

3. Temporal leakage (time-series leakage)

Example: Using future data to predict the past.

Predicting stock price for Jan using features from Feb.

Training with shuffled data in a time-series task.

👉 Fix: Always split train/validation/test respecting time order.

4. Target leakage

Example: Features are too closely related to the target.

Predicting “whether someone will default” using “credit score after default event.”

Predicting “will a transaction be fraudulent?” using “is_fraud” column disguised.

👉 Fix: Remove features that encode or strongly correlate with the label itself.

🎯 Interview-Style Answer (30s version)

“Data leakage occurs when information that shouldn’t be available at training time — such as test data statistics, future information, or target-related features — leaks into the training process. This makes the model perform unrealistically well in training but fail in production. Common examples are normalizing with test data, including post-outcome features, or using future values in time-series. The fix is to strictly separate train/test and only fit preprocessing steps on the training set.”

In [48]:
scaler = StandardScalerFromScratch()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(df_test) 

normalized_features_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)

***You need to explain the choice of the type of Feature scaling you did.***


Why did you use Z-Score scaling? Were there outliers in the data? Which type of feature scaling is sensitive to outliers?

If after transformation, still there exists outliers or skewness, then consider performing log transformation.

In [49]:
mean = normalized_features_df.mean()
std = normalized_features_df.std()

print(mean)
print(std)

NMHC(GT)         7.481820e-18
C6H6(GT)        -4.604197e-18
PT08.S2(NMHC)   -1.234500e-16
NOx(GT)         -4.604197e-18
PT08.S3(NOx)    -6.100561e-17
NO2(GT)         -5.064617e-17
PT08.S4(NO2)    -1.945273e-16
PT08.S5(O3)      9.208394e-17
T                2.762518e-17
RH              -3.683358e-17
AH              -1.726574e-18
dtype: float64
NMHC(GT)         1.000081
C6H6(GT)         1.000081
PT08.S2(NMHC)    1.000081
NOx(GT)          1.000081
PT08.S3(NOx)     1.000081
NO2(GT)          1.000081
PT08.S4(NO2)     1.000081
PT08.S5(O3)      1.000081
T                1.000081
RH               1.000081
AH               1.000081
dtype: float64


After transformation we can see that mean is very close to zero for all features , and the standard deviation is close to one for all features. So transformations can be considered successFul.

Implementing the Linear Regression from Scratch

In [None]:
class LinearRegressionGD:
    # This is the Constructor - the purpose is to initialize the hyperparameters
    def __init__(self, learning_rate=0.01, epochs=1000, tol=1e-8, verbose=False):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.tol = tol
        self.verbose = verbose
        self.weights = None
        self.loss_history = []

    def add_bias(self, X):
        """Add bias column (intercept term)."""
        return np.hstack((np.ones((X.shape[0], 1)), X))

    def predict_raw(self, X):
        """Return raw predictions (X @ weights)."""
        return X @ self.weights

    def calculate_loss(self, X, Y):
        """Mean Squared Error (MSE)."""
        m = len(Y)
        residuals = self.predict_raw(X) - Y
        return float((residuals.T @ residuals)/(2*m))

    def fit(self, X, Y):
        """Train using Gradient Descent."""
        X = np.asarray(X, dtype=np.float64)
        Y = np.asarray(Y, dtype=np.float64).reshape(-1, 1)

        X = self.add_bias(X)
        m, n = X.shape

        # Initialize weights
        self.weights = np.zeros((n, 1))

        prev_loss = float("inf")
        for epoch in range(self.epochs):
            # Forward Pass
            preds = self.predict_raw(X)

            # Backward pass
            # Gradient of MSE
            gradients = (X.T @ (preds - Y)) / m    # Formula is actually (-1/m) X.T @ (Y - preds)

            # Weight update
            self.weights -= self.learning_rate * gradients

            # Compute and store loss
            loss = self.calculate_loss(X, Y)
            self.loss_history.append(loss)

            # Early stopping
            if abs(prev_loss - loss) < self.tol:
                if self.verbose:
                    print(f"Early stopping at epoch {epoch}, Loss = {loss:.6f}")
                break
            prev_loss = loss

            if self.verbose and epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss = {loss:.6f}")

        return self

    def predict(self, X):
        """Make predictions on new data."""
        X = np.asarray(X, dtype=np.float64)
        X = self.add_bias(X)
        return self.predict_raw(X)

    def score(self, X, Y):
        """Compute R² score."""
        Y = np.asarray(Y, dtype=np.float64).reshape(-1, 1)
        preds = self.predict(X)
        ss_res = np.sum((Y - preds) ** 2)
        ss_tot = np.sum((Y - np.mean(Y)) ** 2)
        return 1 - ss_res / ss_tot

In [86]:
model = LinearRegressionGD(learning_rate=0.01, epochs=1000, verbose=True)
model.fit(X_train_scaled,Y_train)

Epoch 0, Loss = 1185686.634669
Epoch 100, Loss = 151807.749989
Epoch 200, Loss = 25114.229218
Epoch 300, Loss = 8037.675146
Epoch 400, Loss = 5681.014327
Epoch 500, Loss = 5322.204312
Epoch 600, Loss = 5246.671220
Epoch 700, Loss = 5218.563286
Epoch 800, Loss = 5202.607675
Epoch 900, Loss = 5191.876222


  return float((residuals.T @ residuals) / m)


<__main__.LinearRegressionGD at 0x123b43230>

In [87]:
print("Final Cost:", model.loss_history[-1])

Final Cost: 5184.180865057258


In [88]:
print("Final parameters:", model.weights)

Final parameters: [[1053.09206916]
 [  33.21747102]
 [  51.6147512 ]
 [  75.1864726 ]
 [  23.21228718]
 [ -33.55860488]
 [  -8.05796003]
 [  28.42022405]
 [  97.32111494]
 [  16.12220203]
 [  54.30949035]
 [  37.48752197]]


In [89]:
predictions = model.predict(X_test_scaled)
print(predictions)

[[1086.94279247]
 [1254.9880515 ]
 [1141.17789745]
 ...
 [ 822.92137051]
 [1016.61612681]
 [1084.35051704]]


In [1]:
# Single Forward and Backward Pass
import numpy as np

# Example data
np.random.seed(42)
X = np.random.randn(5, 3)          # 5 samples, 3 features
y = np.random.randn(5, 1)          # targets
w = np.zeros((X.shape[1], 1))      # initialize weights (3x1)
alpha = 0.01                       # learning rate
m = X.shape[0]                     # number of samples

# ---------- Forward Pass ----------
preds = X @ w                      # predictions (5x1)
residuals = preds - y              # error (5x1)
loss = (residuals.T @ residuals) / m   # Mean Squared Error (scalar)

# ---------- Backward Pass ----------
gradients = (X.T @ residuals) / m  # gradient wrt weights (3x1)
w = w - alpha * gradients          # weight update

print("Loss:", float(loss))
print("Updated weights:\n", w)

Loss: 0.8519709992865122
Updated weights:
 [[-0.00431997]
 [ 0.00735799]
 [ 0.00516885]]


  print("Loss:", float(loss))


In [None]:
# This is a better code from scratch.
import numpy as np


class LinearRegression:
    def __init__(self, lr: int = 0.01, n_iters: int = 1000) -> None:
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        num_samples, num_features = X.shape     # X shape [N, f]
        self.weights = np.random.rand(num_features)  # W shape [f, 1]
        self.bias = 0

        for i in range(self.n_iters):

            # y_pred shape should be N, 1
            y_pred = np.dot(X, self.weights) + self.bias

            # X -> [N,f]
            # y_pred -> [N]
            # dw -> [f]
            dw = (1 / num_samples) * np.dot(X.T, y_pred - y)
            db = (1 / num_samples) * np.sum(y_pred - y)

            self.weights = self.weights - self.lr * dw
            self.bias = self.bias - self.lr * db

        return self

    def predict(self, X):
        return np.dot(X, self.weights) + self.bias

In [None]:
import numpy as np

class Linear_Regression:
    def __init__(self,learning_rate=0.01,epochs=100):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None
        self.bias = None

    def fit(self,X,Y):
        X = np.asarray(X,dtype=np.float64) 
        Y = np.asarray(Y,dtype=np.float64)

        m,n = X.shape
        self.weights = np.zeros((n,1))
        self.bias = 0

        for epoch in range(self.epochs):
            y_pred = X @ self.weights + self.bias     #(m,1)

            dw = (1/m)*(X.T @ (y_pred - Y))
            db = (1/m)*np.sum(y_pred - Y)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            self.calculate_loss(Y,y_pred)

        return self    

    def calculate_loss(self,Y,y_pred):
        return np.mean((Y - y_pred)**2)

    def predict(self,X):
        X = np.asarray(X,dtype=np.float64)
        y_pred = X @ self.weights + self.bias
        return y_pred
    
np.random.seed(42)
X = np.random.randn(200,4)
Y = np.random.randn(200,1)

model = Linear_Regression()
model.fit(X,Y)

print(model.predict(X[:10]))
print("-------")
print(Y[:10])

[[0.0996359 ]
 [0.07937433]
 [0.04265128]
 [0.18518063]
 [0.03827195]
 [0.14940005]
 [0.05840756]
 [0.0690662 ]
 [0.13405166]
 [0.182802  ]]
-------
[[ 0.93828381]
 [-0.51604473]
 [ 0.09612078]
 [-0.46227529]
 [-0.43449623]
 [-0.30917212]
 [ 0.22213377]
 [-0.47874862]
 [ 1.25575613]
 [-0.8946073 ]]
