In [None]:
import numpy as np

class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    The bias is kept as a separate scalar rather than being folded into the
    weight vector.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to every gradient update.
    epochs : int, default 1000
        Number of full passes over the training data.
    threshold : float, default 0.5
        Probability at or above which ``predict`` outputs class 1.

    Attributes
    ----------
    weights : ndarray of shape (num_features, 1), or None before ``fit``
    bias : float, or None before ``fit``
    loss_history : list of float
        Mean binary cross-entropy of each epoch of the most recent ``fit``,
        measured on the predictions made before that epoch's update.
    """

    def __init__(self, learning_rate=0.01, epochs=1000, threshold=0.5):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.threshold = threshold
        self.weights = None
        self.bias = None
        self.loss_history = []

    def sigmoid(self, z):
        """Element-wise logistic function, evaluated without overflow.

        ``exp`` is only ever taken of a non-positive number, so very large
        positive or negative logits cannot overflow.
        """
        z = np.asarray(z, dtype=np.float64)
        decay = np.exp(-np.abs(z))  # always in [0, 1]
        # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same function.
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def calculate_loss(self, y_pred, y_true):
        """Return the mean binary cross-entropy between predictions and labels.

        Both inputs are reshaped to column vectors so that a 1-D label array
        cannot broadcast against (m, 1) predictions into an (m, m) matrix.
        """
        eps = 1e-9  # keeps log() finite when a probability is exactly 0 or 1
        y_pred = np.asarray(y_pred, dtype=np.float64).reshape(-1, 1)
        y_true = np.asarray(y_true, dtype=np.float64).reshape(-1, 1)
        log_likelihood = (
            y_true * np.log(y_pred + eps)
            + (1 - y_true) * np.log(1 - y_pred + eps)
        )
        # Average over samples; summing first and then taking the mean of the
        # resulting scalar would return the total loss instead.
        return -np.mean(log_likelihood)

    def fit(self, X, Y):
        """Learn ``weights`` and ``bias`` from features ``X`` and 0/1 labels ``Y``.

        Parameters
        ----------
        X : array-like of shape (num_samples, num_features)
        Y : array-like with num_samples entries

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from the
            number of labels.
        """
        X = np.asarray(X, dtype=np.float64)
        Y = np.asarray(Y, dtype=np.float64).reshape(-1, 1)

        if X.ndim != 2:
            raise ValueError(f"X must be 2-D, got {X.ndim} dimension(s)")
        num_samples, num_features = X.shape
        if num_samples == 0:
            raise ValueError("X must contain at least one sample")
        if Y.shape[0] != num_samples:
            raise ValueError(
                f"X has {num_samples} samples but Y has {Y.shape[0]} labels"
            )

        self.weights = np.zeros((num_features, 1))
        self.bias = 0.0
        self.loss_history = []  # restart the record on every fit

        for _ in range(self.epochs):
            y_pred = self.sigmoid(X @ self.weights + self.bias)
            self.loss_history.append(float(self.calculate_loss(y_pred, Y)))

            # Gradients of the mean cross-entropy w.r.t. weights and bias.
            dw = (1 / num_samples) * (X.T @ (y_pred - Y))
            db = (1 / num_samples) * np.sum(y_pred - Y)

            self.weights = self.weights - self.learning_rate * dw
            self.bias = self.bias - self.learning_rate * db

        return self

    def predict(self, X):
        """Return 0/1 predictions by thresholding the predicted probabilities."""
        X = np.asarray(X, dtype=np.float64)
        probabilities = self.sigmoid(X @ self.weights + self.bias)
        return (probabilities >= self.threshold).astype(int)

    def score(self, X, Y):
        """Return the fraction of samples whose prediction equals its label."""
        Y = np.asarray(Y, dtype=np.float64).reshape(-1, 1)
        preds = self.predict(X)
        return np.mean(preds == Y)
    
import numpy as np

# Reproducible smoke test on synthetic data: 200 samples with 4 standard-normal
# features, and labels drawn from {0, 1} independently of the features.
np.random.seed(42)
X = np.random.randn(200, 4)
Y = np.random.choice(2, size=200)

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X, Y)

# Only the last expression of a cell is displayed: the predictions for the
# first 20 rows are computed and discarded, the accuracy on them is shown.
model.predict(X[:20])
model.score(X[:20], Y[:20])

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Read the training and test splits; both paths are relative to the working directory.
df, df_test = (
    pd.read_csv(csv_path) for csv_path in ('data_train.csv', 'data_test.csv')
)

In [3]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# Rebinding with errors='ignore' instead of inplace=True keeps the cell safe
# to re-run: a second execution no longer raises KeyError for the column
# that is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,797.0,-200.0,2.1,593.0,146.0,1212.0,72.0,984.0,494.0,10.8,49.7,0.6429
1,1282.0,-200.0,11.0,1013.0,354.0,545.0,141.0,1384.0,1287.0,17.4,50.6,0.9989
2,891.0,-200.0,7.6,882.0,342.0,885.0,149.0,950.0,894.0,7.8,33.9,0.3594
3,1285.0,-200.0,18.1,1243.0,481.0,599.0,173.0,1815.0,1582.0,26.4,41.9,1.4237
4,892.0,-200.0,7.3,869.0,71.0,953.0,77.0,1363.0,632.0,37.4,14.7,0.9295


In [4]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# Rebinding with errors='ignore' instead of inplace=True keeps the cell safe
# to re-run: a second execution no longer raises KeyError for the column
# that is already gone.
df_test = df_test.drop(columns=['Unnamed: 0'], errors='ignore')
df_test.head()

Unnamed: 0,NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,-200.0,8.0,898.0,122.0,933.0,105.0,1594.0,1098.0,17.0,51.7,0.9914
1,-200.0,19.4,1281.0,-200.0,774.0,-200.0,1952.0,1324.0,20.8,43.6,1.0614
2,-200.0,9.9,975.0,349.0,638.0,223.0,1243.0,1064.0,5.6,74.6,0.6826
3,-200.0,12.7,1075.0,103.0,749.0,98.0,1690.0,1022.0,31.7,21.5,0.9902
4,-200.0,2.9,647.0,131.0,1054.0,85.0,962.0,828.0,8.4,54.5,0.6022


In [5]:
# Dimensions (rows, columns) of the training frame.
df.shape

(6250, 12)

In [6]:
# Summarize each training column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6250 entries, 0 to 6249
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PT08.S1(CO)    6173 non-null   float64
 1   NMHC(GT)       6173 non-null   float64
 2   C6H6(GT)       6173 non-null   float64
 3   PT08.S2(NMHC)  6173 non-null   float64
 4   NOx(GT)        6173 non-null   float64
 5   PT08.S3(NOx)   6173 non-null   float64
 6   NO2(GT)        6173 non-null   float64
 7   PT08.S4(NO2)   6173 non-null   float64
 8   PT08.S5(O3)    6173 non-null   float64
 9   T              6173 non-null   float64
 10  RH             6173 non-null   float64
 11  AH             6173 non-null   float64
dtypes: float64(12)
memory usage: 586.1 KB


In [7]:
# Drop every row that has at least one missing value, mutating both frames in place.
# NOTE(review): this also removes rows from the test frame, so later test-set
# predictions will not cover the dropped rows — confirm that is intended.
df.dropna(inplace=True)
df_test.dropna(inplace=True)

In [8]:
# Predictors: the eleven sensor and weather columns used as model inputs.
X_train = df.loc[:, [
    'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)',
    'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
    'T', 'RH', 'AH',
]]
# Target: the PT08.S1(CO) column.
Y_train = df.loc[:, 'PT08.S1(CO)']

In [11]:
# Standardize the features. The scaler's statistics are learned from the
# training features only and then reused unchanged on the test frame.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(df_test)

# Wrap the scaled training array in a frame that carries the original column names.
normalized_features_df = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)

Logistic Regression from Scratch

In [None]:
import numpy as np

class LogisticRegressionGD:
    """Binary logistic regression trained with full-batch gradient descent.

    The intercept is learned as the first entry of ``weights`` by prepending
    a column of ones to the feature matrix.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to every gradient update.
    epochs : int, default 1000
        Maximum number of full passes over the training data.
    tol : float, default 1e-8
        Training stops early once the loss changes by less than this amount
        between two consecutive epochs.
    verbose : bool, default False
        If True, print the loss every 100 epochs and on early stopping.

    Attributes
    ----------
    weights : ndarray of shape (num_features + 1, 1), or None before ``fit``
    loss_history : list of float
        Loss after each epoch of the most recent ``fit`` call.
    """

    def __init__(self, learning_rate=0.01, epochs=1000, tol=1e-8, verbose=False):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.tol = tol
        self.verbose = verbose
        self.weights = None
        self.loss_history = []

    def add_bias(self, X):
        """Return ``X`` with a leading column of ones for the intercept term."""
        return np.hstack((np.ones((X.shape[0], 1)), X))

    def sigmoid(self, z):
        """Element-wise logistic function, evaluated without overflow.

        ``exp`` is only ever taken of a non-positive number, so very large
        positive or negative logits cannot overflow.
        """
        z = np.asarray(z, dtype=np.float64)
        decay = np.exp(-np.abs(z))  # always in [0, 1]
        # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same function.
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def predict_raw(self, X):
        """Return probabilities for an ``X`` that already has the bias column."""
        return self.sigmoid(X @ self.weights)

    def calculate_loss(self, X, Y):
        """Binary Cross-Entropy Loss.

        ``X`` must already contain the bias column. ``Y`` is reshaped to a
        column vector so that 1-D labels cannot broadcast against the (m, 1)
        predictions into an (m, m) matrix.
        """
        Y = np.asarray(Y, dtype=np.float64).reshape(-1, 1)
        m = len(Y)
        h = self.predict_raw(X)
        # Add epsilon to avoid log(0)
        eps = 1e-9
        return float(-(1/m) * np.sum(Y*np.log(h+eps) + (1-Y)*np.log(1-h+eps)))

    def fit(self, X, Y):
        """Learn ``weights`` from features ``X`` and 0/1 labels ``Y``.

        Parameters
        ----------
        X : array-like of shape (num_samples, num_features)
        Y : array-like with num_samples entries

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from the
            number of labels.
        """
        X = np.asarray(X, dtype=np.float64)
        Y = np.asarray(Y, dtype=np.float64).reshape(-1, 1)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D, got {X.ndim} dimension(s)")
        if X.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if Y.shape[0] != X.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} samples but Y has {Y.shape[0]} labels"
            )
        X = self.add_bias(X)
        m, n = X.shape

        self.weights = np.zeros((n, 1))
        # Restart the record so a refit does not append to the previous run.
        self.loss_history = []
        prev_loss = float("inf")

        for epoch in range(self.epochs):
            preds = self.predict_raw(X)

            # Gradient (same formula as Linear Regression, but with sigmoid preds)
            gradients = (X.T @ (preds - Y)) / m
            self.weights -= self.learning_rate * gradients

            # Loss tracking (measured after this epoch's update)
            loss = self.calculate_loss(X, Y)
            self.loss_history.append(loss)

            if abs(prev_loss - loss) < self.tol:
                if self.verbose:
                    print(f"Early stopping at epoch {epoch}, Loss = {loss:.6f}")
                break
            prev_loss = loss

            if self.verbose and epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss = {loss:.6f}")

        return self

    def predict_proba(self, X):
        """Return class-1 probabilities for raw features (bias column added here)."""
        X = np.asarray(X, dtype=np.float64)
        X = self.add_bias(X)
        return self.predict_raw(X)

    def predict(self, X, threshold=0.5):
        """Convert probabilities into 0/1 predictions."""
        return (self.predict_proba(X) >= threshold).astype(int)

    def score(self, X, Y):
        """Accuracy score."""
        Y = np.asarray(Y, dtype=np.float64).reshape(-1, 1)
        preds = self.predict(X)
        return np.mean(preds == Y)

In [16]:
# Binarize the continuous target: 1 where PT08.S1(CO) exceeds 1000, otherwise 0.
binary_labels_train = np.where(Y_train > 1000, 1, 0)
# NOTE(review): rows with NaN were dropped before scaling, so fillna(0) is
# presumably a no-op here — confirm.
x = normalized_features_df.fillna(0).to_numpy()
y = binary_labels_train.ravel()

# verbose=True prints the loss every 100 epochs.
logistic_model = LogisticRegressionGD(learning_rate=0.01,epochs=1000,verbose=True)
# fit() returns the estimator; as the cell's last expression its repr is displayed.
logistic_model.fit(x,y)

Epoch 0, Loss = 0.688659
Epoch 100, Loss = 0.474791
Epoch 200, Loss = 0.403465
Epoch 300, Loss = 0.365755
Epoch 400, Loss = 0.341681
Epoch 500, Loss = 0.324653
Epoch 600, Loss = 0.311821
Epoch 700, Loss = 0.301726
Epoch 800, Loss = 0.293535
Epoch 900, Loss = 0.286733


<__main__.LogisticRegressionGD at 0x13ca2e0d0>

In [17]:
# 0/1 predictions for the scaled training features (default threshold of 0.5).
predictions = logistic_model.predict(normalized_features_df)
print(predictions)

[[0]
 [1]
 [0]
 ...
 [0]
 [0]
 [1]]


Performing inference on the test set with the trained model

In [18]:
# 0/1 predictions for the test features, scaled with the scaler fitted on the training data.
logistic_model_pred = logistic_model.predict(X_test_scaled)
print(logistic_model_pred)

[[1]
 [1]
 [1]
 ...
 [0]
 [0]
 [1]]


Below is an alternative from-scratch implementation of logistic regression that keeps the bias as a separate parameter instead of folding it into the weight vector

In [None]:
import numpy as np

class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    The bias is kept as a separate scalar rather than being folded into the
    weight vector.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to every gradient update.
    epochs : int, default 1000
        Number of full passes over the training data.
    threshold : float, default 0.5
        Probability at or above which ``predict`` outputs class 1.

    Attributes
    ----------
    weights : ndarray of shape (num_features, 1), or None before ``fit``
    bias : float, or None before ``fit``
    loss_history : list of float
        Mean binary cross-entropy of each epoch of the most recent ``fit``,
        measured on the predictions made before that epoch's update.
    """

    def __init__(self, learning_rate=0.01, epochs=1000, threshold=0.5):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.threshold = threshold
        self.weights = None
        self.bias = None
        self.loss_history = []

    def sigmoid(self, z):
        """Element-wise logistic function, evaluated without overflow.

        ``exp`` is only ever taken of a non-positive number, so very large
        positive or negative logits cannot overflow.
        """
        z = np.asarray(z, dtype=np.float64)
        decay = np.exp(-np.abs(z))  # always in [0, 1]
        # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same function.
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def calculate_loss(self, y_pred, y_true):
        """Return the mean binary cross-entropy between predictions and labels.

        Both inputs are reshaped to column vectors so that a 1-D label array
        cannot broadcast against (m, 1) predictions into an (m, m) matrix.
        """
        eps = 1e-9  # keeps log() finite when a probability is exactly 0 or 1
        y_pred = np.asarray(y_pred, dtype=np.float64).reshape(-1, 1)
        y_true = np.asarray(y_true, dtype=np.float64).reshape(-1, 1)
        log_likelihood = (
            y_true * np.log(y_pred + eps)
            + (1 - y_true) * np.log(1 - y_pred + eps)
        )
        # Average over samples; summing first and then taking the mean of the
        # resulting scalar would return the total loss instead.
        return -np.mean(log_likelihood)

    def fit(self, X, Y):
        """Learn ``weights`` and ``bias`` from features ``X`` and 0/1 labels ``Y``.

        Parameters
        ----------
        X : array-like of shape (num_samples, num_features)
        Y : array-like with num_samples entries

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from the
            number of labels.
        """
        X = np.asarray(X, dtype=np.float64)
        Y = np.asarray(Y, dtype=np.float64).reshape(-1, 1)

        if X.ndim != 2:
            raise ValueError(f"X must be 2-D, got {X.ndim} dimension(s)")
        num_samples, num_features = X.shape
        if num_samples == 0:
            raise ValueError("X must contain at least one sample")
        if Y.shape[0] != num_samples:
            raise ValueError(
                f"X has {num_samples} samples but Y has {Y.shape[0]} labels"
            )

        self.weights = np.zeros((num_features, 1))
        self.bias = 0.0
        self.loss_history = []  # restart the record on every fit

        for _ in range(self.epochs):
            y_pred = self.sigmoid(X @ self.weights + self.bias)
            self.loss_history.append(float(self.calculate_loss(y_pred, Y)))

            # Gradients of the mean cross-entropy w.r.t. weights and bias.
            dw = (1 / num_samples) * (X.T @ (y_pred - Y))
            db = (1 / num_samples) * np.sum(y_pred - Y)

            self.weights = self.weights - self.learning_rate * dw
            self.bias = self.bias - self.learning_rate * db

        return self

    def predict(self, X):
        """Return 0/1 predictions by thresholding the predicted probabilities."""
        X = np.asarray(X, dtype=np.float64)
        probabilities = self.sigmoid(X @ self.weights + self.bias)
        return (probabilities >= self.threshold).astype(int)

    def score(self, X, Y):
        """Return the fraction of samples whose prediction equals its label."""
        Y = np.asarray(Y, dtype=np.float64).reshape(-1, 1)
        preds = self.predict(X)
        return np.mean(preds == Y)
    
import numpy as np

# Reproducible smoke test on synthetic data: 200 samples with 4 standard-normal
# features, and labels drawn from {0, 1} independently of the features.
np.random.seed(42)
X = np.random.randn(200, 4)
Y = np.random.choice(2, size=200)

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X, Y)

# Only the last expression of a cell is displayed: the predictions for the
# first 20 rows are computed and discarded, the accuracy on them is shown.
model.predict(X[:20])
model.score(X[:20], Y[:20])

In [16]:
import numpy as np

# Seeded synthetic check: 200 rows of 4 standard-normal features with labels
# sampled from {0, 1} independently of those features.
np.random.seed(42)
X = np.random.randn(200, 4)
Y = np.random.choice(2, size=200)

# Construction and training are chained because fit() returns the estimator.
model = LogisticRegression().fit(X, Y)

# The prediction call's result is discarded; the cell displays only the
# accuracy on the first 20 training rows.
model.predict(X[:20])
model.score(X[:20], Y[:20])

np.float64(0.3)