# Implementation of the Logistic Regression Algorithm

In [1]:
import pandas as pd
import numpy as np

df= pd.read_csv('../day5_titanic/train.csv')

In [4]:
df.head(2)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [5]:
y=df['Survived']
X=df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']]

### Question: Standardize first bring numerical features on 0-1 scale or convert categorical to 0,1  

#### Question Follow up: What does standard scaler do?

You’re absolutely right:
👉 You should apply StandardScaler only to your original numeric features,
👉 and not to one-hot encoded categorical columns (from pd.get_dummies).


✅ Why we don’t scale one-hot encoded columns

One-hot columns are already 0s and 1s.
Scaling them would destroy their meaning — you’d end up with non-binary values that no longer represent categories.
The goal of scaling is to make numeric features comparable, not to modify encoded categories.
Standard scaler is (X-mu)/(std_dev)

In [8]:
X_numeric = X[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
X_categorical = X[['Sex','Embarked']]
X_neither = X['Cabin']

In [33]:
X_numeric.isna().sum()

Pclass      0
Age       177
SibSp       0
Parch       0
Fare        0
dtype: int64

In [13]:
X_onehot = pd.get_dummies(X_categorical,['Sex','Embarked']).astype(int)

In [14]:
X_onehot.head(3)

Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,0,1
1,1,0,1,0,0
2,1,0,0,0,1


In [20]:
X_cabin_nonna = X['Cabin'].apply(lambda x: 0 if pd.isna(x) else 1)

In [21]:
X_cabin_nonna.head(5)

0    0
1    1
2    0
3    1
4    0
Name: Cabin, dtype: int64

### Do train test split before applying standard scaler to avoid data leakage

In [23]:
X_encoded0 = pd.merge(X_numeric, X_cabin_nonna,  left_index=True, right_index=True)
X_encoded1 = pd.merge(X_encoded0, X_onehot,  left_index=True, right_index=True)

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded1, y, test_size=0.2, random_state=42, stratify=y
)


In [35]:
print(type(X_train))

<class 'pandas.core.frame.DataFrame'>


In [36]:
from sklearn.impute import SimpleImputer

# Imputer for numeric columns (e.g., Age)
imputer = SimpleImputer(strategy='median')  # or 'mean' if you prefer

# Fit on train, transform train
X_train['Age'] = imputer.fit_transform(X_train[['Age']])

# Transform test using same statistics
X_test['Age'] = imputer.transform(X_test[['Age']])


In [37]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [38]:
X_train_scaled[5,:]

array([ 0.82956755, -0.6573438 , -0.46508428, -0.46618317, -0.48696219,
       -0.5383819 , -0.74242727,  0.74242727, -0.49252705, -0.28933346,
        0.61631563])

### Logisitc Regression from Sklearn 

In [39]:
# Assuming you already have:
# X_train_scaled, X_test_scaled, y_train, y_test

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1️⃣ Create and train the model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)

# 2️⃣ Make predictions
y_pred = model.predict(X_test_scaled)

# 3️⃣ Evaluate performance
print("✅ Logistic Regression Results")
print("--------------------------------")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


✅ Logistic Regression Results
--------------------------------
Accuracy: 0.8156

Confusion Matrix:
[[97 13]
 [20 49]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       110
           1       0.79      0.71      0.75        69

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.82      0.81       179



### Implementation of algorithm from scratch

#### Given X_train, t_train, X_test, y_test
#### Hyperparameter alpha- the learning rate, threshold -delta beyond which we can stop training
Derive the cost function analytically, solve the local optima which is also the global optima
Compute gradient of cost function

## Cost function: -yi log yi_hat - (1 - yi) log( 1- yi_hat)
The yis are not identical the ones outside the log function are ground truth
The ones inside are 1/(1 + e^ (-theta T X))


![Alt text](./logistic.png)


In [48]:
import numpy as np

# ----------------------------
# Helper functions
# ----------------------------

def sigmoid(z):
    """Sigmoid activation function."""
    return 1 / (1 + np.exp(-z))

def compute_cost(y_true, y_pred):
    """Binary cross-entropy / log loss."""
    m = len(y_true)
    # To avoid log(0), clip predictions
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    cost = - (1/m) * np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return cost

def predict_probs(X, w, b):
    """Compute predicted probabilities."""
    return sigmoid(np.dot(X, w) + b)

def predict_classes(X, w, b, threshold=0.5):
    """Predict class labels (0 or 1) given threshold."""
    probs = predict_probs(X, w, b)
    return (probs >= threshold).astype(int)

# ----------------------------
# Logistic Regression from scratch
# ----------------------------

def logistic_regression_train(X_train, y_train, alpha=0.01, threshold=1e-6, max_iter=20000):
    """
    Train logistic regression using gradient descent.
    
    Parameters:
    X_train: np.array, shape (m, n)
    y_train: np.array, shape (m,)
    alpha: learning rate
    threshold: minimum change in cost for stopping
    max_iter: maximum iterations
    
    Returns:
    w, b : trained weights and bias
    """
    m, n = X_train.shape
    # Initialize weights and bias
    w = np.random.randn(n) * 0.01
    b = 0.0
    
    prev_cost = float('inf')
    
    for i in range(max_iter):
        # 1️⃣ Forward pass
        y_pred = predict_probs(X_train, w, b)
        
        # 2️⃣ Compute gradients
        dw = (1/m) * np.dot(X_train.T, (y_pred - y_train))
        db = (1/m) * np.sum(y_pred - y_train)
        
        # 3️⃣ Update weights and bias
        w -= alpha * dw
        b -= alpha * db
        
        # 4️⃣ Compute cost and check stopping criterion
        cost = compute_cost(y_train, y_pred)
        if abs(prev_cost - cost) < threshold:
            print(f"Converged at iteration {i}, cost={cost:.6f}")
            break
        prev_cost = cost
        
        # Optional: print every 1000 iterations
        if i % 1000 == 0:
            print(f"Iteration {i}, cost={cost:.6f}")
    
    return w, b

# ----------------------------
# Train the model
# ----------------------------
w, b = logistic_regression_train(X_train_scaled, y_train, alpha=0.01, threshold=1e-7)

# ----------------------------
# Predict on test data
# ----------------------------
y_pred_test = predict_classes(X_test_scaled, w, b)


# Optional: Evaluate accuracy
accuracy = np.mean(y_pred_test == y_test)
print(f"Test accuracy: {accuracy:.4f}")




Iteration 0, cost=0.699835
Iteration 1000, cost=0.437621
Iteration 2000, cost=0.432054
Iteration 3000, cost=0.431007
Iteration 4000, cost=0.430683
Converged at iteration 4765, cost=0.430575
Test accuracy: 0.8045
