In [236]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class LogisticRegression:
    def __init__(self, learning_rate=0.01, max_iterations=1000, tolerance=1e-6, 
                 regularization=None, lambda_reg=0.01):
        """
        
        
        Parameters:
        - learning_rate: the step size for learning in gradient descent
        - max_iterations: Maximum number of training iterations
        - tolerance: Convergence threshold for cost function (where to say that the model has converged)
        - regularization: Type of regularization ('l1' reduce parameters, 'l2' makes weights smaller ,  or None)
        - lambda_reg: Regularization strength (how much it affects the model )
        
        """
        self.learning_rate = learning_rate
        self.max_iterations = max_iterations
        self.tolerance = tolerance
        self.regularization = regularization
        self.lambda_reg = lambda_reg
        self.weights = None
        self.bias = None
        self.cost_history = [] # to track the cost function value over iterations

    def sigmoid(self, z):
        """
        σ(z) = 1 / (1 + e^(-z))
        """
        # Clip z to prevent overflow
        z = np.clip(z, -250, 250)
        return 1 / (1 + np.exp(-z))
    
    def compute_cost(self, y_true, y_pred):
        """
        J(θ) = -1/m * Σ[y*log(h) + (1-y)*log(1-h)] + regularization_term
        """
        m = len(y_true)
        # Add small epsilon to prevent log(0) errors
        epsilon = 1e-15
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        
        # Base logistic loss
        logistic_cost = -1/m * np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
        
        # Add regularization term
        regularization_cost = 0
        #Depends on regularization type
        if self.regularization == 'l1':
            # L1 regularization: λ * Σ|w_i|
            regularization_cost = self.lambda_reg * np.sum(np.abs(self.weights))
        elif self.regularization == 'l2':
            # L2 regularization: λ * Σw_i²
            regularization_cost = self.lambda_reg * np.sum(self.weights ** 2)

        #combine both costs
        total_cost = logistic_cost + regularization_cost
        return total_cost

    #compute the regularization itself
    def compute_regularization_gradients(self):
       
        if self.regularization == 'l1':
            # L1: λ * sign(w)
            return self.lambda_reg * np.sign(self.weights)
        elif self.regularization == 'l2':
            # L2: 2λ * w
            return 2 * self.lambda_reg * self.weights
        else:
            return np.zeros_like(self.weights)
    
    def fit(self, X, y):
       
        # Initialize parameters
        m, n = X.shape
        self.weights = np.zeros(n)
        self.bias = 0
        self.cost_history = []
        
        
        
        # Gradient descent
        for i in range(self.max_iterations):
            # Forward pass
            z = X.dot(self.weights) + self.bias
            y_pred = self.sigmoid(z)
            
            # Compute cost (includes regularization)
            cost = self.compute_cost(y, y_pred)
            self.cost_history.append(cost)
            
            # Compute base gradients
            dw = (1/m) * X.T.dot(y_pred - y)
            db = (1/m) * np.sum(y_pred - y)
            
            # Add regularization gradients to weight gradients
            # no regularization for bias 
            if self.regularization:
                dw += self.compute_regularization_gradients()
                
            
            # Update parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            # Check for convergence if exceeded tolerance
            if i > 0 and abs(self.cost_history[-2] - self.cost_history[-1]) < self.tolerance:
                print(f"Converged after {i+1} iterations")
                break
        
        
    
    def predict_proba(self, X):
        """
        Predict class probabilities
        """
        z = X.dot(self.weights) + self.bias
        return self.sigmoid(z)
    # low threshold for spam detection to avoid important mails classified as spam
    def predict(self, X, threshold=0.3):  
        """
        Make binary predictions
        """
        probabilities = self.predict_proba(X)
        return (probabilities >= threshold).astype(int)
    def classification_report(self, X, y, threshold=0.5):
        """
        Compute precision, recall, and F1-score from scratch.
        """
        predictions = self.predict(X, threshold)
         # first matrix to help with the classification report 
        TP = FP = TN = FN = 0
        for true, pred in zip(y, predictions):
            if true == 1 and pred == 1:
           
                TP += 1
            elif true == 0 and pred == 0:
                TN += 1
            elif true == 0 and pred == 1:
                FP += 1
            elif true == 1 and pred == 0:
                FN += 1

    # Avoid division by zero
    #model accuracy to check overall performance
        accuracy = (TP + TN) / len(y)
        # precision to check for false positives
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        # recall to check for false negatives
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        # f1 score to check balance between precision and recall
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,

        }

    # a function to print the classification report
    def score(self, X, y):

        scorep= self.classification_report(X, y)

        print("Accuracy" , scorep['accuracy'])
        print("Precision" , scorep['precision'] )
        print("Recall" , scorep['recall'])
        print("F1 Score" , scorep['f1_score'])


## Read Data

In [237]:
import pandas as pd
df= pd.read_csv(r"D:\Downloads\MIA\Task3\f1-spam-detection\train.csv")
df.head()

Unnamed: 0,message_id,num_links,num_words,has_offer,sender_score,all_caps,is_spam
0,1,3,98,1,0.718607,0,0
1,2,0,170,0,0.698901,1,0
2,3,0,38,0,0.620466,0,0
3,4,0,116,0,0.701755,0,0
4,5,3,89,1,0.583621,1,1


## From EDA we find no needing for cleaning as our Data has no dublictes or nulls 
also we find that the data is imbalanced 
also there is correlation between num of links, has offer and out target (spam) so it`s better to use f1 score rather than accuracy 

In [238]:
from ydata_profiling import ProfileReport
profile=ProfileReport(df, title = "EDA Report", explorative=True)

profile.to_file("eda_reportC1.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 7/7 [00:00<00:00, 201.54it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

split target from needed cols

In [239]:
x= df[['num_links', 'has_offer', 'num_words', 'all_caps','sender_score']]
y= df['is_spam']

split to train , test , validate 
better validate to fine tune the paramters 

In [240]:
x_f, x_test, y_f, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_f, y_f, test_size=0.2, random_state=42)

In [241]:
# Scale data to make sure there won`t be any error ` 
# Scale only features 
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)


## Hyperparameter Search  by a loop to see which paramters are better 

In [None]:
learning_rates = [0.001, 0.01, 0.1, 1]
max_iterations_list = [500, 1000]
regularizations = [None, "l1", "l2" ]
lambda_regs = [0.001, 0.01, 0.1, 1]

selection_metric = 'f1_score'  # Good for spam detection
best_val_metric = 0
best_model = None
best_params = None

for learning_rate in learning_rates:
    for max_iterations in max_iterations_list:
        for regularization in regularizations:
            for lambda_reg in lambda_regs if regularization else [None]:

                model = LogisticRegression(
                    learning_rate=learning_rate,
                    max_iterations=max_iterations,
                    regularization=regularization,
                    lambda_reg=lambda_reg if lambda_reg is not None else 0.01
                )
                model.fit(x_train_scaled, y_train)

                y_val_pred = model.predict(x_val_scaled)

                # Use accuracy for model selection
                report = model.classification_report(x_val_scaled, y_val)
                val_metric = report[selection_metric]

                if val_metric > best_val_metric:
                    best_val_metric = val_metric
                    best_model = model
                    best_params = {
                        'learning_rate': learning_rate,
                        'max_iterations': max_iterations,
                        'regularization': regularization,
                        'lambda_reg': lambda_reg
                    }


In [243]:

best_model.fit(x_train_scaled, y_train)


Converged after 403 iterations


In [244]:
best_model.classification_report(x_train_scaled, y_train)

{'accuracy': 0.9437990837696335,
 'precision': 0.773224043715847,
 'recall': 0.5206991720331187,
 'f1_score': 0.622319956019791}

In [245]:
best_model.classification_report(x_val_scaled, y_val)

{'accuracy': 0.9358638743455497,
 'precision': 0.780373831775701,
 'recall': 0.5284810126582279,
 'f1_score': 0.6301886792452831}

In [246]:
best_model.classification_report(x_test_scaled, y_test)

{'accuracy': 0.9450261780104712,
 'precision': 0.821256038647343,
 'recall': 0.4956268221574344,
 'f1_score': 0.6181818181818183}

## Load and use model on test file 

In [None]:
df_test= pd.read_csv(r"D:\Downloads\MIA\Task3\f1-spam-detection\test.csv")
x_test = df_test[['num_links', 'has_offer', 'num_words', 'all_caps', 'sender_score']]
x_test_scaled = scaler.transform(x_test)

y_test_pred = best_model.predict(x_test_scaled)

submission = pd.DataFrame({
    "message_id": df_test["message_id"],
    "is_spam": y_test_pred
})
submission.to_csv("submission.csv", index=False)