# Import Libraries

In [1]:
import numpy as np

# Logistic Regression with TF-IDF Feature Extraction

In this notebook we implement a logistic regression binary classifier performing feature extraction using TF-IDF
For the logistic regression model we implemented the options for L1 and L2 regularization (though not yet elastic-net)

In [2]:
class LogisticRegression:
    """
    Logistic Regression classifier.

    Parameters:
    -----------
    learning_rate: Learning rate for gradient descent.
    n_iters: Number of iterations for training the model.
    regularization: 'L1', 'L2', or None.
    strength: Regularization strength.
    """
    def __init__(self, learning_rate=0.01, n_iters=1000, regularization=None, strength=0.01):
        self.learning_rate = learning_rate
        self.n_iters = n_iters
        self.regularization = regularization
        self.strength = strength
        self.weights = None
        self.bias = None

    def sigmoid(self, array):
        # Sigmoid function
        return 1 / (1 + np.exp(-array))

    def loss(self, y_true, y_pred):
        # Cross-entropy loss with optional regularization
        e = 1e-15  # Prevent log(0)
        y_pred = np.clip(y_pred, e, 1 - e)
        n_samples = y_true.shape[0]
        # Cross-entropy loss
        loss = - (1 / n_samples) *  np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
        # Regularization
        if self.regularization is not None and self.weights is not None:
            if self.regularization == 'L1':
                # L1 regularization
                loss += (self.strength / n_samples) * np.sum(np.abs(self.weights))
            elif self.regularization == 'L2':
                # L2 regularization
                loss += (self.strength / (2 * n_samples)) * np.sum(self.weights ** 2)
        return loss

    def fit(self, X, y):
        """
        Fit the logistic regression model.

        Parameters:
        - X: Features (NumPy array of shape (n_samples, n_features)).
        - y: Labels (NumPy array of shape (n_samples,)).
        """
        n_samples, n_features = X.shape

        # Initialize weights and bias
        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for i in range(self.n_iters):
            # Linear model
            linear_model = np.dot(X, self.weights) + self.bias
            # Predictions
            y_pred = self.sigmoid(linear_model)

            # Compute gradients
            dw = (1 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / n_samples) * np.sum(y_pred - y)

            # Regularization
            if self.regularization == 'L1':
                dw += (self.strength / n_samples) * np.sign(self.weights)
            elif self.regularization == 'L2':
                dw += (self.strength / n_samples) * self.weights

            # Update weights and bias
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            # Print loss every 100 iterations
            if i % 100 == 0:
                current_loss = self.loss(y, y_pred)
                print(f'Loss after iteration {i}: {current_loss}')

    def probability(self, X):
        # Predict the probability of the input X
        linear_model = np.dot(X, self.weights) + self.bias
        return self.sigmoid(linear_model)

    def predict(self, X, decision_boundary=0.5):
        # Predict the class labels
        prob = self.probability(X)
        return (prob >= decision_boundary).astype(int)


# TF, IDF, Macro F1, Train and Test Splitting

In [3]:
def tf(term_counts):
    """
    Compute term frequency (TF).

    Parameters:
    - term_counts: NumPy array of shape (n_samples, n_features)

    Returns:
    - TF matrix.
    """
    # Sum of terms in each document
    doc_sum = np.sum(term_counts, axis=1, keepdims=True)
    # Avoid division by zero
    doc_sum[doc_sum == 0] = 1
    # Compute TF
    return term_counts / doc_sum

def idf(term_counts):
    """
    Compute inverse document frequency (IDF).

    Parameters:
    - term_counts: NumPy array of shape (n_samples, n_features)

    Returns:
    - IDF vector.
    """
    n_samples = term_counts.shape[0]
    # Document frequency: number of documents that contain each term
    df = np.count_nonzero(term_counts > 0, axis=0)
    # Avoid division by zero
    df[df == 0] = 1
    # Compute IDF
    return np.log(n_samples / df)

def tf_idf(idf_values, tf_values):
    """
    Compute TF-IDF matrix.

    Parameters:
    - idf_values: NumPy array of shape (n_features,)
    - tf_values: NumPy array of shape (n_samples, n_features)

    Returns:
    - TF-IDF matrix.
    """
    return tf_values * idf_values

def train_test_split(X, y, test_size=0.2, random_state=None):
    """
    Split arrays or matrices into random train and test subsets.

    Parameters:
    - X: Features.
    - y: Labels.
    - test_size: Proportion of the dataset to include in the test split.
    - random_state: Seed for reproducibility.

    Returns:
    - X_train, X_test, y_train, y_test
    """
    if random_state is not None:
        np.random.seed(random_state)
    # Shuffle data
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X_shuffled = X[indices]
    y_shuffled = y[indices]
    # Split the data
    split_idx = int(X.shape[0] * (1 - test_size))
    X_train = X_shuffled[:split_idx]
    X_test = X_shuffled[split_idx:]
    y_train = y_shuffled[:split_idx]
    y_test = y_shuffled[split_idx:]
    return X_train, X_test, y_train, y_test

def macro_f1(y_true, y_pred):
    """
    Compute the macro F1 score.

    Parameters:
    - y_true: True labels.
    - y_pred: Predicted labels.

    Returns:
    - Macro F1 Score.
    """
    classes = np.unique(y_true)
    f1_scores = []

    for c in classes:
        # True positives, false positives, false negatives
        tp = np.sum((y_true == c) & (y_pred == c))
        fp = np.sum((y_true != c) & (y_pred == c))
        fn = np.sum((y_true == c) & (y_pred != c))

        # Avoid division by zero
        e = 1e-15
        precision = tp / (tp + fp + e)
        recall = tp / (tp + fn + e)

        # F1 score
        f1 = 2 * (precision * recall) / (precision + recall + e)
        f1_scores.append(f1)

    # Return the average F1 score
    return np.mean(f1_scores)


# Load and Preprocess Data

We load the training and test datasets, get the TF-IDF features, and scale them

In [4]:
# Load the data
data_train = np.load('data_train.npy', allow_pickle=True)
data_test = np.load('data_test.npy', allow_pickle=True)
labels_train = np.loadtxt('label_train.csv', delimiter=',', skiprows=1, usecols=1)

# Compute term frequencies
tf_train = tf(data_train)
tf_test = tf(data_test)

# Compute inverse document frequencies from training data
idf_train = idf(data_train)

# Compute TF-IDF features
tfidf_train = tf_idf(idf_train, tf_train)
tfidf_test = tf_idf(idf_train, tf_test)

# Feature scaling: Standardize the features
mean = np.mean(tfidf_train, axis=0)
std = np.std(tfidf_train, axis=0)
std[std == 0] = 1  # Avoid division by zero

# Scale features
tfidf_train_scaled = (tfidf_train - mean) / std
tfidf_test_scaled = (tfidf_test - mean) / std


# Split Data into Training and Validation Sets

In [5]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    tfidf_train_scaled,
    labels_train,
    test_size=0.2,
    random_state=42
)


# Hyperparameter Tuning

We tune hyperparameters such as the type of regularization used (none, L1, L2) and strength of regularization to find the best option.

In [6]:
# Hyperparameters
learning_rate = 0.01
reg_types = [None, 'L1', 'L2']
reg_strengths = [0.001, 0.01, 0.1]
n_iters = 1000

best_f1 = 0
best_regularization = None
best_strength = None
best_model = None

for reg in reg_types:
    for strength in reg_strengths:
        print(f"\nTraining model with regularization: {reg}, strength: {strength}")
        model = LogisticRegression(
            learning_rate=learning_rate,
            n_iters=n_iters,
            regularization=reg,
            strength=strength
        )
        model.fit(X_train, y_train)
        val_predictions = model.predict(X_val)

        f1 = macro_f1(y_val, val_predictions)
        print(f"Validation Macro F1 Score with regularization {reg}, strength {strength}: {f1:.4f}")
        if f1 > best_f1:
            best_f1 = f1
            best_regularization = reg
            best_strength = strength
            best_model = model



Training model with regularization: None, strength: 0.001
Loss after iteration 0: 0.6931471805599451
Loss after iteration 100: 0.3255051143593493
Loss after iteration 200: 0.22353625588996912
Loss after iteration 300: 0.17085016521873816
Loss after iteration 400: 0.1384307609948136
Loss after iteration 500: 0.11641004528338861
Loss after iteration 600: 0.1004488980248962
Loss after iteration 700: 0.08833546980923004
Loss after iteration 800: 0.07882200716426395
Loss after iteration 900: 0.07114996343554418
Validation Macro F1 Score with regularization None, strength 0.001: 0.4916

Training model with regularization: None, strength: 0.01
Loss after iteration 0: 0.6931471805599451
Loss after iteration 100: 0.3255051143593493
Loss after iteration 200: 0.22353625588996912
Loss after iteration 300: 0.17085016521873816
Loss after iteration 400: 0.1384307609948136
Loss after iteration 500: 0.11641004528338861
Loss after iteration 600: 0.1004488980248962
Loss after iteration 700: 0.0883354698

# Get the best model

In [7]:
print(f"\nBest regularization: {best_regularization}")
print(f"Best regularization strength: {best_strength}")
print(f"Best validation Macro F1 Score: {best_f1:.4f}")



Best regularization: L1
Best regularization strength: 0.1
Best validation Macro F1 Score: 0.4935


# Retrain the Best Model on the Full Training Data

In [8]:
# Retrain the best model on the full training data
best_model_full = LogisticRegression(
    learning_rate=learning_rate,
    n_iters=n_iters,
    regularization=best_regularization,
    strength=best_strength
)
best_model_full.fit(tfidf_train_scaled, labels_train)


Loss after iteration 0: 0.6931592119807853
Loss after iteration 100: 0.37050524705463544
Loss after iteration 200: 0.2695306107363191
Loss after iteration 300: 0.21295561690775755
Loss after iteration 400: 0.17637312672339492
Loss after iteration 500: 0.150727986596438
Loss after iteration 600: 0.13173935798157227
Loss after iteration 700: 0.11710604568372056
Loss after iteration 800: 0.10547932237659026
Loss after iteration 900: 0.0960165252807168


# Predict on the Test Set

In [9]:
# Predict on the test set
test_predictions = best_model_full.predict(tfidf_test_scaled)


# Save Predictions to CSV

The predictions are then saved for submission

In [10]:
# Prepare the data for saving
i = np.arange(len(test_predictions))
output = np.column_stack((i, test_predictions))

# Save predictions to a CSV file
header = 'Id,Label'
np.savetxt('predictions.csv', output, delimiter=',', header=header, comments='', fmt='%d')
print("\nPredictions saved to predictions.csv")



Predictions saved to predictions.csv


# Check CSV Rows

In [11]:
# Load the CSV file
data = np.genfromtxt('predictions.csv', delimiter=',', skip_header=1)

# Get the number of rows
num_rows = data.shape[0]

print(f'The CSV file has {num_rows} rows.')

The CSV file has 2356 rows.
