In [1]:
import numpy as np
import pandas as pd

In [3]:
# --- 1. Define Logistic Regression Functions ---

def sigmoid(z):
    """
    Logistic function 1 / (1 + exp(-z)), applied element-wise.

    The input is first limited to the range [-500, 500] so that the
    exponential stays within floating-point range.

    Parameters
    ----------
    z : scalar or array-like
        Pre-activation value(s).

    Returns
    -------
    Same shape as ``z``; each value lies in [0, 1].
    """
    # Bound the argument before exponentiating to avoid overflow warnings.
    bounded = np.clip(z, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def loss(y, y_hat):
    """
    Compute the Log Loss (Binary Cross-Entropy).

    Parameters
    ----------
    y : array-like
        True labels (expected to be 0 or 1).
    y_hat : array-like
        Predicted probabilities for the same examples.

    Returns
    -------
    float
        Sum of the per-element cross-entropy terms divided by ``y.shape[0]``.

    Notes
    -----
    When ``y`` and ``y_hat`` contain the same number of elements but have
    different shapes (for example ``(m,)`` labels against ``(m, 1)``
    predictions), ``y_hat`` is reshaped to ``y.shape`` first. Without this,
    NumPy would broadcast the element-wise products to ``(m, m)`` and the
    returned value would silently be wrong.
    """
    y = np.asarray(y)
    y_hat = np.asarray(y_hat)

    # Align equal-sized but differently-shaped inputs so the arithmetic below
    # stays element-wise instead of broadcasting to an outer product.
    if y.shape != y_hat.shape and y.size == y_hat.size:
        y_hat = y_hat.reshape(y.shape)

    # Small epsilon keeps predictions away from 0 and 1 to avoid log(0)
    epsilon = 1e-15
    y_hat = np.clip(y_hat, epsilon, 1 - epsilon)

    m = y.shape[0]
    cost = -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / m
    return cost

def gradients(X, y, y_hat):
    """
    Gradients of the loss with respect to the weights and the bias.

    Parameters
    ----------
    X : ndarray
        Feature matrix; its first dimension is the number of examples.
    y : ndarray
        True labels.
    y_hat : ndarray
        Predicted probabilities.

    Returns
    -------
    (dw, db)
        ``dw`` is ``X.T @ (y_hat - y)`` scaled by ``1 / m`` and ``db`` is the
        sum of ``(y_hat - y)`` scaled by ``1 / m``, where ``m = X.shape[0]``.
    """
    n_examples = X.shape[0]
    scale = 1 / n_examples

    # Prediction error, shared by both gradient terms.
    residual = y_hat - y

    dw = scale * np.dot(X.T, residual)
    db = scale * np.sum(residual)
    return dw, db

def train(X, y, bs, epochs, lr):
    """
    Fit logistic-regression parameters with gradient descent.

    Every update uses all rows of ``X``; the ``bs`` argument is accepted
    but never read by this implementation.

    Parameters
    ----------
    X : ndarray
        Feature matrix; ``X.shape[0]`` examples by ``X.shape[1]`` features.
    y : ndarray
        Labels; reshaped internally to a column of ``X.shape[0]`` rows.
    bs : any
        Unused.
    epochs : int
        Number of parameter updates to perform.
    lr : float
        Step size applied to both gradients.

    Returns
    -------
    (w, b)
        ``w`` is the weight column of shape ``(n_features, 1)`` and ``b`` is
        the bias. The cost is printed every 100th epoch (starting at 0).
    """
    n_samples = X.shape[0]
    n_features = X.shape[1]

    # Labels as a column so they line up with the (n_samples, 1) predictions.
    targets = y.reshape(n_samples, 1)

    # Start from all-zero parameters.
    weights = np.zeros((n_features, 1))
    bias = 0

    # Per-epoch cost values; collected here but not returned.
    cost_history = []

    for epoch in range(epochs):
        # Forward pass.
        probs = sigmoid(np.dot(X, weights) + bias)

        # Cost is measured before this epoch's update is applied.
        cost = loss(targets, probs)
        cost_history.append(cost)

        # Backward pass and parameter step.
        grad_w, grad_b = gradients(X, targets, probs)
        weights -= lr * grad_w
        bias -= lr * grad_b

        # Progress report on every 100th epoch.
        if not epoch % 100:
            print(f"Epoch {epoch}, Cost: {cost}")

    return weights, bias

def predict(X, w, b):
    """
    Classify the rows of X with the learned weights w and bias b.

    Parameters
    ----------
    X : ndarray
        Feature matrix.
    w : ndarray
        Weight vector as returned by ``train``.
    b : scalar
        Bias as returned by ``train``.

    Returns
    -------
    ndarray
        Flattened integer array; 1 where the predicted probability is
        strictly greater than 0.5, otherwise 0.
    """
    probabilities = sigmoid(np.dot(X, w) + b)
    # Strict threshold: a probability of exactly 0.5 maps to class 0.
    labels = (probabilities > 0.5).astype(int)
    return labels.flatten()

In [5]:
# --- 2. Load Data ---

print("Loading data...")
# Training split: 'label' is the target; every column other than 'id' and
# 'label' is used as a feature.
train_df = pd.read_csv('train_tfidf_features.csv')
y_train = train_df['label'].values
X_train = train_df.drop(columns=['id', 'label']).values

# Test split: only 'id' is dropped from the features; the ids are kept
# separately so they can be written to the submission file.
test_df = pd.read_csv('test_tfidf_features.csv')
test_ids = test_df['id'].values
X_test = test_df.drop(columns=['id']).values

print(f"Training set shape: {X_train.shape}, Labels shape: {y_train.shape}")
print(f"Test set shape: {X_test.shape}")

# --- 3. Train the Model ---
print("\nTraining Logistic Regression model...")
# Hyperparameters (tune as needed)
num_epochs = 1000
learning_rate = 0.01
# train() takes a batch-size argument but does not use it, so bs=None is
# passed and each update is computed on the full training set.
w_opt, b_opt = train(X_train, y_train, bs=None, epochs=num_epochs, lr=learning_rate)

# --- 4. Make Predictions on Test Set ---
print("\nMaking predictions on test set...")
test_predictions = predict(X_test, w_opt, b_opt)

# --- 5. Create Submission File ---
print("\nCreating submission file: LogRed_Prediction.csv")
submission_df = pd.DataFrame({'id': test_ids, 'label': test_predictions})
# index=False keeps the DataFrame row index out of the CSV.
submission_df.to_csv('LogRed_Prediction.csv', index=False)
print("Submission file 'LogRed_Prediction.csv' created successfully!")

Loading data...
Training set shape: (17184, 5000), Labels shape: (17184,)
Test set shape: (4296, 5000)

Training Logistic Regression model...
Epoch 0, Cost: 0.6931471805599453
Epoch 100, Cost: 0.6818685756190891
Epoch 200, Cost: 0.6750098654665818
Epoch 300, Cost: 0.6708079172240665
Epoch 400, Cost: 0.668206170014174
Epoch 500, Cost: 0.6665716538876123
Epoch 600, Cost: 0.6655240744963634
Epoch 700, Cost: 0.6648340128593144
Epoch 800, Cost: 0.6643624730359983
Epoch 900, Cost: 0.6640249036359575

Making predictions on test set...

Creating submission file: LogRed_Prediction.csv
Submission file 'LogRed_Prediction.csv' created successfully!
