In [21]:
from helpers import *
import numpy as np
from implementations import *
import pandas as pd

In [22]:
def load_csv_data(data_path, sub_sample=False):
    """
    This function loads the data and returns the respectinve numpy arrays.
    Remember to put the 3 files in the same folder and to not change the names of the files.

    Args:
        data_path (str): datafolder path
        sub_sample (bool, optional): If True the data will be subsempled. Default to False.

    Returns:
        x_train (np.array): training data
        x_test (np.array): test data
        y_train (np.array): labels for training data in format (-1,1)
        train_ids (np.array): ids of training data
        test_ids (np.array): ids of test data
    """
    y_train = np.genfromtxt(
        os.path.join(data_path, "y_train.csv"),
        delimiter=",",
        skip_header=1,
        dtype=int,
        usecols=1,
    )
    x_train = np.genfromtxt(
        os.path.join(data_path, "x_train_cleaned.csv"), delimiter=",", skip_header=1
    )
    x_test = np.genfromtxt(
        os.path.join(data_path, "x_test_cleaned.csv"), delimiter=",", skip_header=1
    )

    train_ids = x_train[:, 0].astype(dtype=int)
    test_ids = x_test[:, 0].astype(dtype=int)
    x_train = x_train[:, 1:]
    x_test = x_test[:, 1:]

    # sub-sample
    if sub_sample:
        y_train = y_train[::50]
        x_train = x_train[::50]
        train_ids = train_ids[::50]

    return x_train, x_test, y_train, train_ids, test_ids

In [23]:
x_train, x_test, y_train, train_ids, test_ids = load_csv_data("dataset", sub_sample=False)


In [24]:
initial_w=np.zeros((x_train.shape[1],))
gamma=0.1
max_iters=100
y_train = y_train[train_ids]

In [25]:
def accuracy(y_true, y_pred):
    """
    Calculates the accuracy between true labels and predicted labels.

    Parameters:
    y_true (numpy array): True labels
    y_pred (numpy array): Predicted labels

    Returns:
    float: Accuracy score
    """
    correct = np.sum(y_true == y_pred)
    total = len(y_true)
    return correct / total

def f1_score(y_true, y_pred):
    """
    Calculates the F1-score between true labels and predicted labels.

    Parameters:
    y_true (numpy array): True labels
    y_pred (numpy array): Predicted labels

    Returns:
    float: F1-score
    """
    # True Positives (TP)
    tp = np.sum((y_true == 1) & (y_pred == 1))
    # False Positives (FP)
    fp = np.sum((y_true == 0) & (y_pred == 1))
    # False Negatives (FN)
    fn = np.sum((y_true == 1) & (y_pred == 0))

    # Avoid division by zero
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0

    if (precision + recall) == 0:
        return 0

    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [26]:
X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [27]:
def compute_gradient_mse2(y, tx, w):
    """Compute the gradient."""
    N = len(y)
    error = y - tx.dot(w)
    gradient = -1 / N * tx.T.dot(error)
    return gradient

def compute_loss_mse2(y, tx, w):
    """Calculate the loss using mse."""
    N = len(y)
    squared_error = (y - tx.dot(w)) ** 2
    loss = 1 / (2 * N) * np.sum(squared_error)
    return loss

def mean_squared_error_gd2(y, tx, initial_w, max_iters, gamma):
    """Calculate the loss using mse"""
    if max_iters == 0:
        return (initial_w, compute_loss_mse2(y, tx, initial_w))
    w = initial_w
    # loss = compute_loss_mse(y, tx, w)
    for iter in range(max_iters):
        gradient = compute_gradient_mse2(y, tx, w)
        w = w - gamma * gradient
        if iter % 100 == 0:
            print(
                "Current iteration={i}, loss={l}".format(
                    i=iter, l=compute_loss_mse2(y, tx, w)
                )
            )
    loss = compute_loss_mse2(y, tx, w)
    return (w, loss)

In [31]:

def optimize_threshold(y_true, y_scores):
    """
    Finds the threshold that maximizes the F1 score.

    Parameters:
    y_true (numpy array): True labels (-1 or 1)
    y_scores (numpy array): Predicted scores from the model

    Returns:
    best_threshold (float): Threshold that gives the highest F1 score
    best_f1 (float): The highest F1 score achieved
    """
    # Initialize variables
    best_threshold = None
    best_f1 = -1

    # Define thresholds based on percentiles to avoid extreme values
    score_min = np.percentile(y_scores, 5)
    score_max = np.percentile(y_scores, 95)
    thresholds = np.linspace(score_min, score_max, 100)

    # Iterate over thresholds to find the best one
    for threshold in thresholds:
        # Convert scores to binary predictions using the threshold
        y_pred = np.where(y_scores >= threshold, 1, -1)

        # Compute the F1 score for these predictions
        current_f1 = f1_score(y_true, y_pred)

        # Update best threshold if current F1 is better
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_threshold = threshold

    return best_threshold, best_f1





Best Threshold: -1.0452428976193437
Accuracy with optimized threshold: 0.13686440032303776
F1 Score with optimized threshold: 1.0


In [32]:



# Initialize weights
initial_w = np.zeros(X_train.shape[1])

# Train your model
w, loss = mean_squared_error_gd2(y_train, X_train, initial_w, 500, 0.001)

# Compute predicted scores on the test set
y_scores = X_test @ w


# Optimize threshold
best_threshold, best_f1 = optimize_threshold(y_test, y_scores)

# Make final predictions
y_prediction = np.where(y_scores >= best_threshold, 1, -1)

# Evaluate performance
accuracy_score = accuracy(y_test, y_prediction)
final_f1_score = f1_score(y_test, y_prediction)

print("Best Threshold:", best_threshold)
print("Accuracy with optimized threshold:", accuracy_score)
print("F1 Score with optimized threshold:", final_f1_score)


Current iteration=0, loss=0.4725981665543393
Current iteration=100, loss=0.15143367794530063
Current iteration=200, loss=0.14740982820692666
Current iteration=300, loss=0.14504751612216937
Current iteration=400, loss=0.14353656330365666
Best Threshold: -0.6331050033031871
Accuracy with optimized threshold: 0.8472884635896811
F1 Score with optimized threshold: 0.36279247202441506


In [35]:

# Define a range of gamma values to test
gamma_values = [0.001, 0.005, 0.01, 0.015, 0.02]

# Initialize variables to store the best parameters
best_gamma = None
best_threshold = None
best_f1 = -1
best_w = None

# Function to optimize threshold
def optimize_threshold(y_true, y_scores):
    best_threshold = None
    best_f1 = -1

    # Generate a list of potential thresholds to try
    thresholds = np.linspace(np.min(y_scores), np.max(y_scores), 100)

    for threshold in thresholds:
        y_pred = np.where(y_scores >= threshold, 1, -1)
        current_f1 = f1_score(y_true, y_pred)
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_threshold = threshold

    return best_threshold, best_f1

# Loop over gamma values
for gamma in gamma_values:
    print(f"Testing gamma: {gamma}")
    # Initialize weights
    initial_w = np.zeros(X_train.shape[1])
    # Train the model
    w, loss = mean_squared_error_gd2(y_train, X_train, initial_w, max_iters=1000, gamma=gamma)

    # Compute predicted scores on the validation set
    y_scores = X_test @ w

    # Optimize threshold for the current gamma
    threshold, f1 = optimize_threshold(y_test, y_scores)

    print(f"Gamma: {gamma}, Best Threshold: {threshold}, F1 Score: {f1}")

    # Update best parameters if current F1 is better
    if f1 > best_f1:
        best_f1 = f1
        best_gamma = gamma
        best_threshold = threshold
        best_w = w.copy()

# After finding the best gamma and threshold, make predictions on the test set
y_scores = X_test @ best_w
y_prediction = np.where(y_scores >= best_threshold, 1, -1)

# Compute final accuracy and F1 score
final_accuracy = accuracy(y_test, y_prediction)
final_f1_score = f1_score(y_test, y_prediction)

print("Best Gamma:", best_gamma)
print("Best Threshold:", best_threshold)
print("Final Accuracy:", final_accuracy)
print("Final F1 Score:", final_f1_score)



Testing gamma: 0.001
Current iteration=0, loss=0.4725981665543393
Current iteration=100, loss=0.15143367794530063
Current iteration=200, loss=0.14740982820692666
Current iteration=300, loss=0.14504751612216937
Current iteration=400, loss=0.14353656330365666
Current iteration=500, loss=0.14249765162213124
Current iteration=600, loss=0.1417322417086159
Current iteration=700, loss=0.1411342887544156
Current iteration=800, loss=0.14064549769620782
Current iteration=900, loss=0.14023255044107097
Gamma: 0.001, Best Threshold: -0.5920280549360802, F1 Score: 0.38206470667203657
Testing gamma: 0.005
Current iteration=0, loss=0.37436983598483686
Current iteration=100, loss=0.14244954571221327
Current iteration=200, loss=0.13985786968584088
Current iteration=300, loss=0.1385871011683183
Current iteration=400, loss=0.13778058231562196
Current iteration=500, loss=0.13721341385677008
Current iteration=600, loss=0.13678716133870586
Current iteration=700, loss=0.13645124746421297
Current iteration=800