In [3]:
from helpers import *
import numpy as np
from implementations import *
import pandas as pd

In [4]:
def load_csv_data(data_path, sub_sample=False):
    """
    Load the training labels and the cleaned train/test feature tables from CSV.

    The folder must contain the three files "y_train.csv",
    "x_train_cleaned.csv" and "x_test_cleaned.csv" under exactly these names.
    The first line of every file is skipped as a header.

    Args:
        data_path (str): path of the folder holding the three CSV files.
        sub_sample (bool, optional): if True, only every 50th training row
            (labels, features and ids) is kept; the test data is never
            sub-sampled. Defaults to False.

    Returns:
        x_train (np.array): training features (first CSV column removed)
        x_test (np.array): test features (first CSV column removed)
        y_train (np.array): integer labels read from the second column of
            "y_train.csv". They were originally described as being in
            (-1, 1) format; this function neither converts nor checks them.
        train_ids (np.array): first column of the training table, cast to int
        test_ids (np.array): first column of the test table, cast to int
    """
    # Imported here so the function does not rely on `os` leaking into the
    # notebook namespace through a star-import.
    import os

    y_train = np.genfromtxt(
        os.path.join(data_path, "y_train.csv"),
        delimiter=",",
        skip_header=1,
        dtype=int,
        usecols=1,
    )
    x_train = np.genfromtxt(
        os.path.join(data_path, "x_train_cleaned.csv"), delimiter=",", skip_header=1
    )
    x_test = np.genfromtxt(
        os.path.join(data_path, "x_test_cleaned.csv"), delimiter=",", skip_header=1
    )

    # Column 0 carries the sample id; the remaining columns are the features.
    train_ids = x_train[:, 0].astype(dtype=int)
    test_ids = x_test[:, 0].astype(dtype=int)
    x_train = x_train[:, 1:]
    x_test = x_test[:, 1:]

    # Optional sub-sampling: keep one training row out of 50.
    if sub_sample:
        y_train = y_train[::50]
        x_train = x_train[::50]
        train_ids = train_ids[::50]

    return x_train, x_test, y_train, train_ids, test_ids

In [41]:
# Read the full (not sub-sampled) data set from the local "dataset" folder.
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(
    "dataset",
    sub_sample=False,
)


In [42]:
# Starting point for gradient descent: one zero weight per feature column.
initial_w = np.zeros(x_train.shape[1])

# Default hyper-parameters.
max_iters = 100
gamma = 0.1

# Select the labels addressed by the ids of the loaded training rows.
# NOTE(review): presumably the ids are 0-based row positions in y_train.csv — confirm.
# NOTE(review): this rebinds y_train, so running the cell a second time indexes
# the already-selected labels again; run it exactly once after loading.
y_train = y_train[train_ids]

In [44]:
def accuracy(y_true, y_pred):
    """
    Fraction of positions where the predicted label equals the true label.

    Parameters:
    y_true (numpy array): True labels
    y_pred (numpy array): Predicted labels

    Returns:
    float: number of matching entries divided by len(y_true)
    """
    matches = y_true == y_pred
    return np.sum(matches) / len(y_true)

def f1_score(y_true, y_pred):
    """
    Calculates the F1-score of the positive class (label 1).

    Every label different from 1 is treated as negative, so both the {0, 1}
    and the {-1, 1} label encodings give a meaningful score. (Matching
    negatives with `== 0` made FP and FN always zero for {-1, 1} labels.)

    Parameters:
    y_true (array-like): True labels
    y_pred (array-like): Predicted labels

    Returns:
    float: F1-score, or 0.0 when precision and recall are both zero
    """
    # Accept lists as well as arrays; element-wise comparison needs ndarrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    actual_pos = y_true == 1
    predicted_pos = y_pred == 1

    # True Positives (TP)
    tp = np.sum(actual_pos & predicted_pos)
    # False Positives (FP): predicted positive, actually anything but 1
    fp = np.sum(~actual_pos & predicted_pos)
    # False Negatives (FN): actually positive, predicted anything but 1
    fn = np.sum(actual_pos & ~predicted_pos)

    # Avoid division by zero
    precision = tp / (tp + fp) if (tp + fp) != 0 else 0
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0

    if (precision + recall) == 0:
        return 0.0

    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [45]:
# Split features and labels with test_size=0.2 and a fixed random_state.
# NOTE(review): train_test_split is not defined in the visible cells —
# presumably it comes from one of the star-imports; confirm.
# NOTE(review): this rebinds y_train, and X_test differs from the x_test
# returned by load_csv_data only by letter case.
X_train, X_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [72]:
def compute_gradient_mse2(y, tx, w):
    """Gradient of the MSE loss at w: -(1/N) * tx^T (y - tx w), with N = len(y)."""
    residual = y - tx.dot(w)
    scale = -1 / len(y)
    return scale * tx.T.dot(residual)

def compute_loss_mse2(y, tx, w):
    """MSE loss at w: sum of squared residuals (y - tx w) divided by 2N, with N = len(y)."""
    residual = y - tx.dot(w)
    half_mean_factor = 1 / (2 * len(y))
    return half_mean_factor * np.sum(residual ** 2)

def mean_squared_error_gd2(y, tx, initial_w, max_iters, gamma):
    """Run full-batch gradient descent on the MSE loss.

    Starting from initial_w, performs max_iters updates of the form
    w <- w - gamma * gradient, printing the current loss on every 100th
    iteration (0, 100, 200, ...).

    Returns:
        tuple: (w, loss) with the final weights and the MSE loss at those
        weights. When max_iters is 0, initial_w and its loss are returned.
    """
    if max_iters == 0:
        # Nothing to optimise: report the loss at the starting point.
        return (initial_w, compute_loss_mse2(y, tx, initial_w))

    w = initial_w
    for step in range(max_iters):
        w = w - gamma * compute_gradient_mse2(y, tx, w)
        if step % 100 != 0:
            continue
        # Progress report, evaluated after the update of this step.
        current_loss = compute_loss_mse2(y, tx, w)
        print("Current iteration={i}, loss={l}".format(i=step, l=current_loss))

    return (w, compute_loss_mse2(y, tx, w))

In [95]:
# Fit the weights with 500 gradient-descent steps and step size 0.001.
# NOTE(review): these literals are used instead of the `max_iters` / `gamma`
# variables defined in an earlier cell.
w, loss = mean_squared_error_gd2(
    y_train,
    X_train,
    initial_w,
    max_iters=500,
    gamma=0.001,
)

Current iteration=0, loss=0.4725981665543405
Current iteration=100, loss=0.15143367794530063
Current iteration=200, loss=0.14740982820692666
Current iteration=300, loss=0.14504751612216937
Current iteration=400, loss=0.1435365633036567


In [96]:
# Linear scores above the 0.6 cut-off become label 1, everything else -1.
y_prediction = np.where((X_test @ w) > 0.6, 1, -1)
score = accuracy(y_test, y_prediction)
score

0.9129321773050726

In [97]:
# F1-score of the thresholded predictions on the held-out split.
score = f1_score(y_true=y_test, y_pred=y_prediction)
score

0