In [1]:
import numpy as np
from helpers import *
from implementations import *
from helpers_analysis import *

In [25]:
# Load the cleaned dataset from the "dataset" directory. The loader hands back
# five values: train features, test features, train labels and the two id
# arrays (presumably row identifiers -- confirm against helpers).
x_train, x_test, y_train, train_ids, test_ids = load_cleaned_csv_data(
    "dataset", sub_sample=False
)

# Carve a held-out split (test_size=0.2, fixed random_state for repeatability)
# out of the labelled training data.
# NOTE: `y_train` is rebound here to the labels returned by the split, while
# lower-case `x_train` keeps the full, unsplit feature matrix. Upper-case
# `X_test` / `y_test` are the held-out part of the training data and are not
# the same thing as lower-case `x_test` returned by the loader above.
X_train, X_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

# Gradient Descent

In [26]:
# Step-size search for least-squares gradient descent.
# For each candidate gamma: train on (X_train, y_train), tune the decision
# threshold on the held-out split (X_test, y_test), and keep the configuration
# with the highest F1.
# NOTE(review): the threshold is tuned on the same held-out split that the
# final metrics are reported on, so the final numbers are optimistic. An
# unbiased estimate needs a split that played no part in the selection.

# Candidate step sizes (gamma) to test
gamma_values = [0.001, 0.005, 0.01, 0.015, 0.02]

# Best configuration seen so far. best_f1 starts at -inf (not 0) so that a
# candidate whose F1 is exactly 0 is still recorded; otherwise best_w could
# stay None and the final evaluation below would fail on `X_test @ None`.
best_gamma = None
best_threshold = None
best_f1 = -np.inf
best_w = None

for gamma in gamma_values:
    print(f"Testing gamma: {gamma}")
    # Every candidate starts from the same all-zero weight vector
    initial_w = np.zeros(X_train.shape[1])
    # Train the model
    w, loss = mean_squared_error_gd(y_train, X_train, initial_w, max_iters=1000, gamma=gamma)

    # Raw linear scores on the held-out split
    y_scores = X_test @ w

    # Pick the decision threshold that maximises F1 for this gamma
    threshold, f1 = optimize_threshold(y_test, y_scores)

    print(f"Gamma: {gamma}, Best Threshold: {threshold}, F1 Score: {f1}")

    # Strict '>' keeps the earliest gamma on ties
    if f1 > best_f1:
        best_f1 = f1
        best_gamma = gamma
        best_threshold = threshold
        best_w = w.copy()

# Only reachable with best_w unset if every F1 was NaN (or no gamma was tried)
if best_w is None:
    raise RuntimeError("No gamma value produced a valid F1 score; nothing to evaluate.")

# Re-score the held-out split with the selected weights: y_scores from the
# loop belongs to the last gamma tried, not necessarily the best one.
y_scores = X_test @ best_w
# Predict +1 where the score reaches the threshold, -1 otherwise
y_prediction = np.where(y_scores >= best_threshold, 1, -1)

# Final accuracy and F1 on the held-out split
final_accuracy = accuracy(y_test, y_prediction)
final_f1_score = f1_score(y_test, y_prediction)

print("Best Gamma:", best_gamma)
print("Best Threshold:", best_threshold)
print("Final Accuracy:", final_accuracy)
print("Final F1 Score:", final_f1_score)


Testing gamma: 0.001
Current iteration=0, loss=0.4725981665543393
Current iteration=100, loss=0.15143367794530063
Current iteration=200, loss=0.14740982820692666
Current iteration=300, loss=0.14504751612216937
Current iteration=400, loss=0.14353656330365666
Current iteration=500, loss=0.14249765162213124
Current iteration=600, loss=0.1417322417086159
Current iteration=700, loss=0.1411342887544156
Current iteration=800, loss=0.14064549769620782
Current iteration=900, loss=0.14023255044107097
Gamma: 0.001, Best Threshold: -0.5920280549360802, F1 Score: 0.38206470667203657
Testing gamma: 0.005
Current iteration=0, loss=0.37436983598483686
Current iteration=100, loss=0.14244954571221327
Current iteration=200, loss=0.13985786968584088
Current iteration=300, loss=0.1385871011683183
Current iteration=400, loss=0.13778058231562196
Current iteration=500, loss=0.13721341385677008
Current iteration=600, loss=0.13678716133870586
Current iteration=700, loss=0.13645124746421297
Current iteration=800

# Stochastic Gradient Descent

In [31]:
# Step-size search for least-squares stochastic gradient descent.
# For each candidate gamma: train on (X_train, y_train), tune the decision
# threshold on the held-out split (X_test, y_test), and keep the configuration
# with the highest F1.
# NOTE(review): the threshold is tuned on the same held-out split that the
# final metrics are reported on, so the final numbers are optimistic.

# Fix the seed so the stochastic run is repeatable after Restart & Run All.
# Assumes mean_squared_error_sgd draws its samples from NumPy's global RNG --
# TODO confirm against implementations.py.
np.random.seed(42)

# Candidate step sizes (gamma) to test
gamma_values = [0.0015]

# Best configuration seen so far. best_f1 starts at -inf (not 0) so that a
# candidate whose F1 is exactly 0 is still recorded; otherwise best_w could
# stay None and the final evaluation below would fail on `X_test @ None`.
best_gamma = None
best_threshold = None
best_f1 = -np.inf
best_w = None

for gamma in gamma_values:
    print(f"Testing gamma: {gamma}")
    # Every candidate starts from the same all-zero weight vector
    initial_w = np.zeros(X_train.shape[1])
    # Train the model
    w, loss = mean_squared_error_sgd(y_train, X_train, initial_w, max_iters=3000, gamma=gamma)

    # Raw linear scores on the held-out split
    y_scores = X_test @ w

    # Pick the decision threshold that maximises F1 for this gamma
    threshold, f1 = optimize_threshold(y_test, y_scores)

    print(f"Gamma: {gamma}, Best Threshold: {threshold}, F1 Score: {f1}")

    # Strict '>' keeps the earliest gamma on ties
    if f1 > best_f1:
        best_f1 = f1
        best_gamma = gamma
        best_threshold = threshold
        best_w = w.copy()

# Only reachable with best_w unset if every F1 was NaN (or no gamma was tried)
if best_w is None:
    raise RuntimeError("No gamma value produced a valid F1 score; nothing to evaluate.")

# Re-score the held-out split with the selected weights: y_scores from the
# loop belongs to the last gamma tried, not necessarily the best one.
y_scores = X_test @ best_w
# Predict +1 where the score reaches the threshold, -1 otherwise
y_prediction = np.where(y_scores >= best_threshold, 1, -1)

# Final accuracy and F1 on the held-out split
final_accuracy = accuracy(y_test, y_prediction)
final_f1_score = f1_score(y_test, y_prediction)

print("Best Gamma:", best_gamma)
print("Best Threshold:", best_threshold)
print("Final Accuracy:", final_accuracy)
print("Final F1 Score:", final_f1_score)


Testing gamma: 0.0015
Current iteration=0, loss=0.4464259998806827


KeyboardInterrupt: 

# Least Squares

In [28]:
# Least squares: fit on the training split, then tune the decision threshold
# and report metrics on the held-out split (X_test, y_test). There is no step
# size to search here, so a single fit is evaluated.
# NOTE(review): threshold selection and the reported metrics use the same
# held-out split, so the numbers are optimistic.
w, loss = least_squares(y_train, X_train)

# Raw linear scores on the held-out split; computed once and reused for both
# the threshold search and the final predictions.
y_scores = X_test @ w

# Pick the decision threshold that maximises F1
threshold, f1 = optimize_threshold(y_test, y_scores)

# Predict +1 where the score reaches the threshold, -1 otherwise
y_prediction = np.where(y_scores >= threshold, 1, -1)

# Final accuracy and F1 on the held-out split
final_accuracy = accuracy(y_test, y_prediction)
final_f1_score = f1_score(y_test, y_prediction)

print("Best Threshold:", threshold)
print("Final Accuracy:", final_accuracy)
print("Final F1 Score:", final_f1_score)

Best Threshold: -0.5655146528668263
Final Accuracy: 0.8670821460679293
Final F1 Score: 0.4072035338090384


  w=np.linalg.lstsq(A,b)[0]


# Ridge Regression

In [29]:
# Fit ridge regression on the training split with a fixed penalty lambda_=0.001.
# The second return value (presumably the training loss, as with the other
# fitting helpers in this notebook) is bound to `loss` rather than `_`: in
# IPython/Jupyter `_` is the "last output" variable, and assigning to it stops
# the kernel from updating it for the rest of the session.
# NOTE(review): the fitted weights are not evaluated in this section -- no
# threshold search or metrics follow, unlike the other model sections.
w, loss = ridge_regression(y_train, X_train, lambda_=0.001)

# Logistic Regression

In [None]:
# Step-size search for logistic regression.
# For each candidate gamma: train on the training split with {0, 1} labels,
# tune the probability threshold on the held-out split, and keep the
# configuration with the highest F1.
# The helpers used here come from the wildcard imports in the first cell
# (which already includes helpers_analysis), so nothing is re-imported here.
# NOTE(review): the threshold is tuned on the same held-out split that the
# final metrics are reported on, so the final numbers are optimistic.

# Map labels from {-1, 1} to {0, 1} for the logistic loss
y_train_binary = (y_train + 1) // 2
y_test_binary = (y_test + 1) // 2

# Candidate step sizes (gamma) to test
gamma_values = [0.001, 0.005, 0.01, 0.015, 0.02]

# Best configuration seen so far. best_f1 starts at -inf (not 0) so that a
# candidate whose F1 is exactly 0 is still recorded; otherwise best_w could
# stay None and the final evaluation below would fail on `X_test @ None`.
best_gamma = None
best_threshold = None
best_f1 = -np.inf
best_w = None

for gamma in gamma_values:
    print(f"Testing gamma: {gamma}")
    # Every candidate starts from the same all-zero weight vector
    initial_w = np.zeros(X_train.shape[1])
    # Train the model
    w, loss = logistic_regression(y_train_binary, X_train, initial_w, max_iters=1000, gamma=gamma)

    # Predicted probabilities on the held-out split
    y_scores = sigmoid(X_test @ w)

    # Pick the probability threshold that maximises F1 for this gamma
    threshold, f1 = optimize_threshold_logistic(y_test_binary, y_scores)

    print(f"Gamma: {gamma}, Best Threshold: {threshold}, F1 Score: {f1}")

    # Strict '>' keeps the earliest gamma on ties
    if f1 > best_f1:
        best_f1 = f1
        best_gamma = gamma
        best_threshold = threshold
        best_w = w.copy()

# Only reachable with best_w unset if every F1 was NaN (or no gamma was tried)
if best_w is None:
    raise RuntimeError("No gamma value produced a valid F1 score; nothing to evaluate.")

# Re-score the held-out split with the selected weights: y_scores from the
# loop belongs to the last gamma tried, not necessarily the best one.
y_scores = sigmoid(X_test @ best_w)
y_prediction_binary = np.where(y_scores >= best_threshold, 1, 0)
y_prediction = y_prediction_binary * 2 - 1  # Convert back to -1 and 1

# Accuracy is computed on the {-1, 1} labels, F1 on the {0, 1} labels
final_accuracy = accuracy(y_test, y_prediction)
final_f1_score = f1_score_logistic(y_test_binary, y_prediction_binary)

print("Best Gamma:", best_gamma)
print("Best Threshold:", best_threshold)
print("Final Accuracy:", final_accuracy)
print("Final F1 Score:", final_f1_score)


Testing gamma: 0.001
Current iteration=0, loss=0.686190043604647
Current iteration=100, loss=0.3813197240800137
Current iteration=200, loss=0.3189964180323503
Current iteration=300, loss=0.29846338414482615
Current iteration=400, loss=0.2893554646429806
