In [1]:
import numpy as np
from helpers import *
from implementations import *
from helpers_analysis import *

In [4]:
# Load the data set. x_test and test_ids come straight from the loader; further down
# they are only used to build the submission file.
x_train, x_test, y_train, train_ids, test_ids = load_cleaned_csv_data("dataset", sub_sample=False)
# Split the loaded training data into a training part and a held-out part
# (presumably 20% held out with a fixed seed - confirm against train_test_split).
# NOTE: X_train / X_test (capital X) are the two halves of this split and differ from
# the loader's x_train / x_test only by case. y_train is rebound here to the training
# half, so from this line on it no longer matches x_train row for row.
X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

# Gradient Descent

In [3]:
# Grid-search the step size (gamma) for least-squares gradient descent.
# For every candidate the decision threshold is tuned on the held-out split
# (X_test / y_test produced by train_test_split), and the gamma with the
# highest F1 is kept.
gamma_values = [0.005, 0.01, 0.015, 0.02]

# Best configuration seen so far. The sentinel is -1 rather than 0 so that a
# candidate whose F1 is exactly 0 is still recorded; with 0 and the strict
# comparison below, best_w could stay None and the final scoring would crash.
best_gamma = None
best_threshold = None
best_f1 = -1
best_w = None

for gamma in gamma_values:
    print(f"Testing gamma: {gamma}")
    # Every candidate starts from the same all-zero weight vector
    initial_w = np.zeros(X_train.shape[1])
    # Fit the weights with this step size
    w, loss = mean_squared_error_gd(y_train, X_train, initial_w, max_iters=1000, gamma=gamma)

    # Raw linear scores on the held-out split
    y_scores = X_test @ w

    # Tune the cut-off for this gamma (optimize_threshold is expected to
    # return the chosen threshold with its F1 and accuracy - confirm in helpers)
    threshold, f1, acc = optimize_threshold(y_test, y_scores)

    print(f"Gamma: {gamma}, Best Threshold: {threshold}, F1 Score: {f1}, Accuracy: {acc}")

    # Keep this candidate if it beats the best F1 so far
    if f1 > best_f1:
        best_f1 = f1
        best_gamma = gamma
        best_threshold = threshold
        best_w = w.copy()

# Fail with a clear message instead of a TypeError from `X_test @ None`
# (reachable when gamma_values is empty or every F1 is NaN).
if best_w is None:
    raise RuntimeError("No gamma produced a usable F1 score; check gamma_values and the training output.")

# Re-score the held-out split with the winning weights and threshold.
# NOTE(review): the threshold was tuned on this same split, so the metrics
# below are optimistic estimates rather than true test performance.
y_scores = X_test @ best_w
y_prediction = np.where(y_scores >= best_threshold, 1, -1)

# Final accuracy and F1 on the held-out split
final_accuracy = accuracy(y_test, y_prediction)
final_f1_score = f1_score(y_test, y_prediction)

print("Best Gamma:", best_gamma)
print("Best Threshold:", best_threshold)
print("Final Accuracy:", final_accuracy)
print("Final F1 Score:", final_f1_score)


Testing gamma: 0.005
Current iteration=0, loss=0.3490784161307585
Current iteration=100, loss=0.1423381210160878
Current iteration=200, loss=0.13956501944015492
Current iteration=300, loss=0.13825976138033838
Current iteration=400, loss=0.13746060345020664
Current iteration=500, loss=0.13691135783591357
Current iteration=600, loss=0.1365027881955025
Current iteration=700, loss=0.13618111588770956
Current iteration=800, loss=0.1359173293030289
Current iteration=900, loss=0.13569461052491605
Gamma: 0.005, Best Threshold: -0.5874688035903483, F1 Score: 0.409984, Accuracy: 0.8595242811647645
Testing gamma: 0.01
Current iteration=0, loss=0.2418648937437182
Current iteration=100, loss=0.13954185361936908
Current iteration=200, loss=0.13745173280410217
Current iteration=300, loss=0.13649793028963877
Current iteration=400, loss=0.13591410946029936
Current iteration=500, loss=0.13550021554716726
Current iteration=600, loss=0.1351840110930896
Current iteration=700, loss=0.134932765538862
Current

# Stochastic Gradient Descent

In [20]:
# Search the step size (gamma) for least-squares stochastic gradient descent,
# tuning the decision threshold on the held-out split (X_test / y_test).

# Seed NumPy's global RNG so re-running the cell gives the same result.
# Presumably mean_squared_error_sgd draws its samples from np.random - confirm
# in implementations; if it uses its own generator this line has no effect.
np.random.seed(42)

gamma_values = [0.0015]

# Best configuration seen so far. The sentinel is -1 rather than 0 so that a
# candidate whose F1 is exactly 0 is still recorded; with 0 and the strict
# comparison below, best_w could stay None and the final scoring would crash.
best_gamma = None
best_threshold = None
best_f1 = -1
best_w = None

for gamma in gamma_values:
    print(f"Testing gamma: {gamma}")
    # Every candidate starts from the same all-zero weight vector
    initial_w = np.zeros(X_train.shape[1])
    # Fit the weights with this step size
    w, loss = mean_squared_error_sgd(y_train, X_train, initial_w, max_iters=1000, gamma=gamma)

    # Raw linear scores on the held-out split
    y_scores = X_test @ w

    # Tune the cut-off for this gamma
    threshold, f1, acc = optimize_threshold(y_test, y_scores)

    print(f"Gamma: {gamma}, Best Threshold: {threshold}, F1 Score: {f1}, Accuracy: {acc}")

    # Keep this candidate if it beats the best F1 so far
    if f1 > best_f1:
        best_f1 = f1
        best_gamma = gamma
        best_threshold = threshold
        best_w = w.copy()

# Fail with a clear message instead of a TypeError from `X_test @ None`
# (reachable when gamma_values is empty or every F1 is NaN).
if best_w is None:
    raise RuntimeError("No gamma produced a usable F1 score; check gamma_values and the training output.")

# Re-score the held-out split with the winning weights and threshold.
# NOTE(review): the threshold was tuned on this same split, so the metrics
# below are optimistic estimates rather than true test performance.
y_scores = X_test @ best_w
y_prediction = np.where(y_scores >= best_threshold, 1, -1)

# Final accuracy and F1 on the held-out split (same report as the
# gradient-descent cell, so the two methods can be compared directly)
final_accuracy = accuracy(y_test, y_prediction)
final_f1_score = f1_score(y_test, y_prediction)

print("Best Gamma:", best_gamma)
print("Best Threshold:", best_threshold)
print("Final Accuracy:", final_accuracy)
print("Final F1 Score:", final_f1_score)


Testing gamma: 0.0015
Current iteration=0, loss=0.4358293071877409
Current iteration=100, loss=0.1554699364256183
Current iteration=200, loss=0.15391516933068794
Current iteration=300, loss=0.14794901944581687
Current iteration=400, loss=0.14580763522809306
Current iteration=500, loss=0.14309168487530463
Current iteration=600, loss=0.14533765651766442
Current iteration=700, loss=0.14200531966951538
Current iteration=800, loss=0.1418814539905092
Current iteration=900, loss=0.14739712831977322
Gamma: 0.0015, Best Threshold: -0.6023182366929547, F1 Score: 0.3757095709570957, Accuracy: 0.8558824873908605


TypeError: 'numpy.float64' object is not callable

# Least Squares

In [7]:
# Least squares: the call below takes no step size or regularisation
# argument, so the only thing tuned in this cell is the decision threshold.
w, loss = least_squares(y_train, X_train)

# Raw linear scores on the held-out split (X_test / y_test from train_test_split)
y_scores = X_test @ w

# Tune the cut-off on those scores
threshold, f1, acc = optimize_threshold(y_test, y_scores)

# Label with the tuned threshold; the scores above are reused as-is because
# w has not changed since they were computed.
y_prediction = np.where(y_scores >= threshold, 1, -1)

# Final accuracy and F1 on the held-out split.
# NOTE(review): the threshold was tuned on this same split, so these numbers
# are optimistic estimates rather than true test performance.
final_accuracy = accuracy(y_test, y_prediction)
final_f1_score = f1_score(y_test, y_prediction)

print("Best Threshold:", threshold)
print("Final Accuracy:", final_accuracy)
print("Final F1 Score:", final_f1_score)

  w=np.linalg.lstsq(A,b)[0]


Best Threshold: -0.5670693947456843
Final Accuracy: 0.8651926798421381
Final F1 Score: 0.41313432835820896


# Ridge Regression

In [9]:
# Grid-search the regularisation strength (lambda) for ridge regression,
# tuning the decision threshold on the held-out split for every candidate.
lambda_values = [0.005, 0.01 , 0.015, 0.02]

# Best configuration seen so far; the -1 sentinel means the first candidate
# is always accepted, so best_w is set whenever lambda_values is non-empty
# and the F1 values are comparable.
best_lambda = None
best_threshold = None
best_f1 = -1
best_w = None

for lambda_ in lambda_values:
    print(f"Testing lambda: {lambda_}")
    # Fit the weights for this regularisation strength
    w, loss = ridge_regression(y_train, X_train, lambda_)

    # Raw linear scores on the held-out split (X_test / y_test from train_test_split)
    y_scores = X_test @ w

    # Tune the cut-off for this lambda
    threshold, f1, acc = optimize_threshold(y_test, y_scores)

    # Report the lambda under test. This line used to print `gamma`, a
    # variable left over from another cell, which mislabelled every row and
    # would raise NameError if this cell ran before any gamma was defined.
    print(f"Lambda: {lambda_}, Best Threshold: {threshold}, F1 Score: {f1}, Accuracy: {acc}")

    # Keep this candidate if it beats the best F1 so far
    if f1 > best_f1:
        best_f1 = f1
        best_lambda = lambda_
        best_threshold = threshold
        best_w = w.copy()

# Re-score the held-out split with the winning weights and threshold.
# NOTE(review): the threshold was tuned on this same split, so the metrics
# below are optimistic estimates rather than true test performance.
y_scores = X_test @ best_w
y_pred = np.where(y_scores >= best_threshold, 1, -1)

# Final F1 on the held-out split
final_f1_score = f1_score(y_test, y_pred)

print("Best Lambda:", best_lambda)
print("Best Threshold:", best_threshold)
print("Final F1 Score:", final_f1_score)
print("Final Accuracy:", accuracy(y_test, y_pred))


Testing lambda: 0.005
Gamma: 0.01, Best Threshold: -0.5786245959336018, F1 Score: 0.4106614017769003, Accuracy: 0.8635470157099974
Testing lambda: 0.01
Gamma: 0.01, Best Threshold: -0.5782330978537572, F1 Score: 0.41038171971998416, Accuracy: 0.8639584317430326
Testing lambda: 0.015
Gamma: 0.01, Best Threshold: -0.5778854457415188, F1 Score: 0.41129673826571206, Accuracy: 0.864689838023984
Testing lambda: 0.02
Gamma: 0.01, Best Threshold: -0.5753453598351297, F1 Score: 0.41125251172136634, Accuracy: 0.866061224800768
Best Lambda: 0.015
Best Threshold: -0.5778854457415188
Final F1 Score: 0.41129673826571206
Final Accuracy: 0.864689838023984


# Logistic Regression

In [19]:
# Search the step size (gamma) for logistic regression, tuning the probability
# threshold on the held-out split (X_test / y_test from train_test_split).

# Remap the labels with (y + 1) // 2, i.e. -1 -> 0 and 1 -> 1
y_train_binary = (y_train + 1) // 2
y_test_binary = (y_test + 1) // 2

gamma_values = [0.9]

# Best configuration seen so far. The sentinel is -1 rather than 0 so that a
# candidate whose F1 is exactly 0 is still recorded; with 0 and the strict
# comparison below, best_w could stay None and the final scoring would crash.
best_gamma = None
best_threshold = None
best_f1 = -1
best_w = None

for gamma in gamma_values:
    print(f"Testing gamma: {gamma}")
    # Every candidate starts from the same all-zero weight vector
    initial_w = np.zeros(X_train.shape[1])
    # Fit the weights with this step size
    w, loss = logistic_regression(y_train_binary, X_train, initial_w, max_iters=1000, gamma=gamma)

    # Sigmoid of the linear scores on the held-out split
    y_scores = sigmoid(X_test @ w)

    # Tune the cut-off for this gamma against the 0/1 labels
    threshold, f1, acc = optimize_threshold_logistic(y_test_binary, y_scores)

    print(f"Gamma: {gamma}, Best Threshold: {threshold}, F1 Score: {f1}, Accuracy: {acc}")

    # Keep this candidate if it beats the best F1 so far
    if f1 > best_f1:
        best_f1 = f1
        best_gamma = gamma
        best_threshold = threshold
        best_w = w.copy()

# Fail with a clear message instead of a TypeError from `X_test @ None`
# (reachable when gamma_values is empty or every F1 is NaN).
if best_w is None:
    raise RuntimeError("No gamma produced a usable F1 score; check gamma_values and the training output.")

# Re-score the held-out split with the winning weights and threshold.
# NOTE(review): the threshold was tuned on this same split, so the metric
# below is an optimistic estimate rather than true test performance.
y_scores = sigmoid(X_test @ best_w)
y_prediction_binary = np.where(y_scores >= best_threshold, 1, 0)
y_prediction = y_prediction_binary * 2 - 1  # Same predictions in the -1 / 1 encoding

# Final F1 on the held-out split, computed on the 0/1 encoding
final_f1_score = f1_score_logistic(y_test_binary, y_prediction_binary)

print("Best Gamma:", best_gamma)
print("Best Threshold:", best_threshold)
print("Final F1 Score:", final_f1_score)


Testing gamma: 0.9
Current iteration=0, loss=1.580226449126937
Current iteration=100, loss=0.32482471721749684
Current iteration=200, loss=0.3391803220447881
Current iteration=300, loss=0.2693260509636259
Current iteration=400, loss=0.2942305602379432
Current iteration=500, loss=0.2557766921885954
Current iteration=600, loss=0.28532196319066044
Current iteration=700, loss=0.2554614885016538
Current iteration=800, loss=0.2843560949045877
Current iteration=900, loss=0.25585764983762965
Gamma: 0.9, Best Threshold: 0.30303030303030304, F1 Score: 0.42230278784848113, Accuracy: 0.859173815655142


TypeError: 'numpy.float64' object is not callable

# Regularized Logistic Regression

In [18]:
# Search step size (gamma) and regularisation strength (lambda) for regularised
# logistic regression, tuning the probability threshold on the held-out split.

# Remap the labels with (y + 1) // 2, i.e. -1 -> 0 and 1 -> 1
y_train_binary = (y_train + 1) // 2
y_test_binary = (y_test + 1) // 2

# Candidate values for each hyper-parameter
gamma_values = [0.9]
lambda_values = [0.0001]

# Running best; the -1 sentinel means the first candidate is always accepted
best_gamma, best_lambda, best_threshold, best_w = None, None, None, None
best_f1 = -1

# Visit every (gamma, lambda) pair, gamma varying slowest
for gamma, lambda_ in [(g, lam) for g in gamma_values for lam in lambda_values]:
    print(f"Testing gamma: {gamma}, lambda: {lambda_}")

    # Each pair is trained from the same all-zero weight vector.
    # NOTE(review): max_iters is 10 here, while the output recorded under this
    # cell runs past iteration 500 - confirm which setting is intended.
    initial_w = np.zeros(X_train.shape[1])
    w, loss = reg_logistic_regression(y_train_binary, X_train, lambda_, initial_w, max_iters=10, gamma=gamma)

    # Sigmoid of the linear scores on the held-out split, then the cut-off for this pair
    y_scores = sigmoid(X_test @ w)
    threshold, f1, acc = optimize_threshold_logistic(y_test_binary, y_scores)

    print(f"Gamma: {gamma}, Lambda: {lambda_}, Best Threshold: {threshold}, F1 Score: {f1}, Accuracy: {acc}")

    # Remember this pair when it improves on the best F1 so far
    if f1 > best_f1:
        best_f1, best_gamma, best_lambda = f1, gamma, lambda_
        best_threshold, best_w = threshold, w.copy()

# Score the held-out split again with the winning weights and threshold
y_scores = sigmoid(X_test @ best_w)
y_prediction_binary = np.where(y_scores >= best_threshold, 1, 0)
y_prediction = 2 * y_prediction_binary - 1  # same predictions in the -1 / 1 encoding

# F1 of the winning configuration, computed on the 0/1 encoding
final_f1_score = f1_score_logistic(y_test_binary, y_prediction_binary)

print("Best Gamma:", best_gamma)
print("Best Lambda:", best_lambda)
print("Best Threshold:", best_threshold)
print("Final F1 Score:", final_f1_score)

Testing gamma: 0.9, lambda: 0.0001
Current iteration=0, loss=1.580926190366109 (with regularization)
Current iteration=100, loss=0.25310248661438456 (with regularization)
Current iteration=200, loss=0.3810539437599975 (with regularization)
Current iteration=300, loss=0.36750626878536896 (with regularization)
Current iteration=400, loss=0.38258974982361066 (with regularization)
Current iteration=500, loss=0.40461582835656396 (with regularization)


KeyboardInterrupt: 

In [10]:

# Fit the final regularised logistic model and write the submission file.

# Convert the training labels from {-1, 1} to binary {0, 1}
y_train_binary = (y_train + 1) // 2

# Hyper-parameters fixed by hand for the final fit
best_lambda = 0.0001
best_gamma = 0.9                        # Learning rate for the logistic regression model
# Cut-off applied to the predicted probabilities. It is a fixed value, not
# tuned in this cell: the scores below are for x_test, which has no labels in
# this notebook to tune against. TODO: record which validation run produced it.
SUBMISSION_THRESHOLD = 0.393939393939394
initial_w = np.zeros(X_train.shape[1])  # Initial weights set to zeros

# Train the model.
# NOTE(review): this fits on X_train (the training part of the split), not on
# the full x_train returned by the loader - confirm that is intended.
w, loss = reg_logistic_regression(y_train_binary, X_train, best_lambda, initial_w, max_iters=1000, gamma=best_gamma)

# Score the submission set: this is x_test from the loader (lower-case x),
# not the X_test hold-out split used for tuning in the cells above.
y_scores = sigmoid(x_test @ w)

# Label in the original -1 / 1 encoding
y_pred = np.where(y_scores >= SUBMISSION_THRESHOLD, 1, -1)

# Create a CSV submission file with the predictions
create_csv_submission(test_ids, y_pred, "new_submission.csv")

Current iteration=0, loss=1.580926190366109 (with regularization)
Current iteration=100, loss=0.25310248661438456 (with regularization)
Current iteration=200, loss=0.3810539437599975 (with regularization)
Current iteration=300, loss=0.36750626878536896 (with regularization)
Current iteration=400, loss=0.38258974982361066 (with regularization)
Current iteration=500, loss=0.40461582835656396 (with regularization)
Current iteration=600, loss=0.3800112928687103 (with regularization)
Current iteration=700, loss=0.40224673244121306 (with regularization)
Current iteration=800, loss=0.3720865397839348 (with regularization)
Current iteration=900, loss=0.39900342153832924 (with regularization)
