# Model Training and Evaluation

# Model 1: Logistic Regression
- L2 Regularization (6 different penalty values).
- Model evaluation with training and validation accuracy.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score

# -Model 1: Logistic Regression swept over six L2 regularization strengths-
from sklearn.linear_model import LogisticRegression

# Penalty strengths to try; each is passed to the estimator as C = 1 / penalty.
l2_penalties = [0.001, 0.01, 0.1, 1, 10, 100]
l2_penalty_names = ['coefficients [L2={:.0e}]'.format(l2_penalty)
                    for l2_penalty in l2_penalties]

# One record per penalty value: the penalty plus train/validation accuracy.
accuracy_data_model1 = []

for l2_penalty, l2_penalty_column_name in zip(l2_penalties, l2_penalty_names):
    # Fit a fresh, intercept-free model for this penalty value.
    model_1 = LogisticRegression(
        penalty='l2',
        C=1 / l2_penalty,
        fit_intercept=False,
        max_iter=10000,
    )
    model_1.fit(X_train, y_train)

    # Accuracy on each split via the estimator's own score().
    train_accuracy = model_1.score(X_train, y_train)
    val_accuracy = model_1.score(X_val, y_val)
    accuracy_data_model1.append(dict(
        l2_penalty=l2_penalty,
        train_accuracy=train_accuracy,
        validation_accuracy=val_accuracy,
    ))

    # Validation accuracy recomputed from explicit predictions.
    y_pred_model_1 = model_1.predict(X_val)
    y_pred_class = y_pred_model_1
    accuracy = accuracy_score(y_val, y_pred_class)

    print(accuracy)
    print(f"Model 1 (Logistic Regression) with L2 penalty {l2_penalty}: Validation Accuracy = {val_accuracy}")

# Tabular view of the sweep, one row per penalty value.
accuracies_table_1 = pd.DataFrame(accuracy_data_model1)


# -Checking Accuracies tables-
# NOTE: accuracies_table_2 is assigned in the Model 2 (KNN) cell further down,
# so that cell must have been executed before this one.
print(accuracies_table_1)
print(accuracies_table_2)

# -Plotting Training and Validation Accuracies-
# Both models are drawn on one figure; the x-axis is shared by the two
# hyperparameters (L2 penalty for Model 1, n_neighbors for Model 2).
plt.figure(figsize = (10,6))

# The plotted columns are accuracies, so the legend entries say "Accuracy"
# (they previously read "Error", contradicting the title and the y-axis label).
plt.plot(accuracies_table_1['l2_penalty'], accuracies_table_1['train_accuracy'], color = 'red', label = 'Logistic Regression Train Accuracy')
plt.plot(accuracies_table_1['l2_penalty'], accuracies_table_1['validation_accuracy'], color = 'blue', linestyle = '--', label = 'Logistic Regression Validation Accuracy')

plt.plot(accuracies_table_2['n_neighbors'], accuracies_table_2['train_accuracy'], color = 'green', label = 'KNN Train Accuracy')
plt.plot(accuracies_table_2['n_neighbors'], accuracies_table_2['validation_accuracy'], color = 'purple', linestyle = '--', label = 'KNN Validation Accuracy')

plt.title('Model 1 (Logistic Regression) and Model 2 (K-Nearest Neighbors) Accuracy')
plt.xlabel('Hyperparameter')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.xlim(0,101)
plt.show()

# Model 2: K-Nearest Neighbors (KNN)
- Tried five neighbor counts (5, 10, 15, 20, 25).
- Compared results and evaluated based on validation accuracy.

In [None]:
# -Model 2: K-Nearest Neighbors (KNN) swept over five neighbor counts-
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbor counts to evaluate.
n_neighbors = [5, 10, 15, 20, 25]

# One record per neighbor count: the count plus train/validation accuracy.
accuracy_data_model2 = []

for n_neighbor in n_neighbors:
    # Fit a fresh classifier for this neighbor count.
    model_2 = KNeighborsClassifier(n_neighbors=n_neighbor)
    model_2.fit(X_train, y_train)

    # Accuracy on each split via the estimator's own score().
    train_accuracy = model_2.score(X_train, y_train)
    val_accuracy = model_2.score(X_val, y_val)
    accuracy_data_model2.append(dict(
        n_neighbors=n_neighbor,
        train_accuracy=train_accuracy,
        validation_accuracy=val_accuracy,
    ))

    y_pred_model_2 = model_2.predict(X_val)
    y_pred_class = y_pred_model_2

    print(f"Model 2 (KNN) with {n_neighbor} neighbors: Validation Accuracy = {val_accuracy}")

    # Validation accuracy recomputed from explicit predictions.
    accuracy = accuracy_score(y_val, y_pred_class)
    print(accuracy)

# Tabular view of the sweep, one row per neighbor count.
accuracies_table_2 = pd.DataFrame(accuracy_data_model2)

# Model Selection
- Model 1: Logistic Regression selected based on highest validation accuracy.

In [None]:
# -Refit Logistic Regression at its best validation-accuracy penalty and predict on the test set-
# `best_model` is the sweep record (a dict) with the highest validation accuracy,
# not an estimator; its penalty value is used to retrain on the training split.
best_model = max(accuracy_data_model1, key=lambda record: record['validation_accuracy'])
model_1 = LogisticRegression(
    penalty='l2',
    C=1 / best_model['l2_penalty'],
    fit_intercept=False,
    max_iter=10000,
)
model_1.fit(X_train, y_train)

# Strip the ID column, then pass the remaining columns through `imputer`
# (defined outside this cell).
# NOTE(review): df_test is rebound to the transformed output, so re-running this
# cell will likely fail at drop() -- confirm whether that matters.
df_test = df_test.drop(columns=['userid_DI'])
df_test = imputer.transform(df_test)

# Predict and coerce each label to a plain Python int.
predictions_lr = model_1.predict(df_test)
predictions_1 = list(map(int, predictions_lr))

# -Saving to submission.csv file-
# IDs are re-read from the raw CSV; presumably its row order matches the rows
# that were predicted on -- verify.
df_test_submission = pd.read_csv('edx_test.csv')
to_save = pd.DataFrame({
    'userid_DI': df_test_submission['userid_DI'],
    'certified': predictions_1,
})
to_save.to_csv('submission.csv', index=False)
