## Logistic Ridge Regression Implementation

A baseline L2-regularized (ridge) logistic regression, used as a point of comparison for more advanced modeling approaches.

In [48]:
from joblib import load
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split

# NOTE(review): joblib.load unpickles its input, so these files should only
# come from a trusted source.

# TF-IDF artifacts: training feature matrix, training target variable, and the
# contents of X_test_tfidf.joblib (presumably the test feature matrix).
X_train_tfidf_loaded, y_train_tfidf_loaded, X_test_tfidf_loaded = (
    load('X_train_tfidf.joblib'),
    load('y_train_tfidf.joblib'),
    load('X_test_tfidf.joblib'),
)

# Count Vectorizer artifacts, in the same order as the TF-IDF group above.
X_train_count_loaded, y_train_count_loaded, X_test_count_loaded = (
    load('X_train_count.joblib'),
    load('y_train_count.joblib'),
    load('X_test_count.joblib'),
)

Working with the TF-IDF vectorized data only, split the loaded training set into training and validation subsets.

In [49]:
# Hold out 20% of the loaded TF-IDF training data as a validation set; the
# fixed random_state keeps the shuffle reproducible between runs.
# NOTE(review): the split is not stratified on the target -- consider passing
# `stratify=` if preserving the class balance in both subsets matters.
X_train_tfidf, X_val_tfidf, y_train_tfidf, y_val_tfidf = train_test_split(
    X_train_tfidf_loaded,
    y_train_tfidf_loaded,
    test_size=0.2,
    random_state=42,
)

In [50]:
# Train an L2-penalised logistic regression on the TF-IDF training split.
# fit() returns the estimator itself, so construction and fitting are chained.
ridge_logistic_model = LogisticRegression(solver='liblinear', penalty='l2').fit(
    X_train_tfidf, y_train_tfidf
)

# Predict labels for the held-out validation split.
y_val_pred = ridge_logistic_model.predict(X_val_tfidf)

# Score the predictions. The F1 score is averaged with average='weighted';
# switch to 'macro' or 'micro' if a different aggregation is needed.
val_report = classification_report(y_val_tfidf, y_val_pred)
val_f1_score = f1_score(y_val_tfidf, y_val_pred, average='weighted')
val_accuracy = accuracy_score(y_val_tfidf, y_val_pred)

# Print the summary in a single call, one item per line.
print(
    f'Validation Accuracy: {val_accuracy}',
    f'Validation F1 Score: {val_f1_score}',
    'Validation Classification Report:',
    val_report,
    sep='\n',
)

Validation Accuracy: 0.7968232958305758
Validation F1 Score: 0.7926456423688162
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.89      0.84       874
           1       0.82      0.66      0.73       637

    accuracy                           0.80      1511
   macro avg       0.80      0.78      0.78      1511
weighted avg       0.80      0.80      0.79      1511



Working with the Count vectorized data only, split the loaded training set into training and validation subsets.

In [51]:
# Hold out 20% of the loaded Count Vectorizer training data as a validation
# set; the fixed random_state keeps the shuffle reproducible between runs.
# NOTE(review): the split is not stratified on the target -- consider passing
# `stratify=` if preserving the class balance in both subsets matters.
X_train_count, X_val_count, y_train_count, y_val_count = train_test_split(
    X_train_count_loaded,
    y_train_count_loaded,
    test_size=0.2,
    random_state=42,
)

In [52]:
# Train an L2-penalised logistic regression on the Count Vectorizer training
# split. fit() returns the estimator itself, so construction and fitting are
# chained.
# NOTE(review): this rebinds `ridge_logistic_model` (and, below, the val_*
# metric names), replacing the values produced by the TF-IDF cell -- confirm
# that no later cell still expects the TF-IDF model under these names.
ridge_logistic_model = LogisticRegression(solver='liblinear', penalty='l2').fit(
    X_train_count, y_train_count
)

# Predict labels for the held-out validation split.
y_val_pred_count = ridge_logistic_model.predict(X_val_count)

# Score the predictions. The F1 score is averaged with average='weighted';
# switch to 'macro' or 'micro' if a different aggregation is needed.
val_report = classification_report(y_val_count, y_val_pred_count)
val_f1_score = f1_score(y_val_count, y_val_pred_count, average='weighted')
val_accuracy = accuracy_score(y_val_count, y_val_pred_count)

# Print the summary in a single call, one item per line.
print(
    f'Validation Accuracy: {val_accuracy}',
    f'Validation F1 Score: {val_f1_score}',
    'Validation Classification Report:',
    val_report,
    sep='\n',
)

Validation Accuracy: 0.785572468563865
Validation F1 Score: 0.7839712435285466
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       874
           1       0.77      0.70      0.73       637

    accuracy                           0.79      1511
   macro avg       0.78      0.77      0.78      1511
weighted avg       0.78      0.79      0.78      1511

