## Logistic Ridge Regression Implementation

A naive logistic regression baseline, used as a point of comparison for more advanced modeling approaches.

In [1]:
from joblib import load
#from pickle import dump, load
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd


# NOTE(review): joblib.load unpickles arbitrary Python objects, so it can execute
# code embedded in the file. Only load .pkl files from a source you trust.

# Target variable for the training data. It is paired with both the TF-IDF and
# the Count Vectorizer training matrices below, so it only needs loading once.
target = load('../pkl_files/target.pkl')

# Saved TF-IDF feature matrices for the training and test data
model_train_tfidf = load('../pkl_files/model_train_tfidf.pkl')
model_test_tfidf = load('../pkl_files/model_test_tfidf.pkl')

# Saved Count Vectorizer feature matrices for the training and test data
model_train_count = load('../pkl_files/model_train_count.pkl')
model_test_count = load('../pkl_files/model_test_count.pkl')


Working with the TF-IDF vectorized data only, split the training set into training and validation subsets.

In [2]:
# Hold out 20% of the TF-IDF training rows for validation; the fixed
# random_state keeps the split the same on every run.
(
    X_validation_train_tfidf,
    X_validation_test_tfidf,
    y_validation_train_tfidf,
    y_validation_test_tfidf,
) = train_test_split(
    model_train_tfidf,
    target,
    test_size=0.2,
    random_state=42,
)

Fit an L2-regularized (ridge) logistic regression model on the TF-IDF vectorized training split and evaluate it on the validation split.

In [None]:
# Train an L2-penalised logistic regression on the TF-IDF training split
ridge_logistic_model = LogisticRegression(solver='liblinear', penalty='l2').fit(
    X_validation_train_tfidf,
    y_validation_train_tfidf,
)

# Predict labels for the held-out validation split
y_val_pred_tfidf = ridge_logistic_model.predict(X_validation_test_tfidf)

# Score the predictions: accuracy, weighted-average F1, and the per-class report
val_accuracy = accuracy_score(y_validation_test_tfidf, y_val_pred_tfidf)
val_f1_score = f1_score(
    y_validation_test_tfidf,
    y_val_pred_tfidf,
    average='weighted',
)
val_report = classification_report(y_validation_test_tfidf, y_val_pred_tfidf)

# Show the results, one item per line, with a single print call
print(
    f'Validation Accuracy: {val_accuracy}',
    f'Validation F1 Score: {val_f1_score}',
    'Validation Classification Report:',
    val_report,
    sep='\n',
)

Validation Accuracy: 0.768220617202889
Validation F1 Score: 0.7675215881936576
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80       874
           1       0.74      0.71      0.72       649

    accuracy                           0.77      1523
   macro avg       0.76      0.76      0.76      1523
weighted avg       0.77      0.77      0.77      1523



Refit the model on the entire TF-IDF training set and use it to predict target values for the test set.

In [4]:
# Refit the same estimator on the full TF-IDF training matrix (fit returns the
# estimator itself), then predict labels for the TF-IDF test matrix
y_ridge_log_pred_tfidf = ridge_logistic_model.fit(model_train_tfidf, target).predict(model_test_tfidf)

# Read the sample submission file and set its `target` column to the predictions
tfidf_ridge_log_reg_submission = pd.read_csv("../Data/sample_submission.csv").assign(
    target=y_ridge_log_pred_tfidf
)

# Write the submission CSV, leaving out the DataFrame index
tfidf_ridge_log_reg_submission.to_csv('../csv_files/tfidf_ridge_log_reg_submission.csv', index=False)

Working with the Count Vectorizer data only, split the training set into training and validation subsets.

In [5]:
# Hold out 20% of the Count Vectorizer training rows for validation; the fixed
# random_state keeps the split the same on every run.
(
    X_validation_train_count,
    X_validation_test_count,
    y_validation_train_count,
    y_validation_test_count,
) = train_test_split(
    model_train_count,
    target,
    test_size=0.2,
    random_state=42,
)

Fit an L2-regularized (ridge) logistic regression model on the Count Vectorizer training split and evaluate it on the validation split.

In [6]:
# Train an L2-penalised logistic regression on the Count Vectorizer training split
ridge_logistic_model = LogisticRegression(solver='liblinear', penalty='l2').fit(
    X_validation_train_count,
    y_validation_train_count,
)

# Predict labels for the held-out validation split
y_val_pred_count = ridge_logistic_model.predict(X_validation_test_count)

# Score the predictions: accuracy, weighted-average F1, and the per-class report
val_accuracy = accuracy_score(y_validation_test_count, y_val_pred_count)
val_f1_score = f1_score(
    y_validation_test_count,
    y_val_pred_count,
    average='weighted',
)
val_report = classification_report(y_validation_test_count, y_val_pred_count)

# Show the results, one item per line, with a single print call
print(
    f'Validation Accuracy: {val_accuracy}',
    f'Validation F1 Score: {val_f1_score}',
    'Validation Classification Report:',
    val_report,
    sep='\n',
)

Validation Accuracy: 0.7767564018384767
Validation F1 Score: 0.7756428847274106
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       874
           1       0.75      0.71      0.73       649

    accuracy                           0.78      1523
   macro avg       0.77      0.77      0.77      1523
weighted avg       0.78      0.78      0.78      1523



Refit the model on the entire Count Vectorizer training set and use it to predict target values for the test set.

In [7]:
# Refit the same estimator on the full Count Vectorizer training matrix (fit
# returns the estimator itself), then predict labels for the Count Vectorizer test matrix
y_ridge_log_pred_count = ridge_logistic_model.fit(model_train_count, target).predict(model_test_count)

# Read the sample submission file and set its `target` column to the predictions
count_ridge_log_reg_submission = pd.read_csv("../Data/sample_submission.csv").assign(
    target=y_ridge_log_pred_count
)

# Write the submission CSV, leaving out the DataFrame index
count_ridge_log_reg_submission.to_csv('../csv_files/count_ridge_log_reg_submission.csv', index=False)