In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import pickle

# Tasks 1–3 (load the precomputed TF-IDF vectors and labels)

In [6]:
# Each pickle holds a (vectors, labels) pair for one data split.
# NOTE: unpickling can run arbitrary code, so only open files produced by this project.
with open('tfidf_train_vectors_with_labels.pickle', 'rb') as train_file:
    tfidf_train_vectors, train_labels = pickle.load(train_file)

with open('tfidf_val_vectors_with_labels.pickle', 'rb') as val_file:
    tfidf_val_vectors, val_labels = pickle.load(val_file)

# Task 4: train and tune a logistic regression classifier

In [7]:
# Baseline: logistic regression with default hyperparameters, fitted on the
# training TF-IDF vectors. LogisticRegression and accuracy_score are provided
# by the imports cell at the top of the notebook.
logreg_model = LogisticRegression(random_state=32)
logreg_model.fit(tfidf_train_vectors, train_labels)

# Score on the validation split only; the test split is loaded and evaluated
# later, after the model has been selected and saved.
val_predictions_logreg = logreg_model.predict(tfidf_val_vectors)
val_accuracy_logreg = accuracy_score(val_labels, val_predictions_logreg)
print("Validation Accuracy (Logistic Regression):", val_accuracy_logreg)


Validation Accuracy (Logistic Regression): 0.9339622641509434


In [8]:
# Hyperparameter search for logistic regression on the training split
# (tfidf_train_vectors / train_labels). LogisticRegression and GridSearchCV
# are provided by the imports cell at the top of the notebook.
parameters_logreg = {
    'C': [0.001, 0.01, 0.1, 1],  # Inverse of regularization strength: smaller means a stronger penalty
    'penalty': ['l1', 'l2'],  # Regularization penalty
    'solver': ['liblinear', 'saga'],  # Both solvers accept the l1 and l2 penalties
}

# NOTE(review): this rebinds logreg_model. The baseline fitted in the previous
# cell is replaced by an unfitted template that GridSearchCV clones per candidate.
logreg_model = LogisticRegression(random_state=42)

# 5-fold cross-validation on the training data, ranking candidates by accuracy.
grid_search_logreg = GridSearchCV(logreg_model, parameters_logreg, cv=5, scoring='accuracy')
grid_search_logreg.fit(tfidf_train_vectors, train_labels)

# best_estimator_ is the winning configuration refitted on the whole training
# split (GridSearchCV's default refit=True).
best_hyperparameters_logreg = grid_search_logreg.best_params_
best_model = grid_search_logreg.best_estimator_
print("Best Hyperparameters (Logistic Regression):", best_hyperparameters_logreg)




Best Hyperparameters (Logistic Regression): {'C': 1, 'penalty': 'l2', 'solver': 'saga'}


In [9]:

# Check the tuned model against the validation split, using the same metric
# as the baseline so the two numbers are directly comparable.
val_predictions_log = best_model.predict(tfidf_val_vectors)
val_accuracy_log = accuracy_score(y_true=val_labels, y_pred=val_predictions_log)
print("Validation Accuracy (Logistic Regression):", val_accuracy_log)

Validation Accuracy (Logistic Regression): 0.9339622641509434


In [10]:
# Persist the tuned model so the evaluation below can run from the saved
# artifact. The context manager closes (and therefore flushes) the file before
# any later cell re-opens it; a bare open() inside the call leaves that to the
# garbage collector.
filename = "log_model.pickle"
with open(filename, "wb") as model_file:
    pickle.dump(best_model, model_file)

# Task 5: evaluate the saved model on the test set

In [11]:
# Reload the model from the pickle written in the save cell (`filename`).
# NOTE: unpickling can run arbitrary code, so only load files this notebook wrote.
with open(filename, "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [13]:
# The test split is stored as a (vectors, labels) pair, like the other splits.
# NOTE: unpickling can run arbitrary code, so only open files produced by this project.
with open('tfidf_test_vectors_with_labels.pickle', 'rb') as test_file:
    tfidf_test_vectors, test_labels = pickle.load(test_file)

In [15]:
# Final evaluation: score the reloaded model on the held-out test split.
test_predictions = loaded_model.predict(tfidf_test_vectors)
y_true = test_labels

# confusion_matrix rows are true labels and columns are predicted labels.
conf_matrix = confusion_matrix(y_true, test_predictions)
accuracy = accuracy_score(y_true, test_predictions)

# Label 1 is treated as the positive class (assumed to be 'real' -- TODO confirm
# against the labelling step that produced the pickles).
precision, recall, f1 = (
    metric(y_true, test_predictions, pos_label=1)
    for metric in (precision_score, recall_score, f1_score)
)

print("Confusion Matrix:")
print(conf_matrix)
print("\n".join(
    f'{name}: {score:.5f}'
    for name, score in (
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1-score", f1),
    )
))


Confusion Matrix:
[[452  44]
 [ 31 533]]
Accuracy: 0.92925
Precision: 0.92374
Recall: 0.94504
F1-score: 0.93427


In [16]:
# Per-class precision / recall / F1 and support for the test predictions.
# classification_report is provided by the imports cell at the top of the notebook.
print(classification_report(y_true, test_predictions))

              precision    recall  f1-score   support

           0       0.94      0.91      0.92       496
           1       0.92      0.95      0.93       564

    accuracy                           0.93      1060
   macro avg       0.93      0.93      0.93      1060
weighted avg       0.93      0.93      0.93      1060

