In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the combined CSV, then keep only the rows whose label is NOT one of
# the three flip/flop values; every other label stays in `data`.
excluded_labels = ['full-flop', 'half-flip', 'no-flip']
data = pd.read_csv('politifact_data_combined.csv')
data = data[~data['label'].isin(excluded_labels)]

def _combine_text(frame):
    """Return one document per row: `content` and `article` joined by a single space."""
    return frame['content'] + " " + frame['article']


# ---------------------------------------------------------------------------
# Stage 1: binary classification for 'pants-fire' (1) vs. everything else (0)
# ---------------------------------------------------------------------------
X_pants_fire = data[['content', 'article']]
y_pants_fire = (data['label'] == 'pants-fire').astype(int)

# This is the ONLY train/test split in the pipeline. Stage 2 reuses it below,
# so a row held out here is never seen by either model during training.
X_train_pf, X_test_pf, y_train_pf, y_test_pf = train_test_split(X_pants_fire, y_pants_fire, test_size=0.2, random_state=42)

# Vectorizing the text data without n-grams
tfidf_pf = TfidfVectorizer(stop_words='english')
X_train_tfidf_pf = tfidf_pf.fit_transform(_combine_text(X_train_pf))
X_test_tfidf_pf = tfidf_pf.transform(_combine_text(X_test_pf))

# Binary classification model for 'pants-fire'
classifier_pf = LogisticRegression(max_iter=2500)
classifier_pf.fit(X_train_tfidf_pf, y_train_pf)

y_pred_pf = classifier_pf.predict(X_test_tfidf_pf)

accuracy_pf = accuracy_score(y_test_pf, y_pred_pf)
print(f"Pants-fire Binary Classification Accuracy: {accuracy_pf}\n")

print("Pants-fire Binary Classification Report:\n")
print(classification_report(y_test_pf, y_pred_pf))

# ---------------------------------------------------------------------------
# Stage 2: multi-class classification for the rest of the labels
# ---------------------------------------------------------------------------
not_pants_fire = data['label'] != 'pants-fire'
X_multi = data.loc[not_pants_fire, ['content', 'article']]
y_multi = data.loc[not_pants_fire, 'label']

# Carve the stage-2 train/test sets out of the stage-1 split instead of
# drawing a second, independent split. An independent split would put some
# stage-1 test rows into the stage-2 training set, which makes any end-to-end
# score on the hold-out optimistic.
# Rows with y == 0 are exactly the rows whose label is not 'pants-fire',
# so every selected index is present in y_multi.
X_train_multi = X_train_pf[y_train_pf == 0]
X_test_multi = X_test_pf[y_test_pf == 0]
y_train_multi = y_multi.loc[X_train_multi.index]
y_test_multi = y_multi.loc[X_test_multi.index]

# Vectorizing the text data with n-grams
tfidf_multi = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_tfidf_multi = tfidf_multi.fit_transform(_combine_text(X_train_multi))
X_test_tfidf_multi = tfidf_multi.transform(_combine_text(X_test_multi))

# Multi-class classification model
classifier_multi = LogisticRegression(max_iter=2500)
classifier_multi.fit(X_train_tfidf_multi, y_train_multi)

# Stage 2 scored in isolation, i.e. only on rows that truly are not 'pants-fire'.
y_pred_multi = classifier_multi.predict(X_test_tfidf_multi)

accuracy_multi = accuracy_score(y_test_multi, y_pred_multi)
print(f"Multi-Class Classification Accuracy: {accuracy_multi}\n")

print("Multi-Class Classification Report:\n")
print(classification_report(y_test_multi, y_pred_multi))

# ---------------------------------------------------------------------------
# End-to-end evaluation of the two-stage cascade on the shared hold-out set
# ---------------------------------------------------------------------------
# String views of the stage-1 truth/predictions; not used below, retained so
# any later cell that references these names keeps working.
y_test_pf_str = y_test_pf.astype(str)
y_pred_pf_str = pd.Series(y_pred_pf, index=y_test_pf.index).astype(str)

# Stage 2 is asked about every held-out row, including true 'pants-fire' rows:
# when stage 1 misses one, stage 2 is what actually labels it in the cascade.
stage2_on_holdout = pd.Series(
    classifier_multi.predict(tfidf_multi.transform(_combine_text(X_test_pf))),
    index=X_test_pf.index,
)
stage1_says_pants_fire = pd.Series(y_pred_pf, index=X_test_pf.index).astype(bool)

# Final prediction: 'pants-fire' where stage 1 fires, otherwise stage 2's label.
y_pred_combined = stage2_on_holdout.mask(stage1_says_pants_fire, 'pants-fire')
# Ground truth is the original label of each held-out row (assumes `data` has
# a unique index, as the default index produced by read_csv does).
y_true_combined = data.loc[X_test_pf.index, 'label']

# Compute overall accuracy: one prediction per held-out row, one label space.
accuracy_combined = accuracy_score(y_true_combined, y_pred_combined)
print(f"Overall Accuracy: {accuracy_combined}\n")

Pants-fire Binary Classification Accuracy: 0.9116251482799526

Pants-fire Binary Classification Report:

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      4385
           1       0.93      0.36      0.52       673

    accuracy                           0.91      5058
   macro avg       0.92      0.68      0.74      5058
weighted avg       0.91      0.91      0.89      5058

