# Naive Bayes (Trained On Human GossipCop)
https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [1]:
# IMPORTS
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score
import nltk
from sklearn.metrics import classification_report
# Fetch the NLTK resources used by the preprocessing step below: the tokenizer
# models (word_tokenize), the stop-word lists (stopwords), the POS tagger
# models (pos_tag) and WordNet (WordNetLemmatizer / wn).
# NOTE(review): two tagger resource names are requested, presumably so the
# notebook works across NLTK versions that name the resource differently.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
from sklearn.feature_extraction.text import CountVectorizer


# SET SEED
# Seeds NumPy's global random number generator so that any NumPy-based
# randomness executed after this point is repeatable between runs.
np.random.seed(500)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jacobshort/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jacobshort/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jacobshort/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jacobshort/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/jacobshort/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [8]:
# Load datasets
# The train and validation CSVs are read separately and then stacked into one
# training corpus; ignore_index=True gives the combined frame a fresh 0..n-1
# index instead of keeping the two files' own row numbers.
# NOTE(review): the paths are absolute and machine-specific; adjust them
# before running this notebook elsewhere.
original_only_train_corpus = pd.read_csv("/Applications/AI/msc_project/data/my_gossipcop_train.csv")
original_only_validation_corpus = pd.read_csv("/Applications/AI/msc_project/data/my_gossipcop_validation.csv")
original_train_corpus = pd.concat([original_only_train_corpus, original_only_validation_corpus], ignore_index=True)

# Preprocess function
def preprocess(corpus):
    """Clean and lemmatise the ``text`` column of *corpus*.

    Every document is lower-cased, word-tokenised and POS-tagged. English
    stop words and tokens that are not purely alphabetic are dropped, the
    remaining tokens are lemmatised with WordNet (using the POS tag to pick
    noun/verb/adjective/adverb), and the result is joined back into one
    space-separated string.

    Args:
        corpus: DataFrame with a ``text`` column of strings and an
            ``is_true`` label column. It is not modified.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is the ``text_final`` Series of
        cleaned strings and ``Y`` is the ``is_true`` Series. Both carry the
        same index as *corpus*.
    """
    tokenised_corpus = corpus.copy()

    # Build these once: stopwords.words() produces a fresh list on every
    # call, so calling it per token is very slow, and a set gives O(1)
    # membership tests instead of a list scan.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Map the first letter of a Penn Treebank tag to a WordNet POS constant;
    # anything that is not adjective/verb/adverb is treated as a noun.
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    final_texts = []
    for entry in tokenised_corpus['text']:
        tokens = word_tokenize(entry.lower())
        final_words = [
            lemmatizer.lemmatize(word, tag_map[tag[0]])
            for word, tag in pos_tag(tokens)
            if word not in english_stopwords and word.isalpha()
        ]
        # Convert the list of words back to a single string
        final_texts.append(' '.join(final_words))

    # Assign the whole column positionally. Writing row by row with
    # .loc[<enumerate position>] only works when the index is exactly
    # 0..n-1, and never creates the column for an empty corpus.
    tokenised_corpus['text_final'] = final_texts

    X = tokenised_corpus['text_final']
    Y = tokenised_corpus['is_true']

    return X, Y

# Preprocess the training data
# train_X receives the cleaned 'text_final' strings and train_Y the 'is_true'
# labels of the combined train + validation corpus.
train_X, train_Y = preprocess(original_train_corpus)

# Train Naive Bayes Model
def train_nb(Train_X, Train_Y):
    """Fit a multinomial Naive Bayes classifier and return it.

    Args:
        Train_X: Feature matrix accepted by ``MultinomialNB.fit``.
        Train_Y: Target labels aligned with the rows of ``Train_X``.

    Returns:
        The fitted ``MultinomialNB`` estimator.
    """
    # fit() hands back the estimator itself, so the fitted model can be
    # returned straight from the call chain.
    return naive_bayes.MultinomialNB().fit(Train_X, Train_Y)

# Vectorize text data using CountVectorizer
# The vocabulary is learned from the training text only and capped at 5000
# terms; the same fitted vectorizer is reused later to transform test sets.
Count_vect = CountVectorizer(max_features=5000)    
Count_vect.fit(train_X)
train_X_counts = Count_vect.transform(train_X)

# Fit the Naive Bayes model on the training count matrix and labels.
model = train_nb(train_X_counts, train_Y)

# Prediction and metrics function for basic case
def predictions_and_metrics_basic(model, original_df, Count_vect):
    """Predict labels for *original_df* and summarise the results.

    Args:
        model: Fitted classifier exposing ``predict``.
        original_df: DataFrame with ``text``, ``text_final`` and ``is_true``
            columns.
        Count_vect: Fitted vectorizer used to transform ``text_final``.

    Returns:
        A tuple ``(results_df, success_rate, metrics)``: the per-row results
        frame, the percentage of label-0 rows that were predicted as 0
        (0 when there are no label-0 rows), and the text classification
        report.
    """
    # Vectorise the preprocessed text and run the classifier on it
    features = Count_vect.transform(original_df['text_final'])
    predicted = model.predict(features)

    results_df = pd.DataFrame({
        'Text': original_df['text'],
        'Predicted Label': predicted,
        'Original Label': original_df['is_true'],
    })

    # Share of true label-0 rows that the model also labelled 0
    truly_zero = results_df['Original Label'] == 0
    zero_total = int(truly_zero.sum())
    zero_caught = int((truly_zero & (results_df['Predicted Label'] == 0)).sum())
    if zero_total > 0:
        success_rate = 100 * (zero_caught / zero_total)
    else:
        success_rate = 0

    metrics = classification_report(
        results_df['Original Label'],
        results_df['Predicted Label'],
    )

    return results_df, success_rate, metrics

def _misinformation_success_rate(results):
    """Return the percentage of label-0 rows in *results* predicted as 0.

    Returns 0 when *results* contains no label-0 rows.
    """
    misinformation = results[results['Original Label'] == 0]
    total = misinformation.shape[0]
    if total == 0:
        return 0
    correct = misinformation[misinformation['Predicted Label'] == 0].shape[0]
    return 100 * (correct / total)


# Prediction and metrics function for classwise case
def predictions_and_metrics_classwise(model, original_df, test_X_counts):
    """Predict labels and report success rates overall and per category.

    Args:
        model: Fitted classifier exposing ``predict``.
        original_df: DataFrame with ``text``, ``label`` (category) and
            ``is_true`` columns, row-aligned with ``test_X_counts``.
        test_X_counts: Already-vectorised feature matrix for the rows of
            ``original_df``.

    Returns:
        A tuple ``(results_df, success_rate, classwise_success_rates,
        metrics)``: the per-row results frame, the overall percentage of
        label-0 rows predicted as 0, a dict mapping each category to that
        same percentage computed within the category, and the text
        classification report. Rates are 0 where there are no label-0 rows.
    """
    # Use the model to make predictions directly on the provided count matrix
    predictions_NB = model.predict(test_X_counts)

    results_df = pd.DataFrame({
        'Text': original_df['text'],
        'Category': original_df['label'],
        'Predicted Label': predictions_NB,
        'Original Label': original_df['is_true']
    })

    # Compute overall success rate
    success_rate = _misinformation_success_rate(results_df)

    # Compute class-wise success rates, in order of first appearance
    classwise_success_rates = {
        category: _misinformation_success_rate(
            results_df[results_df['Category'] == category]
        )
        for category in results_df['Category'].unique()
    }

    # zero_division=0 reports undefined precision/recall as 0 (the same
    # value the default produces) without emitting a warning per metric when
    # a label is absent from the true or predicted labels.
    metrics = classification_report(
        results_df['Original Label'],
        results_df['Predicted Label'],
        zero_division=0,
    )

    return results_df, success_rate, classwise_success_rates, metrics


# HUMAN TEST SET
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
original_human_test_corpus = pd.read_csv("/Applications/AI/msc_project/data/my_gossipcop_test.csv")
test_X, test_Y = preprocess(original_human_test_corpus)

# Add 'text_final' to original_human_test_corpus
# (predictions_and_metrics_basic reads this column and vectorises it itself)
original_human_test_corpus['text_final'] = test_X

# Generate predictions and metrics
# success_rate is the percentage of label-0 rows that were predicted as 0.
results_df, success_rate, metrics = predictions_and_metrics_basic(model, original_human_test_corpus, Count_vect)
print(success_rate)
print(metrics)
# NOTE(review): display() is not imported in this file; it is presumably the
# IPython/Jupyter built-in, so this line needs a notebook environment.
display(results_df)


74.92163009404389
              precision    recall  f1-score   support

           0       0.55      0.75      0.64       957
           1       0.91      0.81      0.86      3045

    accuracy                           0.80      4002
   macro avg       0.73      0.78      0.75      4002
weighted avg       0.83      0.80      0.80      4002



Unnamed: 0,Text,Predicted Label,Original Label
0,HollywoodLife is turning Robert Pattinson ’s n...,0,0
1,In its quest to launch a hit fantasy series of...,1,1
2,The budding romance between Josh Duhamel and E...,0,1
3,"She may not be at the Super Bowl, but she cert...",1,1
4,"Do not be tardy for this dance, Friday night ...",1,1
...,...,...,...
3997,Ever wonder what Hollywood stars are whisperin...,1,1
3998,Julia Louis-Dreyfus just wrapped her second ro...,1,1
3999,She’s had enough! Cheryl Cole shut down a repo...,0,1
4000,It's Official! The Bachelor's Arie Luyendyk Jr...,1,1


In [9]:
# LLM TEST SET
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
original_llm_test_corpus = pd.read_csv("/Applications/AI/msc_project/data/my_llm_fake_gossipcop_test.csv")
test_X, test_Y = preprocess(original_llm_test_corpus)

# Add 'text_final' to original_llm_test_corpus
original_llm_test_corpus['text_final'] = test_X

# Transform the test data using CountVectorizer
# (the vectorizer fitted on the training text is reused, not refitted)
test_X_counts = Count_vect.transform(test_X)

# Generate predictions and metrics for classwise evaluation
# classwise_success_rates maps each value of the 'label' column to the
# percentage of its label-0 rows that were predicted as 0.
results_df, success_rate, classwise_success_rates, metrics = predictions_and_metrics_classwise(model, original_llm_test_corpus, test_X_counts)
print(success_rate)
print(metrics)
print(classwise_success_rates)


72.66187050359713
              precision    recall  f1-score   support

           0       1.00      0.73      0.84       139
           1       0.00      0.00      0.00         0

    accuracy                           0.73       139
   macro avg       0.50      0.36      0.42       139
weighted avg       1.00      0.73      0.84       139

{'llm_paraphrase': 73.91304347826086, 'llm_rewritten': 76.59574468085107, 'llm_open_generation': 67.3913043478261}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
