# Prediction Model

## Importing libraries and files

In [2]:
import pandas as pd
import numpy as np
import math
from os.path import join
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from IPython.display import display, Markdown
from random import randint


base_dataset = pd.read_csv(join("data", "dataset.csv"))

## Creating datasets

In [3]:
raw_x_train, raw_x_test, y_train, y_test = train_test_split(
    base_dataset["Title"],
    base_dataset["Fact Check"],
    test_size=0.2
)

print(f"Train sample size: {len(raw_x_train)}")
print(f"Test sample size: {len(raw_x_test)}")

print(raw_x_train.head(1))
print(raw_x_test.head(1))

Train sample size: 4687
Test sample size: 1172
3400    Do These Photographs Show a Homeless Veteran D...
Name: Title, dtype: object
1696    No, This Isn't a Real Headline About Cuba and ...
Name: Title, dtype: object


## Formatting data

In [4]:
vectorizer = TfidfVectorizer()

x_train = vectorizer.fit_transform(raw_x_train)
x_test = vectorizer.transform(raw_x_test)

display(x_train)

<4687x8756 sparse matrix of type '<class 'numpy.float64'>'
	with 41903 stored elements in Compressed Sparse Row format>

#  Train Naive Bayes

In [5]:
gnb = GaussianNB()
gnb.fit(x_train.toarray(), y_train)

y_pred = gnb.predict(x_test.toarray())

preds = list(zip(raw_x_test, y_pred))
diffs = list(zip(y_test, y_pred))
for pred, diff in zip(preds, diffs):
    print(pred, diff)

("No, This Isn't a Real Headline About Cuba and Woodchippers", True) (False, True)
("Are Human Fetuses 'Taiwan's Hottest Dish'?", False) (True, False)
('Christopher John Mineo, Jr', True) (False, True)
("Sylvester Stallone Turned Down Work with Robert De Niro Over His 'Wokeness'?", False) (False, False)
('The Beatles Posed with Decapitated Baby Dolls in Real Album Cover Photo?', True) (True, True)
('Pig Fish', True) (False, True)
('Is This a Hedgehog Skeleton?', True) (True, True)
('Did Malia Obama Cause 24 Classmates to Be Expelled for Praying on Campus?', False) (False, False)
('Is There a Dedicated Hall & Oates Phone Line?', False) (True, False)
("Did MyPillow CEO Mike Lindell Say His Social Network Would Ban 'Taking God's Name in Vain'?", False) (True, False)
("Did Trump Say 'Laziness Is a Trait in Blacks; No Black President Again Any Time Soon'?", False) (False, False)
('Did First COVID-19 Vaccine Recipient Suspiciously Appear in News Report Months Earlier?', False) (False, False)

## Metrics

In [6]:
# tp = 0
# tn = 0
# fp = 0
# fn = 0

# for i in diffs:
#     matrix = f"{i[0]}{i[1]}"
#     match matrix:
#         case "11":
#             tp += 1
#         case "01":
#             fp += 1
#         case "10":
#             fn += 1
#         case "00":
#             tn += 1

# accuracy = (tp+tn)/len(diffs)
# specificity = tn/(fp+tn)
# recall = tp/(tp+fn)
# precision = tp/(tp+fp)
# f_score = 2 * ((precision * recall) / (precision + recall))

accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix: {cm}')
print(f'Classification Report: {report}')

# print(f"Acuracia = {accuracy}")
# print(f"Especificidade = {specificity}")
# print(f"Sensibilidade = {recall}")
# print(f"Precisão = {precision}")

Accuracy: 0.5699658703071673
Confusion Matrix: [[414 322]
 [182 254]]
Classification Report:               precision    recall  f1-score   support

       False       0.69      0.56      0.62       736
        True       0.44      0.58      0.50       436

    accuracy                           0.57      1172
   macro avg       0.57      0.57      0.56      1172
weighted avg       0.60      0.57      0.58      1172



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6c691cc8-2b4f-40d4-afcd-7c91cbe8e76d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>