# Prediction Model

## Importing libraries and files

In [1]:
import pandas as pd
import numpy as np
import math
from os.path import join
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from IPython.display import display, Markdown
from random import randint


base_dataset = pd.read_csv(join("data", "dataset.csv"))

## Creating datasets

In [2]:
raw_x_train, raw_x_test, y_train, y_test = train_test_split(
    base_dataset["Title"],
    base_dataset["Fact Check"],
    test_size=0.2
)

print(f"Train sample size: {len(raw_x_train)}")
print(f"Test sample size: {len(raw_x_test)}")

print(raw_x_train.head(1))
print(raw_x_test.head(1))

Train sample size: 4687
Test sample size: 1172
2195    Did Trump Write 'Joe, You Know I Won' in Lette...
Name: Title, dtype: object
838    No, This Is Not a Real Photo of Trump Being Ar...
Name: Title, dtype: object


## Formatting data

In [3]:
vectorizer = TfidfVectorizer()

x_train = vectorizer.fit_transform(raw_x_train)
x_test = vectorizer.transform(raw_x_test)

display(x_train)

<4687x8772 sparse matrix of type '<class 'numpy.float64'>'
	with 41987 stored elements in Compressed Sparse Row format>

#  Train Naive Bayes

In [5]:
gnb = GaussianNB()
gnb.fit(x_train.toarray(), y_train)

y_pred = gnb.predict(x_test.toarray())

preds = list(zip(raw_x_test, y_pred))
diffs = list(zip(y_test, y_pred))
for pred, diff in list(zip(preds, diffs))[:10]:
    print(pred, diff)

('No, This Is Not a Real Photo of Trump Being Arrested', True) (False, True)
('Will US Passport Holders Be Required to Get a Visa to Enter EU in 2024?', False) (False, False)
("Dick Smith Statement About Halal Certifications as 'Extortion'", False) (False, False)
("Does Photo Show 'The View' Host Joy Behar in Blackface?", False) (True, False)
('Is This a Photograph of Biden with Tara Reade?', False) (False, False)
("Does This Photograph Show a Workmen's Van Trapped by Bollards?", True) (True, True)
('Did Pat Robertson Blame Oral Sex for COVID-19?', False) (False, False)
("No, Everyone Is Not Winning This 'Disney World Holiday Box'", False) (True, False)
('No, Hunter Biden Did Not Die From a Drug Overdose', True) (False, True)
("Did Bill Gates' Father 'Run' Planned Parenthood and Teach His Son to Spread the 'Gospel' of Eugenics?", False) (False, False)


## Metrics

In [6]:
tp = 0
tn = 0
fp = 0
fn = 0

for i in diffs:
    real = "1" if i[0] else "0"
    predicted = "1" if i[1] else "0"
    matrix = f"{real}{predicted}"
    match matrix:
        case "11":
            tp += 1
        case "01":
            fp += 1
        case "10":
            fn += 1
        case "00":
            tn += 1

matrix = ((tp, fn), (fp, tn))
markdownDataframe = pd.DataFrame(matrix, index=["Real - Verdadeiro", "Real - Falso"], columns=["Predito - Verdadeiro", "Predito - Falso"])

display(Markdown(markdownDataframe.to_markdown(index=True)))

accuracy = (tp+tn)/len(diffs)
specificity = tn/(fp+tn)
recall = tp/(tp+fn)
precision = tp/(tp+fp)
f_score = 2 * ((precision * recall) / (precision + recall))

print(f"Acuracia = {round(accuracy * 100,2)}%")
print(f"Especificidade = {round(specificity * 100, 2)}%")
print(f"Sensibilidade = {round(recall * 100, 2)}%")
print(f"Precisão = {round(precision * 100, 2)}%")

print(f"Tamanho do conjunto de teste: {tp+tn+fp+fn}")
# accuracy = accuracy_score(y_test, y_pred)
# cm = confusion_matrix(y_test, y_pred)
# report = classification_report(y_test, y_pred)

# print(f'Accuracy: {accuracy}')
# print(f'Confusion Matrix: {cm}')
# print(f'Classification Report: {report}')

|                   |   Predito - Verdadeiro |   Predito - Falso |
|:------------------|-----------------------:|------------------:|
| Real - Verdadeiro |                    214 |               183 |
| Real - Falso      |                    339 |               436 |

Acuracia = 55.46%
Especificidade = 56.26%
Sensibilidade = 53.9%
Precisão = 38.7%
Tamanho do conjunto de teste: 1172


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6c691cc8-2b4f-40d4-afcd-7c91cbe8e76d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>