# Prediction Model

## Importing libraries and files

In [12]:
import pandas as pd
import numpy as np
import nltk
from os.path import join
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from IPython.display import display, Markdown
from random import randint

# Downloading NLTK data to use later
nltk.download('stopwords')

base_dataset = pd.read_csv(join("data", "dataset.csv"))

TITLE = "Title"
F_CHECK = "Fact Check"

[nltk_data] Downloading package stopwords to /home/gustav-
[nltk_data]     campos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Processing Dataset

In [39]:
def clean_text(text, stop_words):
    processed_text = "".join(ch for ch in text if ch.isalnum() or ch.isspace()).lower()
    filtered_words = [word for word in processed_text.split(" ") if word not in stop_words]
    return " ".join(filtered_words)

stop_words = set(nltk.corpus.stopwords.words('english'))

processed_dataset = base_dataset[[TITLE, F_CHECK]].copy(deep=True)

processed_dataset[TITLE] = processed_dataset[TITLE].apply(
    lambda n: clean_text(n, stop_words)
)

display(Markdown(base_dataset[[TITLE, F_CHECK]].head(5).to_markdown(index=False)))
display(Markdown(processed_dataset.head(5).to_markdown(index=False)))

| Title                                                               | Fact Check   |
|:--------------------------------------------------------------------|:-------------|
| The Story of Disney Deleting 1-800-SPANK-ME from 'The Santa Clause' | True         |
| Yes, LA County Granted a Young Girl's Request for a Unicorn License | True         |
| Are More People Killed By Donkeys Than Airplane Crashes Annually?   | False        |
| Can Poll Workers Invalidate Ballots by Writing on Them?             | False        |
| Did Trump Decry Low-Flow Showers and Dishwashers During a Pandemic? | True         |

| Title                                                     | Fact Check   |
|:----------------------------------------------------------|:-------------|
| story disney deleting 1800spankme santa clause            | True         |
| yes la county granted young girls request unicorn license | True         |
| people killed donkeys airplane crashes annually           | False        |
| poll workers invalidate ballots writing                   | False        |
| trump decry lowflow showers dishwashers pandemic          | True         |

## Creating datasets

In [58]:
raw_x_train, raw_x_test, y_train, y_test = train_test_split(
    processed_dataset[TITLE],
    processed_dataset[F_CHECK],
    test_size=0.5
)

print(f"Train sample size: {len(raw_x_train)}")
print(f"Test sample size: {len(raw_x_test)}")

print(raw_x_train.head(1))
print(raw_x_test.head(1))

Train sample size: 4101
Test sample size: 1758
3942    hotel owner switzerland ask jewish guests show...
Name: Title, dtype: object
1934    album called laid lord band called golden egg
Name: Title, dtype: object


## Formatting data

In [59]:
vectorizer = TfidfVectorizer()

x_train = vectorizer.fit_transform(raw_x_train)
x_test = vectorizer.transform(raw_x_test)

display(x_train)

<4101x8376 sparse matrix of type '<class 'numpy.float64'>'
	with 25914 stored elements in Compressed Sparse Row format>

#  Train Naive Bayes

In [60]:
gnb = GaussianNB()
gnb.fit(x_train.toarray(), y_train)

y_pred = gnb.predict(x_test.toarray())

preds = list(zip(raw_x_test, y_pred))
diffs = list(zip(y_test, y_pred))
for pred, diff in list(zip(preds, diffs))[:10]:
    print(pred, diff)

('album called laid lord band called golden egg', False) (False, False)
('dont bite shark tank weight loss scam promising 50lbs 61 days', True) (True, True)
('hazmat suits needed install 5g cellphone towers', False) (False, False)
('santas canadian postal code', False) (True, False)
('venezuelan poodle moth real', True) (False, True)
('starbucks going cashless uk us canada', False) (False, False)
('full transcript call flight 93 911', True) (False, True)
('bill nye say gender determined chromosomes', False) (False, False)
('photograph playground 1900', True) (True, True)
('picture gay lions mating', True) (True, True)


## Metrics

In [61]:
tp = 0
tn = 0
fp = 0
fn = 0

for i in diffs:
    real = "1" if i[0] else "0"
    predicted = "1" if i[1] else "0"
    matrix = f"{real}{predicted}"
    match matrix:
        case "11":
            tp += 1
        case "01":
            fp += 1
        case "10":
            fn += 1
        case "00":
            tn += 1

matrix = ((tp, fn), (fp, tn))
markdownDataframe = pd.DataFrame(matrix, index=["Real - Verdadeiro", "Real - Falso"], columns=["Predito - Verdadeiro", "Predito - Falso"])

display(Markdown(markdownDataframe.to_markdown(index=True)))

accuracy = (tp+tn)/len(diffs)
specificity = tn/(fp+tn)
recall = tp/(tp+fn)
precision = tp/(tp+fp)
f_score = 2 * ((precision * recall) / (precision + recall))

print(f"Acuracia = {round(accuracy * 100,2)}%")
print(f"Especificidade = {round(specificity * 100, 2)}%")
print(f"Sensibilidade = {round(recall * 100, 2)}%")
print(f"Precisão = {round(precision * 100, 2)}%")

print(f"Tamanho do conjunto de teste: {tp+tn+fp+fn}")
# accuracy = accuracy_score(y_test, y_pred)
# cm = confusion_matrix(y_test, y_pred)
# report = classification_report(y_test, y_pred)

# print(f'Accuracy: {accuracy}')
# print(f'Confusion Matrix: {cm}')
# print(f'Classification Report: {report}')

|                   |   Predito - Verdadeiro |   Predito - Falso |
|:------------------|-----------------------:|------------------:|
| Real - Verdadeiro |                    355 |               236 |
| Real - Falso      |                    506 |               661 |

Acuracia = 57.79%
Especificidade = 56.64%
Sensibilidade = 60.07%
Precisão = 41.23%
Tamanho do conjunto de teste: 1758


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6c691cc8-2b4f-40d4-afcd-7c91cbe8e76d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>