# Prediction Model

## Importing libraries and files

In [9]:
import pandas as pd
import numpy as np
import nltk
import joblib
from os.path import join
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from IPython.display import display, Markdown
from random import randint

# Column names referenced throughout the notebook
TITLE = "Title"
F_CHECK = "Fact Check"

# Fetch the NLTK stop-word corpus; it is read later when cleaning the titles
nltk.download('stopwords')

# Load the raw dataset from disk
dataset_path = join("data", "dataset.csv")
base_dataset = pd.read_csv(dataset_path)

# Preview five randomly chosen rows, rendered as a Markdown table
preview_rows = base_dataset.sample(5)
display(Markdown(preview_rows.to_markdown()))

[nltk_data] Downloading package stopwords to /home/gustav-
[nltk_data]     campos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


|      | Title                                                          | Fact Check   | Byline                                                                                                                                       | Date           | Author                 | Link                                                                   |
|-----:|:---------------------------------------------------------------|:-------------|:---------------------------------------------------------------------------------------------------------------------------------------------|:---------------|:-----------------------|:-----------------------------------------------------------------------|
| 5080 | 'Free Disney Theme Park Tickets' Scam                          | True         | Disney is not giving away free theme park tickets to those who like and share posts on Facebook; such offers are a form of sweepstakes scam. | April 30, 2012 | David Mikkelson        | https://www.snopes.com/fact-check/free-disney-theme-park-tickets-scam/ |
|  437 | Does This Video Show a Film Crew Staging Footage in Gaza?      | True         | “Oh God, have mercy on us,” the caption said for one Instagram post that spread the claim.                                                   | Nov. 10, 2023  | Izz Scott LaMagdeleine | https://www.snopes.com/fact-check/film-crew-footage-gaza/              |
| 4573 | Jack Black Death Hoax                                          | False        | A rumor circulated that Jack Black had suddenly died in June 2016 after the actor's Twitter account was hacked.                              | June 5, 2016   | Dan Evon               | https://www.snopes.com/fact-check/jack-black-death-hoax/               |
| 3086 | Was 'Trump' Uttered 1,326 Times During the Democratic Debates? | False        | If numbers matter, this meme was only off by a thousand-plus.                                                                                | July 1, 2019   | Dan Evon               | https://www.snopes.com/fact-check/trump-said-1326-times-debates/       |
| 5015 | Cheerleader Poops at Football Game?                            | False        | This cheerleader was unlucky only in becoming the subject of a viral meme due to a Reddit competition.                                       | Oct. 6, 2014   | Snopes Staff           | https://www.snopes.com/fact-check/cheerleader-accident/                |

# Processing Dataset

In [10]:
def clean_text(text, stop_words):
    """Normalize a title: lowercase it, strip punctuation and drop stop words.

    Args:
        text: Raw title. Anything that is not a ``str`` (e.g. ``None`` or a
            NaN produced by pandas for a missing cell) is treated as empty.
        stop_words: Container of lowercase stop words supporting ``in``.

    Returns:
        The remaining words joined by single spaces; ``""`` when nothing
        is left.
    """
    if not isinstance(text, str):
        return ""

    kept_words = []
    # split() with no argument splits on any whitespace run, so consecutive
    # spaces, tabs and newlines never produce empty tokens.
    for token in text.split():
        # Check the raw token first: stop lists may contain contractions
        # such as "don't", which would no longer match once the apostrophe
        # has been stripped.
        if token.lower() in stop_words:
            continue
        word = "".join(ch for ch in token if ch.isalnum()).lower()
        # Check again after stripping so "the," or "(and)" are removed too;
        # tokens made only of punctuation collapse to "" and are skipped.
        if word and word not in stop_words:
            kept_words.append(word)
    return " ".join(kept_words)

# English stop words as a set for fast membership checks
stop_words = set(nltk.corpus.stopwords.words('english'))

# Work on an independent copy holding only the two columns the model uses,
# so base_dataset stays untouched for the before/after comparison below
selected_columns = [TITLE, F_CHECK]
processed_dataset = base_dataset.loc[:, selected_columns].copy()

# Clean every title; the stop-word set is forwarded as the second argument
processed_dataset[TITLE] = processed_dataset[TITLE].apply(
    clean_text, args=(stop_words,)
)

# Show the first rows before and after cleaning
original_preview = base_dataset[selected_columns].head(5)
cleaned_preview = processed_dataset.head(5)
display(Markdown(original_preview.to_markdown(index=False)))
display(Markdown(cleaned_preview.to_markdown(index=False)))

| Title                                                               | Fact Check   |
|:--------------------------------------------------------------------|:-------------|
| The Story of Disney Deleting 1-800-SPANK-ME from 'The Santa Clause' | True         |
| Yes, LA County Granted a Young Girl's Request for a Unicorn License | True         |
| Are More People Killed By Donkeys Than Airplane Crashes Annually?   | False        |
| Can Poll Workers Invalidate Ballots by Writing on Them?             | False        |
| Did Trump Decry Low-Flow Showers and Dishwashers During a Pandemic? | True         |

| Title                                                     | Fact Check   |
|:----------------------------------------------------------|:-------------|
| story disney deleting 1800spankme santa clause            | True         |
| yes la county granted young girls request unicorn license | True         |
| people killed donkeys airplane crashes annually           | False        |
| poll workers invalidate ballots writing                   | False        |
| trump decry lowflow showers dishwashers pandemic          | True         |

## Creating datasets

In [11]:
# Fixed seed so the split is identical on every Restart & Run All. Without
# it, the exported test CSV could contain rows that a model saved by a
# different run was trained on.
SPLIT_SEED = 42

raw_x_train, raw_x_test, y_train, y_test = train_test_split(
    processed_dataset[TITLE],
    processed_dataset[F_CHECK],
    test_size=0.5,
    random_state=SPLIT_SEED,
)

# Sanity check: every row landed on exactly one side of the split
assert len(raw_x_train) + len(raw_x_test) == len(processed_dataset)

# Exporting test dataset (titles and labels side by side, index dropped)
export_df = pd.concat([raw_x_test, y_test], axis=1)
export_df.to_csv(join("data", "test_dataset.csv"), index=False)

print(f"Train sample size: {len(raw_x_train)}")
print(f"Test sample size: {len(raw_x_test)}")

print(raw_x_train.head(1))
print(raw_x_test.head(1))

Train sample size: 2929
Test sample size: 2930
2324    eric clapton unleash racist rant onstage
Name: Title, dtype: object
4405    hurricane hermine waterspout north carolina
Name: Title, dtype: object


## Formatting data (TF-IDF vectorization)

In [19]:
# Fit the TF-IDF vectorizer on the training titles only, then reuse the
# fitted vectorizer to encode the test titles
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(raw_x_train)
x_test = vectorizer.transform(raw_x_test)

# Export Vectorizer
vectorizer_path = join("models", "vectorizer.pkl")
joblib.dump(vectorizer, vectorizer_path)

# Show a summary of the training matrix
display(x_train)

<2929x6846 sparse matrix of type '<class 'numpy.float64'>'
	with 18319 stored elements in Compressed Sparse Row format>

# Training Naive Bayes

In [20]:
# x_train is a sparse matrix, so it is converted to a dense array for fitting.
# NOTE(review): the dense copy is large; confirm memory use is acceptable.
gnb = GaussianNB().fit(x_train.toarray(), y_train)
gnb

# Exporting Model

In [21]:
# Persist the fitted classifier under models/
model_path = join("models", "gnb_model.pkl")
joblib.dump(gnb, model_path)

['models/gnb_model.pkl']