### Importing data


In [16]:
import os
import pandas as pd
from IPython.display import display, Markdown


LOCATION = os.getcwd()

# Dataset Columns
TITLE = "Title"
FACT_CHECK = "Fact Check"
BYLINE = "Byline"
DATE = "Date"
AUTHOR = "Author"
LINK = "Link"

dataset_df = pd.read_csv(os.path.join(LOCATION, "fake_news_dataset.csv"))

display(Markdown(dataset_df.head(1).to_markdown()))

|    | Title                                                               | Fact Check   | Byline                  | Date          | Author       | Link                                                                      |
|---:|:--------------------------------------------------------------------|:-------------|:------------------------|:--------------|:-------------|:--------------------------------------------------------------------------|
|  0 | The Story of Disney Deleting 1-800-SPANK-ME from 'The Santa Clause' | true         | "Ho ho ho," said Santa. | Dec. 23, 2022 | Jordan Liles | https://www.snopes.com/fact-check/santa-clause-deleted-scene-1800spankme/ |

### Verifying all fact check categories

In [42]:
group_by_factcheck = dataset_df.groupby(FACT_CHECK).size().reset_index(name="Count")

display(Markdown(group_by_factcheck.sort_values(["Count"], ascending=False).to_markdown(index=False)))

| Fact Check                                                          |   Count |
|:--------------------------------------------------------------------|--------:|
| false                                                               |    2501 |
| true                                                                |    1079 |
| miscaptioned                                                        |     493 |
| mixture                                                             |     470 |
| labeled satire                                                      |     394 |
| unproven                                                            |     255 |
| mostly false                                                        |     209 |
| scam                                                                |     197 |
| correct attribution                                                 |     132 |
| undefined                                                           |     122 |
| mostly true                                                         |     122 |
| fake                                                                |     112 |
| outdated                                                            |      80 |
| undetermined                                                        |      65 |
| misattributed                                                       |      55 |
| originated as satire                                                |      37 |
| unfounded                                                           |      23 |
| real photographs; inaccurate description                            |      22 |
| research in progress                                                |      18 |
| real photograph; inaccurate description                             |      17 |
| legend                                                              |      16 |
| real photograph;  inaccurate description                            |      15 |
| multiple — see below                                                |      12 |
| partly true                                                         |       9 |
| mixture:                                                            |       8 |
| real photographs;  inaccurate description                           |       8 |
| legit                                                               |       7 |
| not any more                                                        |       6 |
| hoax                                                                |       4 |
| real photos; inaccurate description                                 |       4 |
| incorrectly attributed                                              |       3 |
| real photo; inaccurate description                                  |       3 |
| probably false                                                      |       3 |
| real picture; inaccurate description                                |       3 |
| real pictures; inaccurate description                               |       2 |
| correctly attributed                                                |       2 |
| real                                                                |       2 |
| real fraud which costs its victims between $200 and $500            |       2 |
| multiple:                                                           |       2 |
| recall                                                              |       1 |
| resolved                                                            |       1 |
| two real entries, the others are fiction                            |       1 |
| sort of                                                             |       1 |
| real photograph;  false description                                 |       1 |
| false                                                               |       1 |
| real photograph                                                     |       1 |
| possible, but not common                                            |       1 |
| incomplete                                                          |       1 |
| mixed attribution                                                   |       1 |
| mixture of correct and  incorrect attributions                      |       1 |
| mixture of real and fake images                                     |       1 |
| mostly false:                                                       |       1 |
| multiple — see below:                                               |       1 |
| not quite                                                           |       1 |
| probably true                                                       |       1 |
| real photo; undetermined description                                |       1 |
| real fraud                                                          |       1 |
| real fraud which costs its victims around $250                      |       1 |
| real fraud which costs its victims hundreds to thousands of dollars |       1 |
| real fraud which costs its victims thousands of dollars             |       1 |
| real fraud which generally costs its victim between $100 and $1,000 |       1 |
| real fraud which generally costs its victims between $200 and $500  |       1 |
| real fraud, potential for financial harm unknown                    |       1 |
| was true; now outdated                                              |       1 |

In [53]:
def convert_check(string: str) -> str:
    mapping = {
        "correct attribution":      "true",
        "correctly attributed":     "true",
        "legit":                    "true",
        "miscaptioned":             "true",
        "mostly true":              "true",
        "not any more":             "true",
        "partly true":              "true",
        "probably true":            "true",
        "real":                     "true",
        "recall":                   "true",
        "resolved":                 "true",
        "scam":                     "true",
        "false description":        "false",
        "fake":                     "false",
        "hoax":                     "false",
        "inaccurate description":   "false",
        "incorrectly attributed":   "false",
        "labeled satire":           "false",
        "legend":                   "false",
        "misattributed":            "false",
        "mixture":                  "false",
        "mostly false":             "false",
        "mostly false:":            "false",
        "not quite":                "false",
        "originated as satire":     "false",
        "possible, but not common": "false",
        "probably false":           "false",
    } 
    
    return mapping.get(string, string)

normalized_df = dataset_df.copy(deep=True)
normalized_df[FACT_CHECK] = normalized_df[FACT_CHECK].apply(convert_check)
filtered_df = normalized_df[(normalized_df[FACT_CHECK] == "true") | (normalized_df[FACT_CHECK] == "false")]

print(f"Tabela não normalizada: {len(normalized_df)} registros")
print(f"Tabela normalizada: {len(filtered_df)} registros")

display(Markdown(
    filtered_df
    .groupby(FACT_CHECK)
    .size()
    .reset_index(name="Count")
    .to_markdown(index=False)
))

Tabela não normalizada: 6539 registros
Tabela normalizada: 5859 registros


| Fact Check   |   Count |
|:-------------|--------:|
| false        |    3807 |
| true         |    2052 |

In [54]:
filtered_df.to_csv(os.path.join(LOCATION, "final_fakenews_dataset.csv"), index=False)
