# **Carregar dados dos datasets de notícias falsas e verdadeiras**

### Adiciona o caminho de src para permitir importações deste diretório

In [1]:
import os
import sys
from pathlib import Path

# Project root (one level above /notebooks). A notebook kernel normally has no
# `__file__`, so fall back to the parent of the current working directory.
# NOTE(review): the fallback assumes the kernel was started inside notebooks/ - confirm.
BASE_DIR = Path(__file__).resolve().parent.parent if '__file__' in globals() else Path.cwd().parent

# Path to the src/ folder
SRC_DIR = BASE_DIR / "src"

# Make BASE_DIR and SRC_DIR importable. Only missing entries are appended, so
# re-running this cell does not pile up duplicates in sys.path.
for _import_root in (str(BASE_DIR), str(SRC_DIR)):
    if _import_root not in sys.path:
        sys.path.append(_import_root)

### Importação dos datasets

In [2]:
from src.data_loader import load_dataset


# Locations of the raw datasets. Fake.csv and True.csv are expected inside
# data/raw/. The folder itself is created here when missing, but the CSV files
# must be placed there before running this cell.
raw_data_dir = os.path.join(BASE_DIR, "data", "raw")
# exist_ok=True already tolerates an existing folder, so no prior existence check is needed.
os.makedirs(raw_data_dir, exist_ok=True)
fake_dataset_path = os.path.join(raw_data_dir, "Fake.csv")
real_dataset_path = os.path.join(raw_data_dir, "True.csv")

# The result is unpacked as: fake-news frame, real-news frame, and the combined frame.
fake_dataset, real_dataset, concat_dataset = load_dataset(fake_dataset_path, real_dataset_path)

### Amostras de exemplo dos datasets

In [3]:
print("Fake Dataset:")
print("Formato: ", fake_dataset.shape)
# Fixed seed so the previewed rows are identical on every Restart & Run All.
fake_dataset.sample(5, random_state=42)

Fake Dataset:
Formato:  (23481, 5)


Unnamed: 0,title,text,subject,date,label
17668,WATCH: ARROGANT Sexual Predator and Sometimes ...,The not so funny comedian Louis C.K. appeared ...,left-news,"Nov 9, 2017",0
13524,"DAY AFTER DALLAS COPS’ MEMORIAL, Obama Invites...","When America elected a Community Agitator, the...",politics,"Jul 13, 2016",0
2516,WATCH: Morning Joe Host Tells Kellyanne Conwa...,Kellyanne Conway is definitely going to whine ...,News,"February 15, 2017",0
223,Hustler Owner Is So Fed Up With Trump That He...,Donald Trump is a danger to the nation and the...,News,"October 14, 2017",0
16060,BREAKING: President Trump Makes FBI Pick One D...,President Trump has nominated Christopher Wray...,Government News,"Jun 7, 2017",0


In [4]:
print("Real Dataset:")
print("Formato: ", real_dataset.shape)
# Fixed seed so the previewed rows are identical on every Restart & Run All.
real_dataset.sample(5, random_state=42)

Real Dataset:
Formato:  (21417, 5)


Unnamed: 0,title,text,subject,date,label
15691,Thousands march against coal ahead of climate ...,FRANKFURT (Reuters) - Thousands of people took...,worldnews,"November 4, 2017",1
3269,About 19.5 million U.S. viewers watched Comey ...,(Reuters) - Former FBI Director James Comey’s ...,politicsNews,"June 9, 2017",1
15950,German railway under fire for proposal to name...,BERLIN (Reuters) - German rail operator Deutsc...,worldnews,"November 1, 2017",1
5901,Dozens barred from flying out of Istanbul airp...,ISTANBUL (Reuters) - Dozens of people were sto...,politicsNews,"January 29, 2017",1
603,U.S. Interior Secretary fails to provide paper...,WASHINGTON (Reuters) - The U.S. Interior Depar...,politicsNews,"November 16, 2017",1


In [5]:
print("Dataset Concatenado:")
print("Formato: ", concat_dataset.shape)
# Fixed seed so the previewed rows are identical on every Restart & Run All.
concat_dataset.sample(5, random_state=42)

Dataset Concatenado:
Formato:  (44898, 5)


Unnamed: 0,title,text,subject,date,label
20630,CONSERVATIVE MOM And Cruz Supporter Goes All I...,A message from a 100% FED UP mom to her Facebo...,left-news,"May 4, 2016",0
3536,New Poll Shows Republicans Hilariously Abando...,Republicans apparently have no idea what they ...,News,"December 6, 2016",0
27960,U.S. labor chief wants more drastic changes to...,WASHINGTON (Reuters) - Top U.S. labor leader R...,politicsNews,"April 4, 2017",1
30820,Pollsters who predicted Trump win benefit from...,NEW YORK (Reuters) - A handful of small public...,politicsNews,"November 11, 2016",1
1108,Christians Can Now Discriminate Against LGBT ...,It is now possible for religious adoption agen...,News,"June 16, 2017",0


# **Pré-processamento do dataset concatenado**

In [6]:
from src.preprocessing import preprocess_dataframe


# Run the project's preprocessing over the combined dataset; the result is kept
# under a new name so concat_dataset stays available unchanged by this cell's assignment.
processed_df = preprocess_dataframe(concat_dataset)

print("Dataset Processado:")
print("Formato: ", processed_df.shape)
# Fixed seed so the previewed rows are identical on every Restart & Run All.
processed_df.sample(5, random_state=42)

Dataset Processado:
Formato:  (44889, 7)


Unnamed: 0,title,text,subject,date,label,full_text,full_text_clean
2728,Democrats Launch Plan To Make Trump’s First S...,Alleged president Donald Trump has literally m...,News,"February 2, 2017",0,Democrats Launch Plan To Make Trump’s First S...,democrat launch plan make trump first speech c...
3363,Conservatives Demand Purge Of LGBT Employees ...,Conservatives apparently want a witch hunt in ...,News,"December 18, 2016",0,Conservatives Demand Purge Of LGBT Employees ...,conservative demand purge lgbt employee state ...
23069,Washington Post Deceives Public & Profits From...,21st Century Wire says The Washington Post has...,Middle-east,"January 7, 2017",0,Washington Post Deceives Public & Profits From...,washington post deceives public profit fake ne...
18853,WHY DEMOCRATS Can Thank HARRY REID For Replaci...,"In 2013, Senate Majority Leader Harry Reid use...",left-news,"Apr 7, 2017",0,WHY DEMOCRATS Can Thank HARRY REID For Replaci...,democrat thank harry reid replacing justice sc...
33247,Valeant to pay new CEO Papa base salary of $1....,(Reuters) - Drugmaker Valeant Pharmaceuticals ...,politicsNews,"April 27, 2016",1,Valeant to pay new CEO Papa base salary of $1....,valeant pay new ceo papa base salary million d...


# **Armazenamento do dataset concatenado processado**

In [7]:
# Save the processed dataset as CSV under data/processed/. The folder is created
# here when it does not exist yet; exist_ok=True makes a prior existence check unnecessary.
processed_dir = os.path.join(BASE_DIR, "data", "processed")
os.makedirs(processed_dir, exist_ok=True)
# index=False keeps the dataframe index out of the written file.
processed_df.to_csv(os.path.join(processed_dir, "processed_dataset.csv"), index=False)