# **Carregar dados dos datasets de notícias falsas e verdadeiras**

### Adiciona o caminho de src para permitir importações deste diretório

In [1]:
import os
import sys
from pathlib import Path

# Project root. When run as a script it is two levels above this file; inside a
# notebook kernel `__file__` is undefined, so fall back to the parent of the
# current working directory (the original note places it one level above /notebooks).
BASE_DIR = Path(__file__).resolve().parent.parent if '__file__' in globals() else Path.cwd().parent

# Path to the src/ folder.
SRC_DIR = BASE_DIR / "src"


def add_to_sys_path(directory):
    """Append ``directory`` to ``sys.path`` unless it is already listed.

    Guarding the append keeps the cell idempotent: re-running it does not
    accumulate duplicate entries on the import search path.
    """
    entry = str(directory)
    if entry not in sys.path:
        sys.path.append(entry)


# Register both the project root and src/ on the import search path.
add_to_sys_path(BASE_DIR)
add_to_sys_path(SRC_DIR)

### Importação dos datasets

In [2]:
from src.data_loader import load_dataset


# Dataset locations. The raw files Fake.csv and True.csv must be placed in data/raw/.
# The folder is created when absent (exist_ok=True already tolerates an existing
# one, so no separate existence check is needed).
RAW_DIR = os.path.join(BASE_DIR, "data", "raw")
os.makedirs(RAW_DIR, exist_ok=True)

fake_dataset_path = os.path.join(RAW_DIR, "Fake.csv")
real_dataset_path = os.path.join(RAW_DIR, "True.csv")

# Fail early with an actionable message instead of handing non-existent paths
# to the loader right after creating an empty folder.
missing_files = [
    path for path in (fake_dataset_path, real_dataset_path) if not os.path.isfile(path)
]
if missing_files:
    raise FileNotFoundError(
        "Missing dataset file(s): " + ", ".join(missing_files)
        + ". Place Fake.csv and True.csv in " + RAW_DIR
    )

fake_dataset, real_dataset, concat_dataset = load_dataset(fake_dataset_path, real_dataset_path)

### Exemplos de saídas de amostras dos datasets

In [3]:
# Show the shape of the fake-news frame, then a small random preview.
print("Fake Dataset:")
print("Formato: ", fake_dataset.shape)
# A fixed random_state makes the preview reproducible across Restart & Run All.
fake_dataset.sample(5, random_state=42)

Fake Dataset:
Formato:  (23481, 5)


Unnamed: 0,title,text,subject,date,label
18947,BREAKING: STARBUCKS CEO To Step Down After Ple...,The arrogance of a CEO of a major company like...,left-news,"Mar 19, 2017",0
19220,“REVEREND” AL SHARPTON Makes Up Lie About Jesu...,Hey Al you do realize that Obama was more than...,left-news,"Jan 30, 2017",0
22242,SUNDAY SCREENING: Counter Intelligence – ‘The ...,21st Century Wire says Our weekly documentary ...,US_News,"January 29, 2017",0
3209,This Compilation Of Alec Baldwin HUMILIATING ...,Donald Trump threw a party to celebrate the Ne...,News,"December 31, 2016",0
9372,WHAT TRUMP JUST SAID Should Scare All Gropers ...,President Trump wants those who settled sexual...,politics,"Nov 21, 2017",0


In [4]:
# Show the shape of the real-news frame, then a small random preview.
print("Real Dataset:")
print("Formato: ", real_dataset.shape)
# A fixed random_state makes the preview reproducible across Restart & Run All.
real_dataset.sample(5, random_state=42)

Real Dataset:
Formato:  (21417, 5)


Unnamed: 0,title,text,subject,date,label
1933,House speaker urges Trump not to scrap 'Dreame...,WASHINGTON (Reuters) - House of Representative...,politicsNews,"September 1, 2017",1
5343,EU's Tusk says agreed with U.S. Pence on need ...,BRUSSELS (Reuters) - European Council Presiden...,politicsNews,"February 20, 2017",1
20769,UK Brexit minister says 'good prospect' of agr...,LONDON (Reuters) - There is a good prospect Br...,worldnews,"September 7, 2017",1
13593,Britain close to deal on Brexit bill with EU -...,BRUSSELS/LONDON (Reuters) - Britain has offere...,worldnews,"November 29, 2017",1
4645,Trump son-in-law met executives of sanctioned ...,MOSCOW/WASHINGTON (Reuters) - A Russian bank ...,politicsNews,"March 27, 2017",1


In [5]:
# Show the shape of the concatenated frame, then a small random preview.
print("Dataset Concatenado:")
print("Formato: ", concat_dataset.shape)
# A fixed random_state makes the preview reproducible across Restart & Run All.
concat_dataset.sample(5, random_state=42)

Dataset Concatenado:
Formato:  (44898, 5)


Unnamed: 0,title,text,subject,date,label
16839,DID IRAN RELEASE This Footage Of Captured U.S....,And what about the female sailor in the headsc...,Government News,"Jan 14, 2016",0
19804,BEN CARSON DESTROYS Interrupting Anti-Trump MS...,Notify the CDC. It's spreading. #BenCarson #Mo...,left-news,"Oct 14, 2016",0
40874,New Zealand likely to announce new government ...,WELLINGTON (Reuters) - New Zealand acting Prim...,worldnews,"October 15, 2017",1
59,"While Honoring Native American Code Talkers, ...",Former reality show star Donald Trump just can...,News,"November 27, 2017",0
24710,Republican Senator Collins likely 'yes' vote t...,WASHINGTON (Reuters) - U.S. Republican Senator...,politicsNews,"October 15, 2017",1


# **Pré-processamento do dataset concatenado**

In [6]:
from src.preprocessing import preprocess_dataframe


# Run the project's preprocessing step on the concatenated dataset.
processed_df = preprocess_dataframe(concat_dataset)

# Show the resulting shape, then a small random preview.
print("Dataset Processado:")
print("Formato: ", processed_df.shape)
# A fixed random_state makes the preview reproducible across Restart & Run All.
processed_df.sample(5, random_state=42)

Dataset Processado:
Formato:  (44898, 7)


Unnamed: 0,title,text,subject,date,label,full_text,full_text_clean
40112,Ukraine airport says tightened security after ...,KIEV (Reuters) - Ukraine s Odessa airport said...,worldnews,"October 24, 2017",1,Ukraine airport says tightened security after ...,ukraine airport say tightened security cyber a...
36028,Exclusive: Detained Saudi billionaire al-Sanea...,DUBAI (Reuters) - A detained Saudi billionaire...,worldnews,"December 11, 2017",1,Exclusive: Detained Saudi billionaire al-Sanea...,exclusive detained saudi billionaire alsanea s...
13303,LIBERTARIAN Gary Johnson ENDORSES Black Lives ...,Johnson calls Obama s manufactured race war/wa...,politics,"Aug 8, 2016",0,LIBERTARIAN Gary Johnson ENDORSES Black Lives ...,libertarian gary johnson endorses black life m...
32102,UAW chief says Clinton told him she would rene...,(Reuters) - United Auto Workers President Denn...,politicsNews,"July 26, 2016",1,UAW chief says Clinton told him she would rene...,uaw chief say clinton told would renegotiate n...
12870,FBI DATA Shows Black-On-Black Murders Off The ...,America has never seen two people living in th...,politics,"Oct 1, 2016",0,FBI DATA Shows Black-On-Black Murders Off The ...,fbi data show blackonblack murder chart obama ...


# **Armazenamento do dataset concatenado processado**

In [7]:
import pandas as pd

# Save the processed dataset as a CSV file under data/processed/.
# The folder is created on demand; exist_ok=True already tolerates an existing
# one, so no separate existence check is needed.
PROCESSED_DIR = os.path.join(BASE_DIR, "data", "processed")
os.makedirs(PROCESSED_DIR, exist_ok=True)

# index=False keeps the DataFrame index out of the written file.
processed_df.to_csv(os.path.join(PROCESSED_DIR, "processed_dataset.csv"), index=False)