# Generate and Extract Dataset

## Load Datasets

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Accumulator for the per-source DataFrames: every loader cell below appends
# its frame here, and the "Join all the data" cell concatenates the list.
DATASET = []

- **TRUE** : `0`
- **FAKE** : `1`

### [Fake News Dataset](https://data.mendeley.com/datasets/945z9xkc8d/1)

In [None]:
import os
# Download the Mendeley archive, unpack the outer zip and the nested
# "Fake News Dataset.zip", then remove the zip files from the working directory.
!curl -L "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/945z9xkc8d-1.zip" -o data.zip
!unzip "data.zip" && unzip "Fake News Dataset.zip"
!rm -r *.zip

main_directory = os.path.join(os.getcwd(),"Fake News Dataset")
# os.walk yields the root directory first; [1:] skips it so only the
# directories nested below the root are visited.
subdirectories = [x[0] for x in os.walk(main_directory)][1:]
dataframes = []
test = []
# Iterate over the subdirectories, collecting their train.csv and test.csv files
for subdirectory in subdirectories:
    file_path = os.path.join(subdirectory, "train.csv")
    test_path = os.path.join(subdirectory, "test.csv")

    # Load train.csv only if it exists in this subdirectory (files are ';'-separated)
    if os.path.exists(file_path):
        data = pd.read_csv(file_path, sep=";")
        dataframes.append(data)

    # Same check for test.csv
    if os.path.exists(test_path):
        data = pd.read_csv(test_path, sep=";")
        test.append(data)

# Combine the DataFrames into a single one per split and rename the columns
columns = ["features","label"]
train_data = pd.concat(dataframes, ignore_index=True)
train_data.columns = columns
# Keep only the text that precedes the first occurrence of "http"
train_data['features'] = train_data['features'].str.split('http').str[0]
# Drop rows whose text is empty after the truncation
# NOTE(review): rows whose text is NaN are not removed by this comparison — confirm that is intended
train_data = train_data[train_data['features'] != ""]
test_data = pd.concat(test,ignore_index=True)
test_data.columns = columns
test_data['features'] = test_data['features'].str.split('http').str[0]
test_data = test_data[test_data['features'] != ""]
# Both splits are added to the accumulator
DATASET.append(train_data)
DATASET.append(test_data)

# Release the intermediates that are no longer needed
del train_data, test_data, test, dataframes, main_directory, subdirectories

### [ErfanMoosaviMonazzah/fake-news-detection-dataset-English](https://huggingface.co/datasets/ErfanMoosaviMonazzah/fake-news-detection-dataset-English)

In [None]:
# Download the dataset
# Fetch the train split's 0.parquet file through the Hugging Face datasets API
# (-L follows redirects) and save it as 0.parquet in the working directory.
!curl -L "https://huggingface.co/api/datasets/ErfanMoosaviMonazzah/fake-news-detection-dataset-English/parquet/default/train/0.parquet" -o 0.parquet

In [None]:
# Read the Dataset
df = pd.read_parquet("0.parquet")
# Standardize the dataset
df["features"] = df["title"] + " " + df["text"]
df.drop(["title", "text", "Unnamed: 0", "date", "subject"], axis=1, inplace=True)
df.insert(0, "features", df.pop("features"))
# Add the dataset to the list
DATASET.append(df)
# Show
df.head(5)

In [None]:
# Delete the downloaded parquet file(s) from the working directory.
!rm *.parquet

### [mohammadjavadpirhadi/fake-news-detection-dataset-english](https://huggingface.co/datasets/mohammadjavadpirhadi/fake-news-detection-dataset-english)

In [None]:
# Fetch the train split's 0.parquet file through the Hugging Face datasets API
# (-L follows redirects) and save it as 0.parquet in the working directory.
!curl -L "https://huggingface.co/api/datasets/mohammadjavadpirhadi/fake-news-detection-dataset-english/parquet/default/train/0.parquet" -o 0.parquet

In [None]:
# Read the Dataset
df = pd.read_parquet("0.parquet")
# Standardize the dataset
df["features"] = df["title"] + " " + df["text"]
df.drop(["title", "text", "date", "subject"], axis=1, inplace=True)
df.insert(0, "features", df.pop("features"))
df["label"] = df["label"].apply(lambda x: 1 if x == 0 else 0)

# Add the dataset to the list
DATASET.append(df)
# Show
df.head(5)

In [None]:
# Display the full text of the row whose index label is 1.
df["features"][1]

In [None]:
# Show the rows whose text contains "http". na=False counts missing texts
# (NaN, e.g. when title or text was null) as non-matching, so the mask stays
# strictly boolean and can be used for indexing.
df[df["features"].str.contains("http", na=False)]

In [None]:
# Delete the downloaded parquet file(s) from the working directory.
!rm *.parquet

### [pushpdeep/fake_news_combined](https://huggingface.co/datasets/pushpdeep/fake_news_combined)

In [None]:
# Fetch the train split's 0.parquet file through the Hugging Face datasets API
# (-L follows redirects) and save it as 0.parquet in the working directory.
!curl -L "https://huggingface.co/api/datasets/pushpdeep/fake_news_combined/parquet/default/train/0.parquet" -o 0.parquet

In [None]:
# Read the Dataset
df = pd.read_parquet("0.parquet")
# Standardize the dataset
df["features"] = df["text"]
df.drop(["Unnamed: 0", "text"], axis=1, inplace=True)
df.insert(0, "features", df.pop("features"))

# # Add the dataset to the list
DATASET.append(df)
# Show
df.head(5)

In [None]:
# Delete the downloaded parquet file(s) from the working directory.
!rm *.parquet

### [argilla/news-fakenews](https://huggingface.co/datasets/argilla/news-fakenews)

In [None]:
# Fetch the train split's 0.parquet file through the Hugging Face datasets API
# (-L follows redirects) and save it as 0.parquet in the working directory.
!curl -L "https://huggingface.co/api/datasets/argilla/news-fakenews/parquet/default/train/0.parquet" -o 0.parquet

In [None]:
# Read the Dataset
df = pd.read_parquet("0.parquet")
# Standardize the dataset
df = df[["text","prediction"]]
df = df[df["text"] != ' ']
df["prediction"] = df["prediction"].apply(lambda x: 1 if x[0]["label"] == 'real' else 0)
df.columns = ["features", "label"]
# Add the dataset to the list
DATASET.append(df)
# Show
df.head(5)

In [None]:
# Delete the downloaded parquet file(s) from the working directory.
!rm *.parquet

### Join all the data

In [None]:
# Stack every collected frame, shuffle the rows and renumber the index.
df = (
    pd.concat(DATASET, ignore_index=True)
    .sample(frac=1)
    .reset_index(drop=True)
)
# Persist the combined dataset without the index column, then release the
# list of per-source frames.
df.to_csv("train.csv", index=False)
del DATASET

## Inspeccionar datos

In [None]:
# Preview the first five rows of the combined, shuffled dataset.
df.head(5)

In [None]:
# Print the column names, dtypes, non-null counts and memory usage.
df.info()

# Visualización del Dataset

## Dataset Balanceado

In [None]:
## Librerias para graficación
import matplotlib.pyplot as plt
import seaborn as sns

# Visualizamos si la data esta balanceada
sns.catplot(x="label", kind="count", color="r", data=df)
plt.title("Distribución de Clasificación")
plt.xlabel("is True")
plt.ylabel("Conteo")

# Pre-Procesado de los datos para NLP

In [None]:
# Reload the combined dataset from the train.csv file in the working directory
df = pd.read_csv("train.csv")

# shuffle data (no random_state is given, so the row order differs on every run)
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
# Spot-check the text stored under index label 57072. The label is hard-coded,
# and because the frame was just shuffled without a seed it points at a
# different document on each run.
df["features"][57072]

In [None]:
# Show the rows whose text contains "http". na=False counts missing texts as
# non-matching: empty strings written to train.csv are read back as NaN, and a
# mask containing NaN cannot be used for boolean indexing.
df[df["features"].str.contains('http', na=False)]

In [None]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

def decontract(text):
    """Expand common English contractions in ``text``.

    The rules are applied in order: the irregular forms ("won't", "can't")
    first, then the generic suffixes ("n't", "'re", "'s", "'d", "'ll", "'t",
    "'ve", "'m"). Matching is case-insensitive, so capitalised forms such as
    "Won't" are expanded as a whole instead of being cut into "Wo not", and
    both the ASCII apostrophe and the typographic one (U+2019) are recognised.

    Note that every "'s" is expanded to " is", including possessives.

    Args:
        text: The string to expand.

    Returns:
        The string with the contractions replaced by their expanded forms.
    """
    # Character class matching either apostrophe variant.
    apostrophe = "['\u2019]"
    rules = (
        ("won" + apostrophe + "t", "will not"),
        ("can" + apostrophe + "t", "can not"),
        ("n" + apostrophe + "t", " not"),
        (apostrophe + "re", " are"),
        (apostrophe + "s", " is"),
        (apostrophe + "d", " would"),
        (apostrophe + "ll", " will"),
        (apostrophe + "t", " not"),
        (apostrophe + "ve", " have"),
        (apostrophe + "m", " am"),
    )
    for pattern, expansion in rules:
        text = re.sub(pattern, expansion, text, flags=re.IGNORECASE)
    return text

# Lazily-filled cache so the lemmatizer and the stop-word list are built once
# instead of once per processed row.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the shared (lemmatizer, stop-word set) pair, creating it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS["lemmatizer"] = WordNetLemmatizer()
        _TEXT_TOOLS["stop_words"] = frozenset(stopwords.words('english'))
    return _TEXT_TOOLS["lemmatizer"], _TEXT_TOOLS["stop_words"]


def processing_text(texto):
    """Normalize one raw document into a cleaned, lemmatized string.

    Steps, in order: expand contractions, remove URLs and parenthesised
    Twitter handles, turn every whitespace character into a plain space,
    delete all remaining non-alphanumeric characters, replace digit runs with
    a space, lowercase, drop English stop words, lemmatize each word, remove
    single-letter tokens and collapse repeated spaces.

    Args:
        texto: The raw text. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as an empty document.

    Returns:
        The cleaned text with words separated by single spaces and no
        leading or trailing space; "" when the input is not a string.
    """
    # Missing cells arrive as float NaN; the regex calls below need a string.
    if not isinstance(texto, str):
        return ""

    processed_feature = decontract(texto)

    # Remove URLs (the dot after "www" is escaped so only a literal dot matches)
    processed_feature = re.sub(
        r'https?://\S+|www\.\S+', '', processed_feature, flags=re.IGNORECASE
    )

    # Remove Twitter handles written in parentheses, e.g. "(@user)"
    processed_feature = re.sub(r'\(@\w+\)', '', processed_feature)

    # Convert newlines, tabs and other whitespace to plain spaces first, so the
    # special-character removal below does not glue neighbouring words together.
    processed_feature = re.sub(r'\s+', ' ', processed_feature)

    # Remove special characters
    processed_feature = re.sub(r'[^a-zA-Z0-9 ]', '', processed_feature)

    # Remove numbers (very sporadic occurrences in our dataset)
    processed_feature = re.sub(r'[0-9]+', ' ', processed_feature)

    # Convert all text to lowercase
    processed_feature = processed_feature.lower()

    # Remove stop words and perform lemmatization
    lemmatizer, stop_words = _get_text_tools()
    processed_feature = ' '.join(
        lemmatizer.lemmatize(word)
        for word in processed_feature.split()
        if word not in stop_words
    )

    # Remove occurrences of individual characters
    processed_feature = re.sub(r'\b[a-zA-Z]\b', '', processed_feature)

    # Collapse consecutive spaces and trim the ends left behind by the removals
    processed_feature = re.sub(' +', ' ', processed_feature).strip()
    return processed_feature

In [None]:
# Clean every document: processing_text is called once per row and its result
# replaces the raw text in the "features" column.
df["features"] = df["features"].apply(processing_text)

# Save the data

In [None]:
# Write the pre-processed dataset to features.csv without the index column.
df.to_csv("features.csv", index=False)