# Generate and Extract Dataset

- Este notebook crea los archivos necesarios para `model.ipynb`.
- Este notebook debe ser ejecutado primero.
- Los archivos generados son:
  - `vocab.json`: Diccionario de palabras. (Este archivo es requerido en el backend de la aplicación `./server`)
  - `features.csv`: Dataset limpio y procesado. (Este archivo es requerido en el notebook `model.ipynb`)

## Load Datasets

In [None]:
import pandas as pd
import numpy as np

- **TRUE** : `1`
- **FAKE** : `0`

### [Fake News Dataset](https://data.mendeley.com/datasets/945z9xkc8d/1)

In [None]:
import os
# Download the dataset zip (following redirects with -L) and save it as data.zip.
!curl -L "https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/945z9xkc8d-1.zip" -o data.zip
# Extract data.zip, then extract "Fake News Dataset.zip" (presumably unpacked by the first step).
!unzip "data.zip" && unzip "Fake News Dataset.zip"
# Remove the zip archives once extracted.
# NOTE(review): `del` is a Windows shell command while `unzip` is usually a Unix tool -- confirm this cell runs on the target platform.
!del -r *.zip

# Gather every train.csv / test.csv found under "Fake News Dataset" and merge
# them into a single train.csv and a single test.csv in the working directory.
dataset_root = os.path.join(os.getcwd(), "Fake News Dataset")
# os.walk yields the root folder first; drop it and keep only the nested folders.
nested_dirs = [walked[0] for walked in os.walk(dataset_root)][1:]

train_parts = []
test_parts = []
for folder in nested_dirs:
    # Each folder may contribute a train split, a test split, both or neither.
    for csv_name, bucket in (("train.csv", train_parts), ("test.csv", test_parts)):
        csv_path = os.path.join(folder, csv_name)
        if os.path.exists(csv_path):
            bucket.append(pd.read_csv(csv_path, sep=";"))

# Merge the per-folder frames, renumbering the rows from zero.
combined_train = pd.concat(train_parts, ignore_index=True)
combined_test = pd.concat(test_parts, ignore_index=True)

# Persist the merged splits without the index column.
combined_train.to_csv("train.csv", index=False)
combined_test.to_csv("test.csv", index=False)

# Free the intermediate objects; later cells reload the data from disk.
del combined_train, combined_test, test_parts, train_parts, dataset_root, nested_dirs

## Inspeccionar datos

In [None]:
# Load the combined train and test splits from the working directory.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Stack both splits, shuffle every row (frac=1) and renumber the index.
df = (
    pd.concat([df_train, df_test])
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
# Display the training split as the cell output.
df_train

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the combined, shuffled frame.
df.info()

# Visualización del Dataset

## Dataset Balanceado

In [None]:
## Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Count the rows per label to check whether the dataset is balanced.
sns.catplot(x="label", kind="count", color="r", data=df)
plt.title("Distribución de Clasificación")
# The dataset encodes FAKE as 0 and TRUE as 1; the axis caption must say the same.
plt.xlabel("(0) is fake, (1) is true")
plt.ylabel("Conteo")

## Word Cloud de los titulares de las *Fake News*

In [None]:
# Word-cloud utilities
from wordcloud import WordCloud, STOPWORDS

# Stored under its own name so that re-running this cell does not overwrite
# the `stopwords` module imported from nltk.corpus by the preprocessing cell.
wordcloud_stopwords = set(STOPWORDS)

# Sample: the first 100 texts whose label is 0 (FAKE).
sample_texts = df[df["label"] == 0]["text"][:100]

# Lower-case every whitespace-separated token and build one space-separated
# string; str() guards against non-string cells.
comment_words = "".join(
    " ".join(token.lower() for token in str(text).split()) + " "
    for text in sample_texts
)

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color="white",
    stopwords=wordcloud_stopwords,
    min_font_size=10,
).generate(comment_words)

# Render the cloud on a square figure without axes or padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Word-cloud utilities
from wordcloud import WordCloud, STOPWORDS

# Stored under its own name so that re-running this cell does not overwrite
# the `stopwords` module imported from nltk.corpus by the preprocessing cell.
wordcloud_stopwords = set(STOPWORDS)

# Sample: the first 100 texts whose label is 1 (TRUE).
sample_texts = df[df["label"] == 1]["text"][:100]

# Lower-case every whitespace-separated token and build one space-separated
# string; str() guards against non-string cells.
comment_words = "".join(
    " ".join(token.lower() for token in str(text).split()) + " "
    for text in sample_texts
)

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color="white",
    stopwords=wordcloud_stopwords,
    min_font_size=10,
).generate(comment_words)

# Render the cloud on a square figure without axes or padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

# Pre-Procesado de los datos para NLP

In [None]:
# Rename the columns positionally to the names used from here on.
# NOTE(review): assumes df has exactly two columns ordered (text, label) -- confirm against the CSVs.
df.columns = ["features", "label"]

In [None]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Ordered (pattern, replacement) rules for expanding English contractions.
#
# * The irregular forms come first and are matched case-insensitively, so a
#   sentence-initial "Won't"/"Can't" is expanded instead of being split into
#   the junk stems "Wo"/"Ca" by the generic n't rule.
# * The suffix rules are wrapped in word boundaries: the apostrophe must
#   follow a word character and the suffix must end the word. Without this,
#   an opening quote such as "'so" or "'the" was rewritten to " iso" /
#   " nothe".
_CONTRACTION_RULES = (
    (re.compile(r"won't", re.IGNORECASE), "will not"),
    (re.compile(r"can't", re.IGNORECASE), "can not"),
    (re.compile(r"n't"), " not"),
    (re.compile(r"\b're\b"), " are"),
    (re.compile(r"\b's\b"), " is"),
    (re.compile(r"\b'd\b"), " would"),
    (re.compile(r"\b'll\b"), " will"),
    (re.compile(r"\b't\b"), " not"),
    (re.compile(r"\b've\b"), " have"),
    (re.compile(r"\b'm\b"), " am"),
)


def decontract(text):
    """Expand common English contractions in *text*.

    The rules are applied in a fixed order: the irregular negations
    ("won't", "can't") first, then the generic suffixes (n't, 're, 's, 'd,
    'll, 't, 've, 'm). Every "'s" attached to a word is expanded to " is",
    possessives included.

    Args:
        text: The string to process.

    Returns:
        The string with contractions replaced by their expanded forms. The
        replacement text is always lower-case.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

# Matches http(s) URLs and bare "www." hosts. The dot after "www" is escaped:
# unescaped it matches any character, so text such as "awww so" was deleted
# as if it were a link.
_LINK_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)

# Filled on first use so the stop-word list and the lemmatizer are created
# once instead of once per processed row.
_NLP_CACHE = {}


def _nlp_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, creating it on first use."""
    if not _NLP_CACHE:
        stop_words = frozenset(stopwords.words('english'))
        _NLP_CACHE.update(lemmatizer=WordNetLemmatizer(), stop_words=stop_words)
    return _NLP_CACHE["lemmatizer"], _NLP_CACHE["stop_words"]


def processing_text(texto):
    """Clean one raw text and return its normalised, lemmatized form.

    Steps, in order: expand contractions, remove links, normalise
    whitespace, delete non-alphanumeric symbols, replace digit runs with a
    space, lower-case, drop English stop words, lemmatize each remaining
    word, remove single-letter words and collapse repeated spaces.

    Args:
        texto: The raw text to clean.

    Returns:
        The cleaned text as a space-separated string. It can be empty and
        can keep a leading or trailing space where a single-letter word was
        removed.
    """
    text = decontract(texto)

    # Remove links.
    text = _LINK_RE.sub('', text)

    # Turn newlines, tabs and any other whitespace into plain spaces first:
    # the next step deletes every character that is not a letter, digit or
    # space, which would otherwise glue together the words on either side
    # of a line break.
    text = re.sub(r'\s+', ' ', text)

    # Delete special (non-alphanumeric) characters.
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)

    # Replace numbers with a space (they are rare in this dataset).
    text = re.sub(r'[0-9]+', ' ', text)

    text = text.lower()

    # Drop stop words and lemmatize what is left.
    lemmatizer, stop_words = _nlp_resources()
    text = ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

    # Remove words made of a single letter.
    text = re.sub(r'\b[a-zA-Z]\b', '', text)

    # Collapse consecutive spaces into one.
    return re.sub(' +', ' ', text)

In [None]:
# Apply the preprocessing function to every row of the "features" column.
df["features"] = df["features"].apply(processing_text)

# Save the data

In [None]:
# Delete every CSV in the working directory (this includes the intermediate
# train.csv / test.csv) before writing the final dataset.
# NOTE(review): `del` is a Windows shell command -- confirm this works on the target platform.
!del *.csv
df.to_csv("features.csv", index=False) # index false means we don't want to save the index column