In [34]:
# Import the standard-library and third-party packages used by the notebook
import os
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Silence every warning category for the rest of the session to keep cell output clean
warnings.filterwarnings(action="ignore")

In [35]:
# Load the fake and true news CSV files.
# The folder holding the data can be overridden with the FAKE_NEWS_DATA_DIR
# environment variable, so the notebook is not tied to one machine; when the
# variable is unset the original local folder is used.
DATA_DIR = os.environ.get(
    "FAKE_NEWS_DATA_DIR",
    "C:\\Users\\juane\\OneDrive\\Escritorio\\Datos",
)
df_fake = pd.read_csv(os.path.join(DATA_DIR, "Fake.csv"))
df_true = pd.read_csv(os.path.join(DATA_DIR, "True.csv"))

In [36]:
# Tag each source frame with its class id before merging: 0 = fake news, 1 = true news
for frame, class_id in ((df_fake, 0), (df_true, 1)):
    frame["label"] = class_id

In [37]:
# Stack the fake and true articles into one DataFrame; ignore_index discards
# the per-file row numbers and assigns a fresh 0..n-1 index
news_frames = [df_fake, df_true]
df = pd.concat(news_frames, ignore_index=True)

In [38]:
# Preview the first five rows of the combined DataFrame
df.head(n=5)

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [39]:
# Fetch the NLTK stopword corpus and keep the English entries in a set,
# so the membership tests done while cleaning text are constant-time
import nltk

nltk.download("stopwords")
stop_words = set(nltk.corpus.stopwords.words("english"))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\juane\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
import string

# Translation table that deletes every ASCII punctuation character; built once
# so clean_word does not rebuild it for each token.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_word(word: str) -> str:
    """Normalize a single token.

    The word is lowercased, stripped of leading/trailing whitespace, and then
    every character found in ``string.punctuation`` is removed. Non-ASCII
    punctuation (for example curly quotes) is left untouched.

    Args:
        word: The raw token.

    Returns:
        The normalized token; an empty string if nothing but punctuation
        and whitespace was present.
    """
    normalized = word.lower().strip()
    # A single translate pass replaces the previous per-character
    # str.replace loop, which rescanned the word once per punctuation hit.
    return normalized.translate(_PUNCTUATION_TABLE)

def clean_text(text: str) -> list[str]:
    """Split ``text`` on whitespace and return its cleaned, non-stopword tokens.

    Each token is normalized with ``clean_word``. A token is dropped when:

    * it is a stopword, checked both with only its surrounding punctuation
      trimmed (so contractions such as "don't" match the stopword list) and
      after full cleaning; or
    * nothing is left after cleaning (pure-punctuation tokens such as "-"
      or "..."), which would otherwise produce empty-string tokens.

    Args:
        text: The raw document.

    Returns:
        The cleaned tokens in their original order.
    """
    clean_text_list = []
    for word in text.split():
        # Stopwords containing an apostrophe can never match after clean_word
        # removes the apostrophe, so test the lightly-trimmed form first.
        if word.lower().strip(string.punctuation) in stop_words:
            continue
        cleaned_word = clean_word(word)  # Lowercase and strip punctuation
        # Skip empty results as well as stopwords.
        if cleaned_word and cleaned_word not in stop_words:
            clean_text_list.append(cleaned_word)
    return clean_text_list


In [41]:
# Tokenize and clean every article body, storing the token lists in a new 'clean_text' column
raw_articles = df["text"]
df["clean_text"] = raw_articles.apply(clean_text)

In [42]:
import gensim

# Size of each learned word vector
EMBEDDING_DIM = 100
# The cleaned token lists act as the training sentences
sentences = df["clean_text"]

# Word2Vec hyperparameters:
#   vector_size - dimensionality of the word vectors
#   window      - maximum distance between the current and predicted word within a sentence
#   min_count   - words with a total frequency below this are ignored
word2vec_params = dict(vector_size=EMBEDDING_DIM, window=5, min_count=1)
model = gensim.models.Word2Vec(sentences=sentences, **word2vec_params)

In [43]:
# Vectorize text data using the Word2Vec model and convert DataFrame columns to numpy arrays for model training
import numpy as np

def vectorize_text(text: list[str]) -> np.ndarray:
    """Return the sum of the Word2Vec vectors of the tokens in ``text``.

    Tokens missing from the trained model's vocabulary are skipped. The
    result is a float32 vector of length ``EMBEDDING_DIM``; it is all zeros
    when no token is in the vocabulary.
    """
    embeddings = model.wv
    doc_vector = np.zeros(EMBEDDING_DIM, dtype=np.float32)
    for token in text:
        if token not in embeddings:
            continue
        # Accumulate in place, one token at a time
        doc_vector += embeddings[token]
    return doc_vector

# Feature matrix: one summed-embedding row per article
X = np.array(
    [vectorize_text(tokens) for tokens in df["clean_text"]],
    dtype=np.float32,
)
# Target vector: the 0/1 labels as float32
y = np.asarray(df["label"].tolist(), dtype=np.float32)

In [45]:
# Build the output path for the features/labels CSV.
# The target folder can be overridden with the FAKE_NEWS_DATA_DIR environment
# variable; when it is unset the original local folder is used.
file_path = os.path.join(
    os.environ.get(
        "FAKE_NEWS_DATA_DIR",
        "C:\\Users\\juane\\OneDrive\\Escritorio\\Datos",
    ),
    "features_labels.csv",
)

In [46]:
# Wrap the feature matrix in a DataFrame (one column per embedding dimension)
features_df = pd.DataFrame(data=X)

In [47]:
# Append the target values as a trailing 'label' column next to the features
features_df["label"] = y

In [48]:
# Write the features and labels to disk as CSV, leaving out the row index
features_df.to_csv(path_or_buf=file_path, index=False)