In [4]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Fetch the NLTK resources this notebook relies on.
import nltk
# English stop-word lists, read later through `stopwords.words('english')`.
nltk.download('stopwords')
# Punkt tokenizer data. `word_tokenize` is imported above but is not called in
# the visible cells -- presumably these two downloads are for it (TODO confirm).
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
# Read the two source CSV files (paths are relative to the working directory)
# into one DataFrame each.
true_news_path, fake_news_path = './True.csv', './Fake.csv'
true_news_df, fake_news_df = map(pd.read_csv, (true_news_path, fake_news_path))

In [9]:
# Tag every row with its class (1 = true news, 0 = fake news) on the source
# frames themselves, then stack both frames under a fresh 0..n-1 index.
for news_frame, class_label in ((true_news_df, 1), (fake_news_df, 0)):
    news_frame['label'] = class_label
combined_df = pd.concat((true_news_df, fake_news_df), ignore_index=True)

In [10]:
# Display five randomly chosen rows; no random_state is passed, so the rows
# shown differ from run to run.
combined_df.sample(5)

Unnamed: 0,title,text,subject,date,label
13534,"France's Macron urges China, Russia to support...",PARIS (Reuters) - French President Emmanuel Ma...,worldnews,"November 29, 2017",1
40567,TREASON! HOW OBAMA’S SHADOW GOVERNMENT Is Comm...,PAUL SPERRY SPOKE WITH LOU DOBBS ON HIS REPORT...,left-news,"Feb 12, 2017",0
31306,WOW! HUNGARY’S PRIME MINISTER Follows Through ...,Hungary has slashed illegal immigration by ove...,politics,"Sep 16, 2017",0
44174,Black Politician Explains Why Left’s ‘Racist’ ...,Is the Democratic Party really the party of t...,Middle-east,"October 14, 2017",0
43105,WHY TAXPAYER FUNDED FOOD TRUCKS PLAN TO STALK ...,I wonder how many government funded trucks wil...,left-news,"May 29, 2015",0


In [11]:
# Define a function to clean the text
_PUNCTUATION_RE = re.compile(r'[^\w\s]')  # anything that is neither a word character nor whitespace
_DEFAULT_STOP_WORDS = None  # lazily built cache of the normalised NLTK English stop words


def _normalise_stop_words(words):
    """Lowercase the stop words and strip punctuation exactly as clean_text does to tokens."""
    normalised = {_PUNCTUATION_RE.sub('', word.lower()) for word in words}
    normalised.discard('')
    return frozenset(normalised)


def clean_text(text, stop_words=None):
    """
    Strip punctuation, lowercase, and drop stop words from a piece of text.

    Args:
        text: The raw text. ``None`` and NaN are treated as empty text; any
              other non-string value is converted with ``str()``.
        stop_words: Optional iterable of words to remove. When omitted, the
                    NLTK English stop-word list is used; it is loaded once and
                    cached instead of being re-read on every call.

    Returns:
        The cleaned text: remaining tokens joined by single spaces.
    """
    global _DEFAULT_STOP_WORDS

    # Missing values (None, or float NaN, which is the only value unequal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    if stop_words is None:
        if _DEFAULT_STOP_WORDS is None:
            _DEFAULT_STOP_WORDS = _normalise_stop_words(stopwords.words('english'))
        active_stop_words = _DEFAULT_STOP_WORDS
    else:
        active_stop_words = _normalise_stop_words(stop_words)

    # Remove punctuation and special characters, then convert to lowercase
    text = _PUNCTUATION_RE.sub('', text).lower()

    # Remove stopwords. The stop words went through the same punctuation
    # stripping, so contractions such as "don't" (now "dont") are removed too.
    return ' '.join(word for word in text.split() if word not in active_stop_words)

# Run clean_text over every article body and keep the result in a new column
raw_article_texts = combined_df['text']
combined_df['cleaned_text'] = raw_article_texts.map(clean_text)

In [12]:
# Spot-check the cleaning step on the row labelled 0.
combined_df['cleaned_text'][0]

'washington reuters head conservative republican faction us congress voted month huge expansion national debt pay tax cuts called fiscal conservative sunday urged budget restraint 2018 keeping sharp pivot way among republicans us representative mark meadows speaking cbs face nation drew hard line federal spending lawmakers bracing battle january return holidays wednesday lawmakers begin trying pass federal budget fight likely linked issues immigration policy even november congressional election campaigns approach republicans seek keep control congress president donald trump republicans want big budget increase military spending democrats also want proportional increases nondefense discretionary spending programs support education scientific research infrastructure public health environmental protection trump administration already willing say going increase nondefense discretionary spending 7 percent meadows chairman small influential house freedom caucus said program democrats saying 

In [13]:
# Preprocessing limits
max_vocab_size = 10000       # handed to the tokenizer as num_words
max_sequence_length = 100    # every sequence is padded / truncated to this length

# Fit the tokenizer on the cleaned articles; "<OOV>" is its out-of-vocabulary token
cleaned_texts = combined_df['cleaned_text']
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_vocab_size)
tokenizer.fit_on_texts(cleaned_texts)

# Encode each article as a sequence of integer ids
sequences = tokenizer.texts_to_sequences(cleaned_texts)

# Bring all sequences to the same length, padding and truncating at the end
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='post',
    truncating='post',
)

# Target vector as a NumPy array
labels = combined_df['label'].values

In [16]:
# Hold out 30 % of the samples, then halve the hold-out into validation and
# test sets; both splits use the same seed.
split_seed = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    padded_sequences, labels, test_size=0.3, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=split_seed
)

# Report how many samples landed in each split
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

Training set size: 31428
Validation set size: 6735
Test set size: 6735


In [18]:
import numpy as np
import pandas as pd
import requests
import zipfile
import os
import gensim.downloader as api
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import nltk

In [19]:
# Step 1.2: Download Pretrained Embeddings
# 1.2.1 Word2Vec
# Load the vectors by name through gensim's downloader module (imported as `api`).
word2vec_model = api.load("word2vec-google-news-300")
print("Word2Vec model downloaded.")

Word2Vec model downloaded.


In [20]:
# 1.2.2 FastText
# Load the vectors by name through gensim's downloader module (imported as `api`).
fasttext_model = api.load("fasttext-wiki-news-subwords-300")
print("FastText model downloaded.")

FastText model downloaded.


In [21]:
# 1.2.3 GloVe
# Source archive and the local file name it is saved under. HTTPS is used
# because the archive is extracted and parsed after download, so it should not
# be fetched over an unauthenticated plain-HTTP connection.
glove_url = "https://nlp.stanford.edu/data/glove.6B.zip"
glove_zip_path = "glove.6B.zip"

In [22]:
# Download GloVe embeddings
# Stream the archive to disk in 1 MiB chunks so the whole file is never held in
# memory, and bound the wait on an unresponsive server with a timeout.
with requests.get(glove_url, stream=True, timeout=60) as response:
    # Fail loudly on an HTTP error instead of saving an error page as the zip.
    response.raise_for_status()
    with open(glove_zip_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
print("GloVe embeddings downloaded.")

GloVe embeddings downloaded.


In [23]:
# Unzip the downloaded file
# extractall() is called without a target path, so every member of the archive
# is written into the current working directory.
with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
    zip_ref.extractall()
print("GloVe embeddings extracted.")

GloVe embeddings extracted.


In [24]:
# Clean up the zip file
# The archive is deleted after extraction; running this cell a second time
# without re-downloading raises FileNotFoundError.
os.remove(glove_zip_path)

In [26]:
# Load GloVe embeddings
def load_glove_embeddings(file_path, embedding_dim=None):
    """
    Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-blank line holds a word followed by its vector components,
    separated by spaces. Blank lines are skipped.

    Args:
        file_path: Path to the UTF-8 encoded embeddings text file.
        embedding_dim: Optional number of vector components per line. When
                       given, the last ``embedding_dim`` fields are the vector
                       and everything before them is the word, which allows
                       words that themselves contain spaces. When omitted, the
                       first whitespace-separated field is the word.

    Returns:
        A dict mapping each word to a float32 NumPy array.

    Raises:
        ValueError: If ``embedding_dim`` is given and a non-blank line has too
                    few fields to hold a word plus that many components.
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line would otherwise fail on values[0].
            if not line.strip():
                continue
            if embedding_dim is None:
                values = line.split()
                word, vector = values[0], values[1:]
            else:
                # Split on single spaces only and peel the vector off the end,
                # so spaces inside the word are preserved.
                values = line.rstrip().split(' ')
                if len(values) <= embedding_dim:
                    raise ValueError(
                        f"Expected a word followed by {embedding_dim} values, got: {line!r}"
                    )
                word = ' '.join(values[:-embedding_dim])
                vector = values[-embedding_dim:]
            embeddings_index[word] = np.asarray(vector, dtype='float32')
    return embeddings_index

# Load GloVe embeddings (100d)
# The archive is extracted into the current working directory, so the file is
# resolved relative to it rather than through a hard-coded absolute path that
# only exists on one hosting environment.
glove_embeddings = load_glove_embeddings('glove.6B.100d.txt')

In [82]:
# Import the KeyedVectors class directly
from gensim.models import KeyedVectors

def create_embedding_matrix(embeddings, tokenizer):
    """
    Build the embedding matrix for a tokenizer's vocabulary.

    Args:
        embeddings: Either a KeyedVectors object or a dictionary-like object
                    mapping words to their vectors.
        tokenizer: A Keras Tokenizer object; its ``word_index`` supplies the
                   word -> row mapping.

    Returns:
        A NumPy array with ``len(tokenizer.word_index) + 1`` rows. Row ``i``
        holds the vector of the word whose index is ``i``; rows of words
        missing from ``embeddings`` stay all-zero.
    """
    # KeyedVectors report their dimensionality directly; for a plain mapping it
    # is measured on the first stored vector.
    if isinstance(embeddings, KeyedVectors):
        embedding_dim = embeddings.vector_size
    else:
        embedding_dim = len(next(iter(embeddings.values())))

    vocabulary = tokenizer.word_index
    embedding_matrix = np.zeros((len(vocabulary) + 1, embedding_dim))

    # Both kinds of source support `in` and item lookup by word, so one loop
    # serves them both.
    for token, row in vocabulary.items():
        if token in embeddings:
            embedding_matrix[row] = embeddings[token]

    return embedding_matrix

In [None]:
def create_embedding_matrix(embeddings, tokenizer):
    """
    Creates an embedding matrix for the given embeddings and tokenizer.

    Args:
        embeddings: A KeyedVectors object, a dictionary-like object mapping
                    words to vectors, or a 2-D numpy array. Prefer the first
                    two: they are looked up by word. A numpy array carries no
                    vocabulary, so its rows are copied purely by position and
                    a UserWarning is emitted.
        tokenizer: A Keras Tokenizer object.

    Returns:
        A NumPy array with ``len(tokenizer.word_index) + 1`` rows. Row ``i``
        holds the vector for the word whose tokenizer index is ``i``; rows
        without a matching vector stay all-zero.
    """
    import warnings  # local import so this cell does not depend on another import cell

    word_index = tokenizer.word_index

    if isinstance(embeddings, KeyedVectors):
        embedding_dim = embeddings.vector_size
    elif isinstance(embeddings, np.ndarray):
        # Positional copy: correct ONLY if row i of the array really belongs to
        # the word with tokenizer index i. Nothing here can verify that, so
        # make the assumption visible instead of silently mixing up vectors.
        warnings.warn(
            "create_embedding_matrix received a raw numpy array; rows are copied by "
            "position and are only meaningful if the array is already ordered by "
            "tokenizer.word_index. Pass a KeyedVectors object or a {word: vector} "
            "mapping to look vectors up by word.",
            UserWarning,
            stacklevel=2,
        )
        embedding_dim = embeddings.shape[1]
        embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
        for i in word_index.values():
            if i < embeddings.shape[0]:  # Make sure we don't go out of bounds
                embedding_matrix[i] = embeddings[i]
        return embedding_matrix
    else:  # Assume it's a dictionary-like object
        embedding_dim = len(next(iter(embeddings.values())))

    # KeyedVectors and mappings share the same by-word lookup.
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        if word in embeddings:
            embedding_matrix[i] = embeddings[word]

    return embedding_matrix

In [35]:
# Create embedding matrices
# One matrix per pretrained source, built in the order Word2Vec, FastText, GloVe.
word2vec_matrix, fasttext_matrix, glove_matrix = (
    create_embedding_matrix(embedding_source, tokenizer)
    for embedding_source in (word2vec_model, fasttext_model, glove_embeddings)
)

In [41]:
from gensim.models import Word2Vec

# Step 2: Custom-Trained Embeddings

# 2.1 Train Word2Vec
# Every cleaned article is split on whitespace and used as one training sentence.
word2vec_sentences = combined_df['cleaned_text'].apply(lambda article: article.split())
custom_word2vec_model = Word2Vec(
    sentences=word2vec_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [43]:
# 2.2 Train FastText
from gensim.models import FastText  # FastText model class

# Every cleaned article is split on whitespace and used as one training sentence.
fasttext_sentences = combined_df['cleaned_text'].apply(lambda article: article.split())
custom_fasttext_model = FastText(
    sentences=fasttext_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [74]:
!pip install glove-python3

Collecting glove-python3
  Downloading glove_python3-0.1.0.tar.gz (326 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/327.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m235.5/327.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m327.0/327.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: glove-python3
  Building wheel for glove-python3 (setup.py) ... [?25l[?25hdone
  Created wheel for glove-python3: filename=glove_python3-0.1.0-cp310-cp310-linux_x86_64.whl size=1064167 sha256=cfcf56b2c7218a26c14629fcfa068f031ebb7c4f4b6a9b80e2f5ced799d2cf58
  Stored in directory: /root/.cache/pip/wheels/fe/2f/79/34314d44a0907e90e323c8c182ec23f126eb460829e02d98cf
Successfully built glove-python3
Installing collected packages: glove-python3
Succe

In [76]:
# 2.3 Train GloVe
from glove import Corpus, Glove # Import Corpus and Glove from glove
# Fit the corpus on the whitespace-split cleaned articles with a window of 5.
corpus = Corpus()
corpus.fit(combined_df['cleaned_text'].apply(lambda x: x.split()), window=5)
# Train a 100-component model on the fitted corpus matrix for 30 epochs using 4 threads.
custom_glove_model = Glove(no_components=100, learning_rate=0.05)
custom_glove_model.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
# Attach the corpus dictionary to the trained model -- presumably the
# word -> row-id mapping for the learned vectors (TODO confirm in glove docs).
custom_glove_model.add_dictionary(corpus.dictionary)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [78]:
# 2.4 Save the Custom-Trained Embeddings
# Each model is written under a relative file name, i.e. into the current
# working directory.
custom_word2vec_model.save("custom_word2vec.model")
custom_fasttext_model.save("custom_fasttext.model")
custom_glove_model.save("custom_glove.model")

In [79]:
# Step 3: Comparison
# 3.1 Define a simple model to evaluate embeddings
def create_model(embedding_matrix):
    """
    Build and compile a two-layer LSTM binary classifier on frozen embeddings.

    Args:
        embedding_matrix: 2-D array of embedding vectors; its first dimension
                          sets the Embedding layer's input_dim and its second
                          the output_dim.

    Returns:
        A compiled Sequential model (binary cross-entropy loss, Adam
        optimizer, accuracy metric) ending in a single sigmoid unit.
    """
    vocab_rows = embedding_matrix.shape[0]
    embedding_dim = embedding_matrix.shape[1]

    # The embedding layer is seeded with the supplied matrix and kept frozen.
    # NOTE(review): `weights=` is the older way of passing initial values to a
    # layer -- confirm it is still accepted by the Keras version in use.
    frozen_embedding = Embedding(
        input_dim=vocab_rows,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    )

    model = Sequential([
        frozen_embedding,
        LSTM(100, return_sequences=True),  # emits the full sequence for the next LSTM
        Dropout(0.5),
        LSTM(100),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [80]:
# 3.2 Evaluate Pretrained Embeddings
# One classifier per pretrained embedding matrix, all built before any training starts.
pretrained_matrices = (
    ("Word2Vec", word2vec_matrix),
    ("FastText", fasttext_matrix),
    ("GloVe", glove_matrix),
)
pretrained_models = {
    label: create_model(matrix) for label, matrix in pretrained_matrices
}

# Train each model for 5 epochs, then score it on the held-out test set.
for name, model in pretrained_models.items():
    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=5,
        batch_size=32,
    )
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"{name} Pretrained Model - Loss: {loss}, Accuracy: {accuracy}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Word2Vec Pretrained Model - Loss: 0.0203133262693882, Accuracy: 0.9964365363121033
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
FastText Pretrained Model - Loss: 0.0762600302696228, Accuracy: 0.9839643836021423
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
GloVe Pretrained Model - Loss: 0.0265803299844265, Accuracy: 0.9956941604614258


In [None]:
# 3.3 Evaluate Custom-Trained Embeddings
# Vectors must be looked up BY WORD. The raw arrays (`.wv.vectors`,
# `.word_vectors`) are ordered by each library's own vocabulary, not by the
# Keras tokenizer's word_index, so passing them directly would pair tokens
# with the wrong vectors. The gensim models expose a KeyedVectors object as
# `.wv`; for GloVe a {word: vector} mapping is built from the dictionary that
# was attached to the model after training.
custom_glove_embeddings = {
    word: custom_glove_model.word_vectors[row]
    for word, row in custom_glove_model.dictionary.items()
}

custom_models = {
    "Custom Word2Vec": create_model(create_embedding_matrix(custom_word2vec_model.wv, tokenizer)),
    "Custom FastText": create_model(create_embedding_matrix(custom_fasttext_model.wv, tokenizer)),
    "Custom GloVe": create_model(create_embedding_matrix(custom_glove_embeddings, tokenizer))
}

# Train each model for 5 epochs, then score it on the held-out test set.
for name, model in custom_models.items():
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=32)
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"{name} Custom-Trained Model - Loss: {loss}, Accuracy: {accuracy}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
  9/983 [..............................] - ETA: 1:53 - loss: 0.0256 - accuracy: 0.9965