In [None]:
import numpy as np
import pandas as pd
import pickle
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import random
# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# data (used by word_tokenize) and the 'stopwords' corpus (used by
# stopwords.words). The return value of the last call is the cell's output.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the two source tables: one CSV of genuine articles, one of fabricated ones.
true, fake = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/drive/MyDrive/True.csv', '/content/drive/MyDrive/Fake.csv')
)

In [None]:
# Tag every row with its class id: 1 for the genuine table, 0 for the fake one.
for frame, class_id in ((true, 1), (fake, 0)):
    frame['label'] = class_id

In [None]:
# Keep only the first 5000 rows of each class and stack them into one frame.
# The original row labels are kept (no index reset), so index values repeat
# across the two halves.
ROWS_PER_CLASS = 5000
frames = [source.iloc[:ROWS_PER_CLASS] for source in (true, fake)]
df = pd.concat(frames)

In [None]:
# Feature column (article body) and target column (class id).
X, y = df['text'], df['label']
# Raw article texts as a plain NumPy array.
corpus = df['text'].values
X

Unnamed: 0,text
0,WASHINGTON (Reuters) - The head of a conservat...
1,WASHINGTON (Reuters) - Transgender people will...
2,WASHINGTON (Reuters) - The special counsel inv...
3,WASHINGTON (Reuters) - Trump campaign adviser ...
4,SEATTLE/WASHINGTON (Reuters) - President Donal...
...,...
4995,It s no secret Republicans are salivating to f...
4996,Republicans are about to lose a huge source of...
4997,A pawn working for Donald Trump claimed that w...
4998,Fox News is desperate to sabotage Hillary Clin...


In [None]:
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense

# Hold out a third of the articles for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# English stop words, stored as a set for fast membership checks.
stop_words = set(stopwords.words('english'))


def preprocess(text):
    """Normalise one article for tokenisation.

    The text is lower-cased, every character listed in ``string.punctuation``
    is deleted, the remainder is split with ``word_tokenize`` and tokens found
    in ``stop_words`` are discarded.

    Args:
        text: the raw article as a ``str``.

    Returns:
        The surviving tokens joined with single spaces.
    """
    without_punct = text.lower().translate(str.maketrans('', '', string.punctuation))
    kept = [token for token in word_tokenize(without_punct) if token not in stop_words]
    return ' '.join(kept)

# Clean both splits with the same normalisation routine.
train_texts = X_train.apply(preprocess)
test_texts = X_test.apply(preprocess)

# The vocabulary is learned from the training split only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

# One more than the number of known words, so the largest word id is a valid index.
vocab_size = len(tokenizer.word_index) + 1

# Pad the sequences to a fixed length
max_length = 100
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_length, padding='post')
    for texts in (train_texts, test_texts)
)

# Train the Word2Vec model.
# NOTE: the "sentences" are the padded rows of integer token ids, so the keys
# of the resulting model are ints (token ids), not word strings. The padding
# value is part of those rows and therefore gets a vector as well.
sentences = [sentence.tolist() for sentence in X_train]
w2v_model = Word2Vec(sentences, vector_size=4, window=4, min_count=4, workers=4)

# Build the (vocab_size x embedding_dim) matrix whose row i holds the vector
# of token id i. Rows for ids Word2Vec dropped (min_count) stay all-zero.
embedding_matrix = np.zeros((vocab_size, w2v_model.vector_size))
# Look ids up through key_to_index rather than `in w2v_model.wv`: the model is
# keyed by token id, so testing the word string never finds the right entry,
# and KeyedVectors may additionally treat a bare int as a positional index.
id_to_row = w2v_model.wv.key_to_index
for word, i in tokenizer.word_index.items():
    row = id_to_row.get(i)
    if row is not None:
        embedding_matrix[i] = w2v_model.wv.vectors[row]

In [None]:
# Keys known to the Word2Vec model (token ids, since it was trained on id sequences).
words = set(w2v_model.wv.index_to_key)


def _embed_rows(token_rows):
    """Replace every known token id in each row with its Word2Vec vector.

    Ids that are not in ``words`` are skipped, so the inner lists can be
    shorter than the input rows.
    """
    embedded = []
    for row in token_rows:
        embedded.append([w2v_model.wv[token] for token in row if token in words])
    return embedded


X_train_vect = _embed_rows(X_train)
X_test_vect = _embed_rows(X_test)

In [None]:
def _mean_vector(vectors, dim):
    """Average a document's token vectors into one ``dim``-sized vector.

    Args:
        vectors: list of equally sized 1-D vectors (may be empty).
        dim: length of the zero vector returned for an empty document.

    Returns:
        The element-wise mean, or ``np.zeros(dim)`` when ``vectors`` is empty.
    """
    stacked = np.array(vectors)
    if stacked.size:
        return stacked.mean(axis=0)
    return np.zeros(dim, dtype=float)


# The fallback must have the same width as the Word2Vec vectors; a different
# length would make the feature rows ragged and break np.array(...) later on.
_vector_dim = w2v_model.vector_size

X_train_vect_avg = [_mean_vector(v, _vector_dim) for v in X_train_vect]
X_test_vect_avg = [_mean_vector(v, _vector_dim) for v in X_test_vect]

In [None]:
import tensorflow.keras.backend as K

def recall_m(y_true, y_pred):
    """Batch recall: rounded true positives over rounded actual positives.

    Both inputs are clipped to [0, 1] and rounded before counting;
    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Batch precision: rounded true positives over rounded predicted positives.

    Both inputs are clipped to [0, 1] and rounded before counting;
    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """Batch F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` in the denominator keeps the result finite when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

In [None]:
from keras import Sequential, optimizers
from keras.layers import Dropout, BatchNormalization
# 1-D convolutional classifier over token-id sequences.
optm = optimizers.Adam(learning_rate=0.001)
model_cnn = Sequential()
# The embedding is initialised from the Word2Vec-derived matrix; its width is
# taken from that matrix so the two cannot drift apart.
model_cnn.add(Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix]))
model_cnn.add(Conv1D(128, 2, activation='tanh'))
model_cnn.add(MaxPooling1D(2, padding='same'))
model_cnn.add(BatchNormalization())
model_cnn.add(Flatten())
model_cnn.add(Dense(128, activation='tanh'))
model_cnn.add(Dense(1, activation='sigmoid'))

model_cnn.compile(optimizer=optm, loss='binary_crossentropy', metrics=['accuracy'])
# An Embedding layer consumes integer token ids, so the model is trained on
# the padded id sequences (X_train / X_test). The averaged Word2Vec vectors
# are floats and would be misread as indices.
model_cnn.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.8673 - loss: 0.3678 - val_accuracy: 0.4915 - val_loss: 0.6398
Epoch 2/10
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9033 - loss: 0.2820 - val_accuracy: 0.8997 - val_loss: 0.4330
Epoch 3/10
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9026 - loss: 0.2724 - val_accuracy: 0.9003 - val_loss: 0.2906
Epoch 4/10
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9086 - loss: 0.2621 - val_accuracy: 0.9121 - val_loss: 0.2901
Epoch 5/10
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9115 - loss: 0.2608 - val_accuracy: 0.9133 - val_loss: 0.2514
Epoch 6/10
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9125 - loss: 0.2577 - val_accuracy: 0.9142 - val_loss: 0.2540
Epoch 7/10
[1m210/210[0m

<keras.src.callbacks.history.History at 0x7da6c4acf820>

In [None]:
# Fully connected classifier on the averaged Word2Vec document vectors.
# Width of one document vector = number of input features.
n_features = np.array(X_train_vect_avg).shape[1]

# Three tanh hidden layers of shrinking width, each followed by lighter
# dropout, ending in a single sigmoid unit for the binary label.
model_fnn = Sequential([
    Dense(512, activation='tanh', input_dim=n_features),
    Dropout(0.4),
    Dense(256, activation='tanh'),
    Dropout(0.2),
    Dense(128, activation='tanh'),
    Dropout(0.1),
    Dense(1, activation='sigmoid'),
])

model_fnn.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
model_fnn.summary()

In [None]:
# Materialise the per-document average vectors as 2-D arrays once, then train
# with the held-out split as validation data.
train_features = np.array(X_train_vect_avg)
test_features = np.array(X_test_vect_avg)
model_fnn.fit(
    train_features,
    y_train,
    epochs=11,
    batch_size=32,
    validation_data=(test_features, y_test),
)

Epoch 1/11
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9503 - loss: 0.1281 - val_accuracy: 0.9764 - val_loss: 0.0687
Epoch 2/11
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9754 - loss: 0.0651 - val_accuracy: 0.9727 - val_loss: 0.0716
Epoch 3/11
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9737 - loss: 0.0739 - val_accuracy: 0.9785 - val_loss: 0.0679
Epoch 4/11
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.9791 - loss: 0.0597 - val_accuracy: 0.9767 - val_loss: 0.0700
Epoch 5/11
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.9781 - loss: 0.0704 - val_accuracy: 0.9788 - val_loss: 0.0632
Epoch 6/11
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9777 - loss: 0.0602 - val_accuracy: 0.9776 - val_loss: 0.0645
Epoch 7/11
[1m210/210[0m

<keras.src.callbacks.history.History at 0x7da6c8b3a500>

In [None]:
from tensorflow.keras.layers import LSTM, Bidirectional, SpatialDropout1D

# Single-direction LSTM classifier: a learned 4-dimensional embedding, one
# LSTM that returns only its final state, then a sigmoid output unit.
model1 = Sequential([
    Embedding(vocab_size, 4),
    SpatialDropout1D(0.2),
    LSTM(128, activation = 'tanh', return_sequences = False),
    Dropout(0.2),
    BatchNormalization(),
    Dense(1, activation='sigmoid'),
])

model1.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])

In [None]:
# model1 begins with an Embedding layer, which expects integer token ids.
# Train it on the padded id sequences rather than on the averaged Word2Vec
# vectors (floats that would be misread as indices).
model1.fit(X_train, y_train, epochs=11, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/11
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - accuracy: 0.8329 - loss: 0.4357 - val_accuracy: 0.5085 - val_loss: 0.5893
Epoch 2/11
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.9019 - loss: 0.3049 - val_accuracy: 0.8858 - val_loss: 0.4410
Epoch 3/11
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9129 - loss: 0.2741 - val_accuracy: 0.9121 - val_loss: 0.2536
Epoch 4/11
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.8976 - loss: 0.2972 - val_accuracy: 0.8927 - val_loss: 0.2791
Epoch 5/11
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9017 - loss: 0.2856 - val_accuracy: 0.9124 - val_loss: 0.2525
Epoch 6/11
[1m210/210[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.9104 - loss: 0.2781 - val_accuracy: 0.9127 - val_loss: 0.2721
Epoch 7/11
[1m210/210

<keras.src.callbacks.history.History at 0x7da6c183b790>

In [None]:
# Bidirectional LSTM classifier: a learned 4-dimensional embedding, a
# forward+backward LSTM, then a sigmoid output unit.
model_bil = Sequential()
model_bil.add(Embedding(vocab_size, 4))
model_bil.add(SpatialDropout1D(0.2))
# return_sequences=False: the classifier needs one vector per document, not
# one per timestep; otherwise Dense(1) would emit a prediction per token and
# no longer line up with the per-document labels. The former `input_shape`
# argument is dropped: it was an int rather than a shape tuple and sat on a
# layer that is not the first in the model.
model_bil.add(
    Bidirectional(LSTM(64, return_sequences=False, activation = 'tanh'))
)
model_bil.add(Dropout(0.2))
model_bil.add(BatchNormalization())
model_bil.add(Dense(1, activation='sigmoid'))

model_bil.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])


In [None]:
# model_bil begins with an Embedding layer, which expects integer token ids.
# Train it on the padded id sequences rather than on the averaged Word2Vec
# vectors (floats that would be misread as indices).
model_bil.fit(X_train, y_train, epochs=11, batch_size=32, validation_data=(X_test, y_test))