In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, log_loss, classification_report, accuracy_score,  roc_auc_score, roc_curve, auc
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
df_fake = pd.read_csv("/kaggle/input/full-dataset/Fake.csv")
df_true = pd.read_csv("/kaggle/input/full-dataset/True.csv")

In [None]:
df_fake['Label'] = 0
df_true['Label'] = 1


In [None]:
df_fake.shape, df_true.shape

In [None]:
# Count the genuine-news articles whose body contains the literal "(Reuters)" tag.
j = sum(1 for article in df_true['text'] if '(Reuters)' in article)

In [None]:
j

In [None]:
# Hold out a small, fixed set of articles (10 fake, 11 true) in `df_manual`
# and remove them from the frames that feed the training pipeline.
# The ids are index *labels* of the freshly loaded CSVs, not positions.
fake_holdout_ids = list(range(516, 526))
true_holdout_ids = list(range(487, 498))

# Build the hold-out frame in a single concat instead of growing it one row at
# a time: this keeps the original column dtypes and avoids repeated in-place
# mutation of the source frames.
df_manual = pd.concat(
    [df_fake.loc[fake_holdout_ids], df_true.loc[true_holdout_ids]],
    ignore_index=True,
)[['title', 'text', 'subject', 'date', 'Label']]

# Drop the held-out rows from the training sources in one step each.
df_fake = df_fake.drop(index=fake_holdout_ids)
df_true = df_true.drop(index=true_holdout_ids)

In [None]:
df_fake.shape, df_true.shape

In [None]:
final_data = pd.concat([df_fake, df_true], axis=0)
final_data

In [None]:
# Shuffle the rows so fake and true articles are interleaved. The fixed seed
# keeps the row order -- and therefore every later split -- reproducible
# across kernel restarts.
final_data = final_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
final_data.drop_duplicates(inplace=True)

In [None]:
final_data.duplicated().sum()

In [None]:
def reuters_removal(text):
    """Strip the Reuters agency tag from an article.

    The text is split on whitespace, every token that is exactly
    ``"(Reuters)"`` or ``"Reuters"`` is discarded, and the surviving tokens
    are re-joined with single spaces.
    """
    agency_tags = ("(Reuters)", "Reuters")
    kept_tokens = []
    for token in text.split():
        if token in agency_tags:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [None]:
final_data['text']=final_data['text'].apply(reuters_removal)

In [None]:
final_data.isnull().sum()

In [None]:
final_data=final_data[["text","Label"]]
final_data

In [None]:
port_stemmer= PorterStemmer()

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
def stemmer(content):
    """Normalise an article for modelling.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    removed and the remaining words are stemmed with ``port_stemmer``.

    Parameters
    ----------
    content : str
        Raw article text.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    # Load the stop-word list once per call and keep it in a set. Evaluating
    # stopwords.words('english') inside the comprehension condition re-ran it
    # (and scanned the resulting list) for every single word.
    english_stopwords = set(stopwords.words('english'))

    letters_only = re.sub('[^a-zA-Z]', ' ', content)
    words = letters_only.lower().split()
    stemmed_words = [
        port_stemmer.stem(word)
        for word in words
        if word not in english_stopwords
    ]
    return ' '.join(stemmed_words)

In [None]:
final_data['text']= final_data['text'].apply(stemmer)

In [None]:
max_len=100
data_text=final_data["text"]
data_label=final_data["Label"]

In [None]:
! pip install transformers

In [None]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D,Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.python.ops.numpy_ops import np_utils
from transformers import BertModel, TFBertModel 
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras import regularizers
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
from matplotlib import rcParams
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

In [None]:
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(data_text, data_label, stratify = data_label, test_size = 0.3, random_state =42)

In [None]:
def tokenize(X, max_length=100):
    """Encode a collection of texts with ``bert_tokenizer``.

    Parameters
    ----------
    X : iterable of str
        Texts to encode; materialised with ``list(X)`` before tokenising.
    max_length : int, default 100
        Length every sequence is truncated / padded to. It has to agree with
        the sequence length the model's ``Input`` layers are built with.

    Returns
    -------
    The tokenizer output holding TensorFlow tensors; ``input_ids`` and
    ``attention_mask`` are requested, token type ids are not.
    """
    return bert_tokenizer(
        text=list(X),
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True,
    )

In [None]:
X_train_token = tokenize(X_train)
X_test_token = tokenize(X_test)

In [None]:
maxlen=100

In [None]:
from tensorflow.keras.regularizers import l2

In [None]:
def create_model():
    """Build the BERT-based binary classifier.

    Two int32 inputs of length ``maxlen`` (token ids and attention mask) are
    passed through ``bert_model``; element 1 of its output feeds a small head:
    Dropout(0.5) -> Dense(64, tanh, L2) -> Dropout(0.2) -> Dense(1, sigmoid, L2).

    Returns
    -------
    tensorflow.keras.Model
        Uncompiled model taking ``[input_ids, input_mask]``.
    """
    # Name the inputs explicitly: the fit / predict / evaluate cells feed
    # dicts keyed by 'input_1' and 'input_2'. Relying on Keras' automatic
    # layer numbering breaks as soon as this function runs a second time in
    # the same kernel (the auto-generated names are then different).
    input_ids = Input(shape=(maxlen,), dtype=tf.int32, name='input_1')
    input_mask = Input(shape=(maxlen,), dtype=tf.int32, name='input_2')

    # Element 1 of the BERT output -- presumably the pooled sentence
    # representation; confirm against the transformers version in use.
    bert_layer = bert_model([input_ids, input_mask])[1]

    x = Dropout(0.5)(bert_layer)
    x = Dense(64, activation="tanh", kernel_regularizer=l2(0.01))(x)
    x = Dropout(0.2)(x)
    x = Dense(1, activation="sigmoid", kernel_regularizer=l2(0.01))(x)
    return Model(inputs=[input_ids, input_mask], outputs=x)
    

In [None]:
model=create_model()
model.summary()

In [None]:
optimizer = Adam(learning_rate=1e-05, epsilon=1e-08, clipnorm=1.0)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Early stopping on the validation loss. A loss is something to minimise, so
# the comparison mode must be 'min'; with 'max' the callback would treat a
# *rising* val_loss as an improvement.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=7,baseline=0.4,min_delta=0.0001,restore_best_weights=False)

In [None]:
# Train the classifier on the tokenised training texts for up to 10 epochs,
# batch size 30, with the early-stopping callback attached.
# validation_split = 0.3 is passed, so the 'val_*' entries in `history` come
# from a slice of the training data, not from X_test.
# NOTE(review): the dict keys 'input_1' / 'input_2' presumably have to match
# the names of the model's Input layers -- confirm they do after a re-run.
history = model.fit(x = {'input_1':X_train_token['input_ids'],'input_2':X_train_token['attention_mask']}, y = Y_train, epochs=10, validation_split = 0.3, batch_size = 30, callbacks=[callback])

In [None]:
# Training vs. validation accuracy per epoch. The second curve is produced by
# the validation_split hold-out used during fit, not by the test set, so it is
# labelled 'validation'.
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.show()


In [None]:
# Training vs. validation loss per epoch. The second curve is produced by the
# validation_split hold-out used during fit, not by the test set, so it is
# labelled 'validation'.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(loc='upper left')
plt.show()

In [None]:
y_pred = np.where(model.predict({ 'input_1' : X_test_token['input_ids'] , 'input_2' : X_test_token['attention_mask']}) >=0.5,1,0)

In [None]:
!pip install mlxtend

In [None]:
from mlxtend.plotting import plot_confusion_matrix
conf_matrix = confusion_matrix(Y_test,y_pred)
fig, ax = plot_confusion_matrix(conf_mat=conf_matrix, figsize=(6, 6), cmap=plt.cm.Greens)
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

In [None]:
results = model.evaluate({'input_1': X_test_token['input_ids'], 'input_2': X_test_token['attention_mask']}, Y_test, verbose=0)
accuracy = results[1] * 100  
print(f'Total Accuracy: {accuracy:.2f}%')

In [None]:
!pip install pyLDAvis
!pip install gensim

In [None]:
import re
import string

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from pprint import pprint


In [None]:
!pip install --upgrade pyLDAvis

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis 
    

In [None]:
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

In [None]:
def _build_contractions_re(mapping):
    """Compile one alternation that matches any key of ``mapping`` literally."""
    # Longest keys first, so a compound such as "can't've" wins over its
    # prefix "can't"; re.escape keeps every key a literal match.
    ordered_keys = sorted(mapping, key=len, reverse=True)
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in ordered_keys))


# Pattern for the default dictionary, compiled once.
contractions_re = _build_contractions_re(contractions_dict)
_default_contractions = contractions_dict


def expand_contractions(text, contractions_dict=contractions_dict):
    """Replace every contraction found in ``text`` by its expansion.

    Parameters
    ----------
    text : str
        Text to expand.
    contractions_dict : dict, optional
        Mapping contraction -> expansion. Defaults to the notebook-level
        dictionary; a custom mapping gets its own pattern so that every match
        is guaranteed to be one of its keys.

    Returns
    -------
    str
        ``text`` with all matched contractions substituted.
    """
    if not contractions_dict:
        # Nothing to substitute (an empty alternation would match everywhere).
        return text
    if contractions_dict is _default_contractions:
        pattern = contractions_re
    else:
        pattern = _build_contractions_re(contractions_dict)
    return pattern.sub(lambda match: contractions_dict[match.group(0)], text)


# NOTE(review): by the time this cell runs, `stemmer` has already been applied
# to final_data['text'] and replaced every non-letter (apostrophes included)
# with a space, so no contraction key can match here. Contractions would have
# to be expanded before stemming for this step to have any effect.
final_data['text'] = final_data['text'].apply(expand_contractions)

In [None]:
final_data


In [None]:
# Split every document into a list of whitespace-separated tokens.
# The chained assignment binds the tokenised Series to `tokeize_text` AND
# writes it back into final_data['text'], so from here on that column holds
# lists instead of strings. Consequence: re-running this cell fails, because
# lists have no .split().
tokeize_text = final_data.loc[:, 'text'] = final_data['text'].apply(lambda x : x.split())
# Build the gensim dictionary from the tokenised documents.
id2word = corpora.Dictionary(tokeize_text)

In [None]:
# Bag-of-words corpus: one doc2bow result per tokenised document.
texts = tokeize_text
corpus = list(map(id2word.doc2bow, texts))

# Human-readable (word, frequency) view of the first two documents.
corpus_example = [
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[:2]
]
corpus_example[0][:20]

In [None]:
# Topic-model settings: number of topics and the two prior values handed to
# the model as `alpha` and `eta`.
n = 14
alpha = 0.5
beta = 0.5

lda_settings = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=n,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha=alpha,
    per_word_topics=True,
    eta=beta,
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_settings)

In [None]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
# Log-perplexity of the trained model on the corpus it was fitted to.
perplexity = lda_model.log_perplexity(corpus)
print('\nPerplexity : ', perplexity)

# c_v coherence, computed from the tokenised documents stored in final_data['text'].
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=final_data['text'],
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
def get_lda_features(texts, lda_model, dictionary):
    """Turn documents into dense topic-probability vectors.

    Parameters
    ----------
    texts : sequence of str
        Whitespace-tokenisable documents; must support ``len()``.
    lda_model
        Trained topic model exposing ``num_topics`` and
        ``get_document_topics``.
    dictionary
        Object exposing ``doc2bow`` (the dictionary the model was trained with).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(texts), lda_model.num_topics)``. Topics the
        model does not report for a document stay at 0.
    """
    lda_features = np.zeros((len(texts), lda_model.num_topics))
    for row, text in enumerate(texts):
        bow = dictionary.doc2bow(text.split())
        # Ask for the document-level distribution explicitly. Indexing the
        # model (`lda_model[corpus]`) and taking element 0 of each result only
        # yields that distribution when the model happens to have been built
        # with per_word_topics=True.
        doc_topics = lda_model.get_document_topics(bow, per_word_topics=False)
        for topic_num, prob in doc_topics:
            lda_features[row, topic_num] = prob
    return lda_features

In [None]:
train_lda_features = get_lda_features(X_train, lda_model, id2word)
test_lda_features = get_lda_features(X_test, lda_model, id2word)

In [None]:
import os

In [None]:
# Set CUDA_VISIBLE_DEVICES to "-1" for this process (apparently intended to
# push the following feature extraction onto the CPU).
# NOTE(review): TensorFlow was already imported and used for training in
# earlier cells; an environment change at this point presumably does not
# affect the already-initialised runtime -- confirm, or move this to the top
# of the notebook before TensorFlow is imported.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [None]:
batch_size = 32

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_token['input_ids'], X_train_token['attention_mask']))
train_dataset = train_dataset.batch(batch_size)


In [None]:
test_dataset = tf.data.Dataset.from_tensor_slices((X_test_token['input_ids'], X_test_token['attention_mask']))
test_dataset = test_dataset.batch(batch_size)

In [None]:
# Run the training batches through BERT and collect the first-token vector of
# output element 0. Only index 0 along axis 1 is used later on, so keeping
# the full per-token output of every batch just multiplies memory use by the
# sequence length. The `:1` slice keeps the array 3-D, so the later
# `np.vstack(...)` / `[:, 0, :]` steps work unchanged.
train_bert_features = []
for input_ids_batch, attention_mask_batch in train_dataset:
    sequence_output = bert_model([input_ids_batch, attention_mask_batch])[0]
    train_bert_features.append(sequence_output[:, :1, :].numpy())

In [None]:
# Run the test batches through BERT and collect the first-token vector of
# output element 0. Only index 0 along axis 1 is used later on, so keeping
# the full per-token output of every batch just multiplies memory use by the
# sequence length. The `:1` slice keeps the array 3-D, so the later
# `np.vstack(...)` / `[:, 0, :]` steps work unchanged.
test_bert_features = []
for input_ids_batch, attention_mask_batch in test_dataset:
    sequence_output = bert_model([input_ids_batch, attention_mask_batch])[0]
    test_bert_features.append(sequence_output[:, :1, :].numpy())

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout

In [None]:
train_bert_features = np.vstack(train_bert_features)
test_bert_features = np.vstack(test_bert_features)

In [None]:
train_bert_features_cls = train_bert_features[:, 0, :]  
test_bert_features_cls = test_bert_features[:, 0, :]    

In [None]:
def concatenate_features(bert_features, lda_features):
    """Join the BERT and LDA feature matrices column-wise (along axis 1)."""
    feature_blocks = (bert_features, lda_features)
    return np.concatenate(feature_blocks, axis=1)

In [None]:
train_features = concatenate_features(train_bert_features_cls, train_lda_features) 
test_features = concatenate_features(test_bert_features_cls, test_lda_features) 

In [None]:
from tensorflow.keras.layers import BatchNormalization

In [None]:
def build_combined_model(input_shape, num_classes):
    """Build the dense classifier that consumes the combined feature vectors.

    Architecture: two hidden blocks (128 then 64 units), each
    Dense(L2=0.02) -> BatchNormalization -> tanh -> Dropout(0.5), followed by
    a sigmoid output layer with ``num_classes`` units.

    Parameters
    ----------
    input_shape : int or tuple of int
        Feature dimension, either as a bare count or as a shape tuple.
    num_classes : int
        Number of output units.

    Returns
    -------
    tensorflow.keras.Model
        Uncompiled model.
    """
    # The caller passes a bare feature count; normalise it to a shape tuple
    # so the Input layer always receives the form it documents.
    if isinstance(input_shape, int):
        input_shape = (input_shape,)

    inputs = Input(shape=input_shape)
    x = inputs
    for units in (128, 64):
        x = Dense(units, kernel_regularizer=l2(0.02))(x)
        x = BatchNormalization()(x)
        # Use an Activation layer rather than calling the raw activation
        # function on a symbolic tensor, so the step is a proper model layer.
        x = tf.keras.layers.Activation('tanh')(x)
        x = Dropout(0.5)(x)

    # Sigmoid output for binary classification.
    outputs = Dense(num_classes, activation='sigmoid')(x)
    return Model(inputs=inputs, outputs=outputs)

In [None]:
input_shape = train_features.shape[1]
model = build_combined_model(input_shape, 1)  

In [None]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, baseline=0.4, min_delta=0.001,restore_best_weights=True)

In [None]:
optimizer = Adam(learning_rate=1e-04, epsilon=1e-08, clipnorm=1.0)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(train_features, Y_train, epochs=20,batch_size=32, validation_split=0.3,callbacks=[callback])

In [None]:
loss, accuracy = model.evaluate(test_features, Y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

In [None]:
# Accuracy curves of the combined model. The 'val_accuracy' series comes from
# the validation_split hold-out used during fit, not from the test set, so it
# is labelled as validation.
plt.plot(history.history['accuracy'], label='train accuracy')
plt.plot(history.history['val_accuracy'], label='validation accuracy')
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(loc='upper left')
plt.show()


In [None]:
# Loss curves of the combined model. The 'val_loss' series comes from the
# validation_split hold-out used during fit, not from the test set, so it is
# labelled as validation.
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper left')
plt.show()