# Installing Python Libraries



---



In [None]:
!pip install transformers
!pip install pycaret
!pip install numpy 
!pip install pandas
!pip install tensorflow
!pip install kaggle
!pip install gradio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m79.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://us

## Importing Dataset from Kaggle (User needs an API Key)

In [None]:
# Download the fake-and-real-news dataset archive with the Kaggle CLI
# (requires the user's Kaggle API key to be configured beforehand).
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset

Downloading fake-and-real-news-dataset.zip to /content
 93% 38.0M/41.0M [00:00<00:00, 127MB/s] 
100% 41.0M/41.0M [00:00<00:00, 118MB/s]


In [None]:
!unzip fake-and-real-news-dataset.zip

Archive:  fake-and-real-news-dataset.zip
  inflating: Fake.csv                
  inflating: True.csv                


# Preprocessing the Data from the Dataset



---



In [None]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer

# Fixed seed so the shuffle below -- and therefore the split made later by
# `validation_split` -- is the same on every run.
SEED = 42


def load_labelled_news(csv_path, label):
    """Read one news CSV, drop the unused columns, and tag every row.

    Args:
        csv_path: Path of the CSV file to read; it must contain the
            `subject` and `date` columns, which are dropped.
        label: Value written into a new `label` column for every row.

    Returns:
        The loaded DataFrame without `subject`/`date` and with `label` added.
    """
    frame = pd.read_csv(csv_path).drop(columns=["subject", "date"])
    frame['label'] = label
    return frame


fake_news = load_labelled_news('Fake.csv', 'fake')
true_news = load_labelled_news('True.csv', 'true')

data = pd.concat([fake_news, true_news], axis=0)

# Shuffle the data (seeded) and rebuild a clean 0..n-1 index
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)

data

Unnamed: 0,title,text,label
0,"Kim Davis Goes Off On Gay People, SCOTUS And ...",Just when you thought Kentucky County Clerk Ki...,fake
1,SEARING WaPo Op-Ed CRUSHES Rudy Giuliani’s Cl...,The rumors that Hillary Clinton is sick have b...,fake
2,Bombs kill Pakistani soldiers hunting U.S.-Can...,ISLAMABAD (Reuters) - Bomb blasts killed a Pak...,true
3,Turkey chides Arabs for 'weak' reaction ahead ...,ANKARA/ISTANBUL (Reuters) - Turkey criticized ...,true
4,WHY DEMOCRATS KEEP LOSING: Rep. Hakeem Jeffrie...,Hakeem Jeffries just exposed his Trump Derange...,fake
...,...,...,...
44893,Trump replaces chief of staff Priebus with ret...,WASHINGTON (Reuters) - President Donald Trump ...,true
44894,Senators press Trump for details on Icahn's sp...,(Reuters) - Six U.S. Senate Democrats on Thurs...,true
44895,Catalan leader Puigdemont to address Catalan p...,BARCELONA (Reuters) - Catalan leader Carles Pu...,true
44896,"Skirting Kurdish issue, France says Iraq's Aba...",PARIS (Reuters) - France appeared to backtrack...,true


In [None]:
# Length every title is padded/truncated to; the model-definition cell builds
# its input layers with the same value (`seq_length = 256`).
MAX_LENGTH = 256

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Encode all titles in one batched call instead of one `encode_plus` call per row.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` makes explicit the truncation that was previously applied
# implicitly (with a warning).
encoded_titles = tokenizer(
    data['title'].tolist(),
    add_special_tokens=True,
    max_length=MAX_LENGTH,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=True,
    return_tensors='tf',
)

# One row per title, MAX_LENGTH columns each.
input_ids = encoded_titles['input_ids']
attention_masks = encoded_titles['attention_mask']
token_type_ids = encoded_titles['token_type_ids']

# Create labels for the data: 1 for 'fake', 0 for anything else ('true').
labels = tf.convert_to_tensor((data['label'] == 'fake').astype('int32').to_numpy())

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# Model and Callbacks Definition



---



In [None]:
# Define the BERT model
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, Dropout
import tensorflow_hub as hub
from tensorflow.keras.models import Model


# Token length of every input sequence; the tokenization cell pads/truncates
# the titles to this same length (256).
seq_length = 256

# Symbolic inputs handed to the BERT encoder as a dict. `model.fit` feeds them
# with a dict using these same three keys.
encoder_inputs = dict(
    input_word_ids=Input(shape=(seq_length,), dtype=tf.int32),
    input_mask=Input(shape=(seq_length,), dtype=tf.int32),
    input_type_ids=Input(shape=(seq_length,), dtype=tf.int32),
)

# Pre-trained BERT encoder loaded from TF Hub; trainable=False keeps its weights
# fixed, so only the dense layers below are trained.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3", trainable=False)

bert_output = bert_layer(encoder_inputs)
pooled_output = bert_output["pooled_output"]
pooled_output = Dropout(0.2)(pooled_output)

# Classification head: two hidden layers with dropout, then a single sigmoid
# unit for the binary decision (label 1 = fake, 0 = true).
hidden_layer = Dense(256, activation='relu')(pooled_output)
hidden_layer = Dropout(0.2)(hidden_layer)
hidden_layer = Dense(64, activation='relu')(hidden_layer)
hidden_layer = Dropout(0.2)(hidden_layer)
output_layer = Dense(1, activation='sigmoid')(hidden_layer)

# Create a Keras model
model = Model(inputs=encoder_inputs, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
keras_callbacks = tf.keras.callbacks

# Settings shared by every callback that watches the validation loss.
val_loss_watch = dict(monitor='val_loss', mode='min', verbose=1)

# Write the weights (only) to disk each time the validation loss reaches a new minimum.
checkpoint_callback = keras_callbacks.ModelCheckpoint(
    filepath='my_model_weights.h5',
    save_best_only=True,
    save_weights_only=True,
    **val_loss_watch,
)

# Stop training after 5 epochs without a validation-loss improvement.
early_stopping_callback = keras_callbacks.EarlyStopping(
    patience=5,
    **val_loss_watch,
)

# Multiply the learning rate by 0.2 after 3 stagnant epochs, never going below 1e-4.
reduce_lr_callback = keras_callbacks.ReduceLROnPlateau(
    factor=0.2,
    patience=3,
    min_lr=0.0001,
    **val_loss_watch,
)

# Log graph, images and per-epoch histograms to ./logs for TensorBoard.
tensorboard_callback = keras_callbacks.TensorBoard(
    log_dir='./logs',
    histogram_freq=1,
    write_graph=True,
    write_images=True,
)


# Training the model


---



In [None]:
# Feed the three tokenizer outputs under the keys of the model's input dict.
train_inputs = {
    'input_word_ids': input_ids,
    'input_mask': attention_masks,
    'input_type_ids': token_type_ids,
}
training_callbacks = [
    checkpoint_callback,
    early_stopping_callback,
    reduce_lr_callback,
    tensorboard_callback,
]

# Train for 5 epochs in batches of 40, holding out 20% of the samples for validation.
model.fit(
    x=train_inputs,
    y=labels,
    batch_size=40,
    epochs=5,
    validation_split=0.2,
    callbacks=training_callbacks,
)


Epoch 1/5
Epoch 1: val_loss improved from inf to 0.29232, saving model to my_model_weights.h5
Epoch 2/5
Epoch 2: val_loss improved from 0.29232 to 0.22592, saving model to my_model_weights.h5
Epoch 3/5
Epoch 3: val_loss improved from 0.22592 to 0.22457, saving model to my_model_weights.h5
Epoch 4/5
Epoch 4: val_loss improved from 0.22457 to 0.21673, saving model to my_model_weights.h5
Epoch 5/5
Epoch 5: val_loss did not improve from 0.21673


<keras.callbacks.History at 0x7f3ea53da3d0>

# Saving the model


---



In [None]:
# Save the in-memory `model` to `model.h5`. This is the model as it stands after
# `fit` returned, which can differ from the best-epoch weights that the
# checkpoint callback wrote to `my_model_weights.h5`.
model.save('model.h5')

In [None]:
import tensorflow as tf

# Restore the best checkpoint into the already-built `model`.
# `my_model_weights.h5` was written with save_weights_only=True, so it holds
# weights only and cannot be opened with `load_model`; the architecture has to
# come from the model-definition cell, which must have been run first.
model.load_weights('my_model_weights.h5')

In [None]:
!cp model.h5 drive/MyDrive/

In [None]:
from google.colab import drive

# Directory under which Google Drive is made available in the Colab filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Block forever (presumably to keep the Colab runtime busy so it is not
# reclaimed -- confirm this is still needed). Sleeping between iterations avoids
# pinning a CPU core the way an empty `while True: pass` loop does.
import time

while True:
    time.sleep(60)