In [1]:
import os
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import mixed_precision
from sklearn.metrics import confusion_matrix, classification_report 
from transformers import BertTokenizer
import tensorflow_addons as tfa

os.chdir('C:/Users/makri/OneDrive/Documents/GitHub/Deep_Learning_Final_Project')

data_train = pd.read_csv('all_train(1).tsv', sep='\t')
data_test = pd.read_csv('all_test_public.tsv', sep='\t')
data_validate = pd.read_csv('all_validate.tsv', sep='\t')

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.10.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        # Enable memory growth for the first (and only) GPU
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print(f"Memory growth enabled for {gpus[0]}")
    except RuntimeError as e:
        print(e)  # This happens if GPUs are initialized before setting memory growth
else:
    print("No GPU found. Running on CPU.")

Memory growth enabled for PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [3]:
BATCH_SIZE = 16
SEED = 42
# Training data
X_train = data_train['clean_title'].values  
y_train = data_train['2_way_label'].values 

# Validation data
X_val = data_validate['clean_title'].values
y_val = data_validate['2_way_label'].values


In [4]:
X_train = [str(x) for x in X_train]
X_val = [str(x) for x in X_val]

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(X_train, truncation=True, padding='max_length', max_length=30, return_tensors="tf")

val_encodings = tokenizer(
    X_val,
    truncation=True,
    padding='max_length',
    max_length=30,
    return_tensors="tf"
)



In [6]:
# Prepare dataset
inputs = {
    'input_word_ids': train_encodings['input_ids'],
    'input_mask': train_encodings['attention_mask'],
    'input_type_ids': train_encodings['token_type_ids']
}
labels = tf.cast(y_train, tf.float32)


val_inputs = {
    'input_word_ids': val_encodings['input_ids'],
    'input_mask': val_encodings['attention_mask'],
    'input_type_ids': val_encodings['token_type_ids']
}
val_labels = tf.cast(y_val, tf.float32)


# Now build dataset properly
train_ds = tf.data.Dataset.from_tensor_slices((inputs,  labels)).shuffle(buffer_size=len(X_train),seed=SEED).batch(BATCH_SIZE).cache().prefetch(tf.data.AUTOTUNE)
val_ds = tf.data.Dataset.from_tensor_slices((val_inputs, val_labels))\
         .batch(BATCH_SIZE)\
         .prefetch(tf.data.AUTOTUNE)


In [7]:
# Build Model
mixed_precision.set_global_policy('mixed_float16')

# BERT encoder 
bert_model = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3",
    trainable=True
)


# Inputs
input_ids = tf.keras.Input(shape=(30,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.Input(shape=(30,), dtype=tf.int32, name="input_mask")
type_ids = tf.keras.Input(shape=(30,), dtype=tf.int32, name="input_type_ids")

bert_inputs = {
    'input_word_ids': input_ids,
    'input_mask': input_mask,
    'input_type_ids': type_ids
}

bert_outputs = bert_model(bert_inputs)
cls_token = bert_outputs['pooled_output']

x = tf.keras.layers.Dropout(0.1)(cls_token)
x = tf.keras.layers.Dense(1, activation='sigmoid', dtype='float32')(x)

model = tf.keras.Model(inputs=[input_ids, input_mask, type_ids], outputs=x)
model.summary()


INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 4050 Laptop GPU, compute capability 8.9
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_mask (InputLayer)        [(None, 30)]         0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 30)]         0           []                               
                                                                                                  
 input_word_ids (InputLayer)    [(None, 30)]         0           []                               
                                        

In [None]:
model.compile(loss = tf.keras.losses.BinaryCrossentropy(), optimizer = tf.keras.optimizers.Adam(2e-5), metrics = ['accuracy'])

In [None]:
history = model.fit(train_ds, validation_data=val_ds, epochs=1)


Epoch 1/2




Epoch 2/2


In [10]:

X_test = data_test['clean_title'].values
y_test = data_test['2_way_label'].values


X_test = [str(x) for x in X_test]

test_encodings = tokenizer(X_test, truncation=True, padding='max_length', max_length=30, return_tensors="tf")

# Prepare dataset
inputs_test = {
    'input_word_ids': test_encodings['input_ids'],
    'input_mask': test_encodings['attention_mask'],
    'input_type_ids': test_encodings['token_type_ids']
}

# Predictions
predictions = model.predict(dict(inputs_test))

threshold = 0.5
preds = (predictions>threshold).astype(int)

print(classification_report(preds,y_test, target_names = ['Fake','Real']))

              precision    recall  f1-score   support

        Fake       0.89      0.90      0.89     49634
        Real       0.88      0.87      0.88     42810

    accuracy                           0.89     92444
   macro avg       0.89      0.89      0.89     92444
weighted avg       0.89      0.89      0.89     92444



In [25]:
model.save('./my_model_bert_base')



INFO:tensorflow:Assets written to: ./my_model_bert_base\assets


INFO:tensorflow:Assets written to: ./my_model_bert_base\assets
