In [1]:
# importing dependencies 
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import os


In [2]:
# Reading and preparing the files

def read_file(file_name):
    """Return the complete text content of the file at ``file_name``.

    The file is decoded as UTF-8; byte sequences that are not valid UTF-8
    are dropped silently (``errors='ignore'``) instead of raising.
    """
    with open(file_name, mode="r", encoding="utf-8", errors="ignore") as handle:
        text = handle.read()
    return text

# Base directory containing all labeled folders.
# The location can be overridden with the DISSERTATION_DATA_DIR environment
# variable so the notebook is not tied to one machine; the original path is
# kept as the default.
base_dir = os.environ.get(
    "DISSERTATION_DATA_DIR",
    "/Users/jeromejaggi/Desktop/MSc Dissertation/6_Data",
)

# List of labels/folders; a folder's position in this list is its integer class id
folder_names = ["1_public", "2_Acquired", "3_Venture", "4_Closed"]
labels = {name: idx for idx, name in enumerate(folder_names)}

# Read each file and store the content and the label in a list of tuples
data = []
for folder_name in folder_names:
    folder_path = os.path.join(base_dir, folder_name)
    # os.listdir returns entries in arbitrary order; sort so the dataset is
    # built in the same order on every run.
    for file_name in sorted(os.listdir(folder_path)):
        # Skip hidden files (e.g. macOS ".DS_Store") so they are not ingested
        # as labeled training samples.
        if file_name.startswith("."):
            continue
        file_path = os.path.join(folder_path, file_name)
        # Sub-directories are ignored; only regular files are read
        if os.path.isfile(file_path):
            content = read_file(file_path)
            data.append((content, labels[folder_name]))



In [3]:
# Test/Train Split

# Convert the list of (content, label) tuples into a DataFrame
data_df = pd.DataFrame(data, columns=["content", "label"])

# Split the DataFrame into a training set and a validation set.
# test_size is the fraction of rows held out for validation (0.2 = 20%).
# random_state fixes the shuffle so the split is identical on every run.
train_df, val_df = train_test_split(data_df, test_size=0.2, random_state=42)

# Extract the content and the labels from the training set and the validation set
train_texts, train_labels = train_df["content"].tolist(), train_df["label"].tolist()
val_texts, val_labels = val_df["content"].tolist(), val_df["label"].tolist()


In [4]:
# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Upper bound on the number of tokens kept per document; longer texts are truncated
max_length = 512

# Both splits are encoded with exactly the same tokenizer settings
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=max_length)

train_encodings = tokenizer(train_texts, **tokenizer_kwargs)
val_encodings = tokenizer(val_texts, **tokenizer_kwargs)



In [5]:
# Prepare Dataset for Tensorflow

def _to_tensor_dict(encodings):
    """Wrap every field of a tokenizer output in a ``tf.constant``."""
    return {name: tf.constant(values) for name, values in encodings.items()}


# Tokenizer outputs -> dicts of tensors
train_encodings = _to_tensor_dict(train_encodings)
val_encodings = _to_tensor_dict(val_encodings)

# Label lists -> tensors
train_labels, val_labels = tf.constant(train_labels), tf.constant(val_labels)

# Each dataset element is a (features dict, label) pair
train_dataset = tf.data.Dataset.from_tensor_slices((train_encodings, train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_encodings, val_labels))



2023-08-22 14:04:20.506178: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Max
2023-08-22 14:04:20.506196: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2023-08-22 14:04:20.506202: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2023-08-22 14:04:20.506742: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-08-22 14:04:20.507123: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [6]:
# Inspect dataset and inputs

# Print the first five (features, label) elements of the training dataset,
# features first and then the label.
for example in train_dataset.take(5):
    for part in example:
        print(part)

{'input_ids': <tf.Tensor: shape=(512,), dtype=int32, numpy=
array([  101,  7065,  4747,  4904,  3167,  2769,  6112, 28494,  2358,
       14604, 25655,  1009,  4008, 28601,  2581, 18164, 15136,   999,
       28494,  1030,  7065,  4747,  4904,  1012,  4012,  3291,  5938,
        1998,  6016,  2769,  6917, 19237,  6450,  4297,  2239,  8159,
       11638,  1004, 13583,  2100,  2023,  3068,  2003,  2025, 20275,
        2664,  1012,  3006,  2946,  2892,  1011,  9598,  2439,  2349,
        2000,  9883, 19284, 11817,  1004, 20861,  1004,  4654,  4502,
        3215,  1002,  1015,  1010,  1018, 23458,  2866,  2003,  3988,
        4539,  3006,  1024,  1002,  1017,  8428,  3006,  2946,  1004,
        3438, 19968,  2078, 15183,  5576, 11320,  3863,  4604, 10882,
        2063,  1015,  1011,  3371,  4654, 24759,  5162,  7062,  2678,
        2449,  2944,  2489,  9598,  3863,  1004,  2769, 15210,  2039,
        2000,  3156,  7038,  2065,  2017,  2342,  2062,  1010, 13260,
        2814,  2030,  5454,  2

In [7]:
# Check data types

# The loop variable is deliberately not named `labels`: that name holds the
# folder-name -> class-id mapping, and reusing it here would overwrite the
# mapping with a tensor.
for inputs, label in train_dataset.take(1):
    print(inputs['input_ids'].dtype)
    print(label.dtype)

<dtype: 'int32'>
<dtype: 'int32'>


In [10]:
# Load model with classification layer and compile it with Adam optimizer

# One output unit per label folder, so the classification head stays in sync
# with the folders the data was loaded from.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(folder_names))
# NOTE(review): no loss is passed to compile(); this presumably relies on the
# model's built-in loss in the installed transformers version. Confirm.
model.compile(optimizer=Adam(3e-5))

# Train the model, shuffling enabled to prevent learning any unintended sequence patterns.
# The datasets are already batched with .batch(16), so batch_size is not passed
# to fit(): Keras documents that it must not be specified for tf.data inputs.
history = model.fit(train_dataset.shuffle(1000).batch(16), epochs=1, validation_data=val_dataset.batch(16))


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2023-08-24 23:08:02.501806: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-08-24 23:10:30.121810: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


