<a href="https://colab.research.google.com/github/Lokkamithran/FYP/blob/main/MuRIL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the gpustat command-line tool and run it once
# (presumably to check which GPU, if any, is attached to this runtime).
!pip install gpustat
!gpustat

In [None]:
# Attach Google Drive to the Colab filesystem so the project files become readable.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Show the working directory before and after switching into the project folder on the mounted Drive.
!pwd
import os
# Relative paths used later in the notebook (e.g. "../Final.csv", 'muril_y_classes.npy')
# are resolved from this directory.
os.chdir('/content/gdrive/My Drive/FYP_Colab/MuRIL')
!pwd

/content
/content/gdrive/My Drive/FYP_Colab/MuRIL


In [None]:
# Install the tokenizer dependencies; bert-for-tf2 presumably provides the `bert` package imported below.
# NOTE(review): neither install is version-pinned — consider pinning for reproducible runs.
!pip install bert-for-tf2
!pip install sentencepiece



In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import bert
# from tensorflow.keras.models import Model
from tqdm import tqdm
import numpy as np
import pandas as pd
from collections import namedtuple
from sklearn import preprocessing
from bert import bert_tokenization
import pickle

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Record the library versions this run used, next to the results.
for label, module in (("TensorFlow Version:", tf), ("Hub version: ", hub)):
    print(label, module.__version__)

TensorFlow Version: 2.15.0
Hub version:  0.16.1


In [None]:
# Load the full labelled dataset (a backtick-separated file one level above the working directory).
df = pd.read_csv("../Final.csv", sep = "`")
# print(df.head())

# Drop rows whose comment text is missing. dropna() has to be applied to the frame itself:
# calling it in place on df['Comment'] only changes a temporary Series and leaves df untouched,
# so NaN comments would still reach the tokenizer.
df = df.dropna(subset=['Comment'])

In [None]:
# Collapse the seven fine-grained offensive tags into one "Offensive" class (8 classes -> binary).
# The replacement is applied to every column of the frame, not only 'Fine Tag'.
offensive_subtypes = [
    "Offensive_race", "Offensive_caste", "Offensive_handicapped", "Offensive_women",
    "Offensive_religion", "Offensive_sexuality", "Offensive_others",
]
df = df.replace(to_replace=offensive_subtypes, value="Offensive")

# Class balance after the merge.
tag_counts = df['Fine Tag'].value_counts()
print(tag_counts)

Fine Tag
Offensive        5159
Not_offensive    3663
Name: count, dtype: int64


In [None]:
# Split comments/labels 80/20 and one-hot encode the labels of both splits.

# The label vocabulary is read from the whole frame; the encoder is fitted on the training split.
unique_labels = list(np.unique(df["Fine Tag"]))
noUniqueLabels = len(unique_labels)

total_x = df["Comment"].values
total_y = df["Fine Tag"].values

x_train, x_test, y_train, y_test = train_test_split(
    total_x, total_y, test_size=0.2, random_state=42
)

le = preprocessing.LabelEncoder()
y_fit = le.fit(y_train)

# Persist the class order so integer predictions can be mapped back to label names later.
np.save('muril_y_classes.npy', le.classes_)


def _one_hot(raw_labels):
    """Encode string labels with the fitted encoder and expand them to one-hot float32 rows."""
    return tf.keras.utils.to_categorical(
        y_fit.transform(raw_labels), num_classes=noUniqueLabels, dtype='float32'
    )


y_train = _one_hot(y_train)
y_test = _one_hot(y_test)


print("Number of unique labels: ", noUniqueLabels)

Number of unique labels:  2


In [None]:
# Check unique labels
print(unique_labels)

['Not_offensive', 'Offensive']


In [None]:
# Function to create input_ids
def get_ids(tokens, tokenizer, max_seq_length):
    """Map tokens to vocabulary ids and right-pad the result with zeros.

    Args:
        tokens: sequence of token strings.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)`` that returns a list.
        max_seq_length: target length; sequences already at or beyond it are
            returned unpadded (they are not truncated here).

    Returns:
        List of ids followed by as many 0s as needed to reach ``max_seq_length``.
    """
    vocab_ids = tokenizer.convert_tokens_to_ids(tokens)
    pad_count = max_seq_length - len(vocab_ids)
    # A non-positive pad_count yields an empty list, so nothing is appended.
    return vocab_ids + [0] * pad_count

# Function to create attention masks
def get_masks(tokens, max_seq_length):
    """Build the attention mask: 1 for every real token, 0 for every padding slot.

    Args:
        tokens: sequence of tokens (only its length is used).
        max_seq_length: target length of the mask; longer inputs get no padding.

    Returns:
        List of ``len(tokens)`` ones followed by zeros up to ``max_seq_length``.
    """
    real_count = len(tokens)
    mask = [1] * real_count
    mask.extend([0] * (max_seq_length - real_count))
    return mask

# Function to create segment ids
def get_segments(tokens, max_seq_length):
    """Build segment (token-type) ids: 0 up to and including the first "[SEP]", 1 afterwards.

    Args:
        tokens: sequence of token strings.
        max_seq_length: target length; padding positions are filled with 0.

    Returns:
        List of segment ids for the tokens, zero-padded up to ``max_seq_length``.
    """
    segment_ids = []
    past_first_sep = False
    for tok in tokens:
        # The separator itself still belongs to the segment it closes.
        segment_ids.append(1 if past_first_sep else 0)
        if tok == "[SEP]":
            past_first_sep = True
    padding = [0] * (max_seq_length - len(tokens))
    return segment_ids + padding

# Function to create input_ids, attention_masks, segment_ids for sample
def create_single_input(sentence,MAX_LEN, MAX_SEQ_LEN):
  """Turn one sentence into padded (input ids, attention mask, segment ids).

  Uses the module-level ``tokenizer``.

  Args:
    sentence: raw text to tokenize.
    MAX_LEN: maximum number of word pieces kept before the special tokens are added.
    MAX_SEQ_LEN: padded length of each returned list.

  Returns:
    Tuple ``(ids, masks, segments)`` of Python lists.
  """
  # Truncate first so that "[CLS]" and "[SEP]" still fit.
  word_pieces = tokenizer.tokenize(sentence)[:MAX_LEN]
  framed = ["[CLS]", *word_pieces, "[SEP]"]

  return (
      get_ids(framed, tokenizer, MAX_SEQ_LEN),
      get_masks(framed, MAX_SEQ_LEN),
      get_segments(framed, MAX_SEQ_LEN),
  )

def create_input_array(sentences, MAX_SEQ_LEN):
  """Encode every sentence and stack the results into three int32 arrays.

  Args:
    sentences: iterable of raw text strings.
    MAX_SEQ_LEN: padded sequence length, including the two special tokens.

  Returns:
    List ``[input_ids, input_masks, input_segments]`` of int32 NumPy arrays,
    one row per sentence.
  """
  columns = ([], [], [])  # ids, masks, segments

  for text in tqdm(sentences, position=0, leave=True):
    # Reserve two positions for "[CLS]" and "[SEP]".
    encoded = create_single_input(text, MAX_SEQ_LEN - 2, MAX_SEQ_LEN)
    for column, values in zip(columns, encoded):
      column.append(values)

  return [np.asarray(column, dtype=np.int32) for column in columns]

In [None]:
# Fetch MuRIL from TensorFlow Hub as a Keras layer whose weights stay trainable
muril_layer = hub.KerasLayer("https://tfhub.dev/google/MuRIL/1", trainable=True)

# Build the tokenizer from the vocabulary file and casing flag exposed by the hub module
muril_assets = muril_layer.resolved_object
vocab_file = muril_assets.vocab_file.asset_path.numpy()
do_lower_case = muril_assets.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Encode the train and test comments into input_ids / attention_masks / segment_ids,
# each padded to a fixed length of 128 tokens
max_seq_len = 128
x_train_array, x_test_array = (
    create_input_array(split, max_seq_len) for split in (x_train, x_test)
)

100%|██████████| 7057/7057 [00:04<00:00, 1570.04it/s]
100%|██████████| 1765/1765 [00:01<00:00, 1291.98it/s]


In [None]:
# print(x_train[0])
# print(x_train_array[2][0])

In [None]:
# Define model function - compile and fit
def model_fit(train_x, train_y, val_x, val_y, max_seq_length, num_epochs, muril_layer,
              learning_rate=2e-5, batch_size=32):
  """Build a classifier on top of MuRIL, train it, and return the fitted model.

  The output layer has ``noUniqueLabels`` units (module-level variable), so the
  label arrays are expected to be one-hot encoded with that many columns.

  Args:
    train_x: list ``[input_word_ids, input_mask, segment_ids]`` for training.
    train_y: one-hot training labels.
    val_x: same structure as ``train_x``, used as validation data.
    val_y: one-hot validation labels.
    max_seq_length: padded sequence length of the three input arrays.
    num_epochs: number of training epochs.
    muril_layer: the MuRIL ``hub.KerasLayer`` to fine-tune.
    learning_rate: Adam learning rate. Defaults to 2e-5; fine-tuning a
      pretrained transformer needs a small step size. The previous value of
      1e-1 overwrites the pretrained weights and drives the model to predict
      a single class for every input.
    batch_size: mini-batch size passed to ``fit``.

  Returns:
    The trained ``tf.keras.Model``.
  """
  input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
  input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
  segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")

  outputs = muril_layer(dict(input_word_ids = input_word_ids, input_mask = input_mask, input_type_ids = segment_ids))

  # Classify from the pooled sentence-level representation.
  x = tf.keras.layers.Dropout(0.1)(outputs["pooled_output"])

  final_output = tf.keras.layers.Dense(noUniqueLabels, activation="softmax")(x)

  model = tf.keras.models.Model(
      inputs=[input_word_ids, input_mask, segment_ids], outputs=final_output)

  # A softmax head over one-hot labels pairs with categorical cross-entropy. Binary
  # cross-entropy treats each output unit as an independent yes/no decision and makes
  # the 'accuracy' metric resolve to element-wise binary accuracy.
  model.compile(loss='categorical_crossentropy',
                  optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  metrics=['accuracy'])
  model.fit(train_x, train_y, epochs = num_epochs, batch_size = batch_size,
            validation_data = (val_x, val_y), shuffle = True)

  return model

In [None]:
# Number of passes over the training split
num_epochs = 1

# Build, compile and fine-tune the classifier; the test split is also used as validation data
model = model_fit(
    train_x=x_train_array,
    train_y=y_train,
    val_x=x_test_array,
    val_y=y_test,
    max_seq_length=max_seq_len,
    num_epochs=num_epochs,
    muril_layer=muril_layer,
)



In [None]:
# Predict class probabilities for the test split, then keep the most likely class index per row
class_probs = model.predict(x_test_array)
preds = class_probs.argmax(axis=1)
print(preds)

[1 1 1 ... 1 1 1]


In [None]:
# Persist the trained model. The context manager closes the file handle, which guarantees
# the bytes are flushed to Drive.
# NOTE(review): pickling a Keras model that wraps a hub layer is fragile across versions;
# consider model.save(...) instead, changed together with the loading cell.
with open('muril_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Reload the saved model, closing the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code — only load files produced by this notebook.
with open('muril_model.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [None]:
# Make predictions with the reloaded model
new_preds = pickled_model.predict(x_test_array)
# Reduce the reloaded model's own probabilities to class indices. Taking argmax of `preds`
# here was a bug: `preds` is already a 1-D index array, so axis=1 fails and the reloaded
# model's output is thrown away.
new_preds = np.argmax(new_preds, axis=1)
print(new_preds)

In [None]:
# Score the predictions. y_test is one-hot encoded while preds holds class indices, and
# sklearn rejects a mix of the two formats, so collapse the labels to class indices first.
# The ndim guard keeps the cell safe to re-run if y_test has already been collapsed.
y_true = np.argmax(y_test, axis=1) if np.ndim(y_test) > 1 else np.asarray(y_test)

accuracy = accuracy_score(y_true, preds)
f1 = f1_score(y_true, preds, average='weighted')

print(accuracy)
print(f1)

0.5809011051289317
0.42690348288736685


In [None]:
# Manual accuracy check: fraction of test rows whose predicted class equals the true class.
# `preds` already holds class indices, so it is compared directly. The earlier
# np.argmax(preds[i]) on a scalar always returned 0, so that loop only counted rows
# whose true label was class 0.
true_idx = np.argmax(y_test, axis=1) if np.ndim(y_test) > 1 else np.asarray(y_test)
pred_idx = np.argmax(preds, axis=1) if np.ndim(preds) > 1 else np.asarray(preds)

correct = int(np.sum(true_idx == pred_idx))
total = len(y_test)

print("Test accuracy: ", round(correct/total, 4))

# Notes from earlier runs:
# - Multi-class test accuracy did not seem to exceed .4915, and the model predicted the same
#   vector for EVERY test comment, with the highest probability on "Not_offensive".
# - Binary classification reached .5515 with an Adam learning rate of 0.001,
#   and .4522 with a learning rate of 0.1.

Test accuracy:  0.5809


In [None]:
# print(x_test[1], " ", y_test[1])

trailer la vidya balan missing.... why?   [1. 0. 0. 0. 0. 0. 0. 0.]
