<a href="https://colab.research.google.com/github/dragonsan17/faq_retrieval_deep_learning/blob/main/transformers_workflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

This notebook has cells to run 3 transformer models: BERT, MuRIL and IndicBERT. Each model uses a different tokenization method, so the notebook carries a copy of the contents of preprocess_data.py; for evaluation, it carries the contents of calculate_performance.py.

# Imports and Repo Downloading

In [None]:
!pip install git+https://github.com/huggingface/transformers
!pip3 install sentencepiece #for IndicBERT
!pip install bert-for-tf2 #for MuRIL
!pip install tensorflow-text #for MuRIL

import os
from getpass import getpass
import urllib
import pandas as pd
import numpy as np
pd.set_option('max_colwidth', 1000)
from IPython.display import display
from collections import Counter
import warnings
warnings.filterwarnings('ignore') 

from transformers import TFBertModel, BertTokenizer, TFAutoModel, AutoTokenizer
import tensorflow as tf
from preprocess_data import preprocess_text
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tensorflow_text as text
from bert import bert_tokenization
import random

In [None]:
# Enter Username, Password and Repo name which will then download all the repo contents here in colab.
# You can then access and run all files of the same. This is to get an idea of how the code works in a cloud environment
# source : https://stackoverflow.com/questions/48350226/methods-for-using-git-with-google-colab
user = input('User name: ')
password = getpass('Password: ')
password = urllib.parse.quote(password)

# repo_name = input('Repo name: ')
repo_name = 'faq_retrieval_deep_learning'
cmd_string = 'git clone https://{0}:{1}@github.com/{0}/{2}.git'.format(user, password, repo_name)
os.system(cmd_string)
cmd_string, password = "", "" 

%cd faq_retrieval_deep_learning

import config

# Data Loading and Preprocessing

In [None]:
# Load the three CSV files shipped in the cloned repo's data/ directory.
# df_all_data is the lookup table used later for 'Answer Index' and theme lookups.
df_all_data = pd.read_csv('data/all_data.csv', encoding = 'utf-8')
# df_test holds the (q1, q2) pairs that are scored by the model and then evaluated.
df_test = pd.read_csv('data/test.csv', encoding = 'utf-8')
# NOTE(review): df_train is not referenced again in this notebook; the training
# cell re-reads data/train.csv into `train_data` instead — confirm it is still needed.
df_train = pd.read_csv('data/train.csv', encoding = 'utf-8')

In [None]:
# Column-name settings pulled from the repo's config module.
# NOTE(review): COLUMN_NAMES is not referenced anywhere else in this notebook, and
# TRAIN_COLUMN is reassigned to a string literal in the Evaluation cell — confirm
# both assignments are still needed.
COLUMN_NAMES = config.COLUMN_NAMES
TRAIN_COLUMN = config.TRAIN_COLUMN

def preprocess_text(tokenizer, ques1, ques2):
    """Turn question pairs into padded BERT-style input arrays.

    Each pair is laid out as ``[CLS] q1 [SEP] q2 [SEP]``. Segment ids are 0 for
    the first part (including ``[CLS]`` and the first ``[SEP]``) and 1 for the
    second part; the attention mask is 1 for every real token.

    Args:
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        ques1: iterable of first-question strings.
        ques2: iterable of second-question strings, paired with ``ques1`` by
            position (``zip`` stops at the shorter one).

    Returns:
        Tuple ``(input_ids, segment_ids, attention_masks)``; each is an integer
        array with one row per pair, padded/truncated to the module-level
        ``MAX_LEN`` columns.

    Note:
        Sequences longer than ``MAX_LEN`` are cut at the end, so a long pair
        loses its trailing tokens, including the final ``[SEP]``.
    """
    # `pad_sequences` was never imported in this notebook; resolve it through the
    # already-imported `tf` module so the function does not raise NameError.
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

    def _pad(rows):
        # Zero-pad (or truncate) every row at the end to exactly MAX_LEN entries.
        return np.array(pad_sequences(rows, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post"))

    input_ids = []
    segment_ids = []
    attention_masks = []
    for (q1, q2) in zip(ques1, ques2):
        # NOTE(review): the special markers are passed through tokenizer.tokenize()
        # as plain text; this assumes the tokenizer keeps '[CLS]'/'[SEP]' intact —
        # confirm for every tokenizer used with this function.
        token_q1 = tokenizer.tokenize('[CLS] ' + q1 + ' [SEP] ')
        token_q2 = tokenizer.tokenize(q2 + ' [SEP] ')

        token = token_q1 + token_q2
        segment_id = [0] * len(token_q1) + [1] * len(token_q2)

        input_ids.append(tokenizer.convert_tokens_to_ids(token))
        segment_ids.append(segment_id)
        attention_masks.append([1] * len(segment_id))

    return _pad(input_ids), _pad(segment_ids), _pad(attention_masks)

def scheduler(epoch, lr):
  """Two-phase learning-rate schedule for Keras' LearningRateScheduler.

  Returns LEARNING_RATE_1 while ``epoch`` is at most half of EPOCH_NUM and
  LEARNING_RATE_2 afterwards. The incoming ``lr`` is ignored.
  """
  in_first_phase = epoch <= EPOCH_NUM/2
  return LEARNING_RATE_1 if in_first_phase else LEARNING_RATE_2

# Wrap the schedule above in a Keras callback so it can be passed to model.fit().
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

train_data = pd.read_csv('data/train.csv')
train_labels = train_data['label']

# One-hot encode the labels: label 0 -> [1, 0], any other label -> [0, 1].
num_examples = len(train_labels)
one_hot_label = np.zeros((num_examples, 2))
hot_column = (train_labels != 0).to_numpy().astype(int)
one_hot_label[np.arange(num_examples), hot_column] = 1

# NOTE(review): `tokenizer` (and MAX_LEN, read inside preprocess_text) are not
# assigned anywhere above this line in the notebook; this statement only works
# after a model cell has been run — confirm the intended execution order.
test_input_ids, test_segment_ids, test_attention_masks = preprocess_text(
    tokenizer, df_test['q1'], df_test['q2']
)

# Models

In [None]:
"""
  BERT
"""

def build_model():
    """Build and compile the multilingual-BERT question-pair classifier.

    The network feeds three MAX_LEN-long int32 inputs into the pretrained
    'bert-base-multilingual-cased' encoder, takes its pooled output, and applies
    dropout (0.1) followed by a 2-unit dense layer and a softmax.

    Returns:
        (model, tokenizer): the compiled Keras model, whose inputs are ordered
        [input_ids, attention_masks, segment_ids], and the BertTokenizer loaded
        from the same checkpoint name.
    """
    checkpoint = 'bert-base-multilingual-cased'
    sequence_shape = (MAX_LEN,)

    # Inputs are created in the same order as before: ids, segments, masks.
    input_ids = tf.keras.layers.Input(sequence_shape, dtype=tf.int32)
    segment_ids = tf.keras.layers.Input(sequence_shape, dtype=tf.int32)
    attention_masks = tf.keras.layers.Input(sequence_shape, dtype=tf.int32)

    encoder = TFBertModel.from_pretrained(pretrained_model_name_or_path = checkpoint, return_dict=True)
    encoded = encoder(input_ids, attention_mask=attention_masks, token_type_ids=segment_ids)

    # Classification head on top of the pooled encoder output.
    head = tf.keras.layers.Dropout(0.1)(encoded.pooler_output)
    head = tf.keras.layers.Dense(2)(head)
    probabilities = tf.keras.layers.Activation('softmax')(head)

    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, segment_ids],
        outputs=[probabilities],
    )
    model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate= LEARNING_RATE_1),
        metrics=['acc'],
    )

    tokenizer = BertTokenizer.from_pretrained(checkpoint)
    return model, tokenizer

#training
# NOTE(review): MAX_LEN, LEARNING_RATE_1/2, EPOCH_NUM and BATCH_SIZE are only
# assigned inside the commented-out IndicBERT/MuRIL cells of this notebook —
# confirm where the BERT run is expected to get them from.
model, tokenizer = build_model()

# preprocess_text returns (ids, segments, masks); the model takes [ids, masks, segments].
train_input_ids, train_segment_ids, train_attention_masks = preprocess_text(
    tokenizer,
    train_data['q1'].values,
    train_data['q2'].values,
)

model.fit(
    x=[train_input_ids, train_attention_masks, train_segment_ids],
    y=one_hot_label,
    epochs=EPOCH_NUM,
    batch_size=BATCH_SIZE,
    callbacks=[callback],
    verbose=1,
)

def give_batch(i1, i2, i3, n=1):
    """Yield aligned slices of three sequences, ``n`` items at a time.

    Each yielded value is a list ``[i1_slice, i2_slice, i3_slice]`` covering the
    same index range; the last batch may be shorter. The number of batches is
    determined by ``len(i1)``.
    """
    total = len(i1)
    for start in range(0, total, n):
        stop = min(start + n, total)
        yield [sequence[start:stop] for sequence in (i1, i2, i3)]

#testing
# Tokenize the test pairs here, with the tokenizer returned alongside the model
# being evaluated. Relying on arrays produced in an earlier cell would use
# whatever tokenizer existed at that time (or fail on a fresh kernel because
# `tokenizer` had not been created yet).
test_input_ids, test_segment_ids, test_attention_masks = preprocess_text(tokenizer, df_test['q1'], df_test['q2'])

# Score the test set in batches of 50; the model takes [ids, masks, segments].
test_outputs = []
for batch in give_batch(test_input_ids, test_attention_masks, test_segment_ids, 50):
    test_outputs.extend(model(batch, training=False))

# Column 1 of the softmax output is the score of the label-1 ("positive") class.
df_test['positive_score'] = np.array(test_outputs)[:,1]

In [None]:
# """
#   IndicBERT
# """

# MAX_LEN = 256
# LEARNING_RATE_1 = 2e-6
# LEARNING_RATE_2 = 2e-5
# EPOCH_NUM = 6
# BATCH_SIZE = config.BATCH_SIZE

# def build_model():
    
#     input_ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
#     segment_ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
#     attention_masks = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    
#     tokenizer = AutoTokenizer.from_pretrained('neuralspace-reverie/indic-transformers-hi-bert')
#     indic_model = TFAutoModel.from_pretrained('neuralspace-reverie/indic-transformers-hi-bert')

#     x = indic_model(input_ids,attention_mask=attention_masks,token_type_ids  = segment_ids).pooler_output
#     x1 = tf.keras.layers.Dropout(0.1)(x) 
#     x1 = tf.keras.layers.Dense(2)(x1)
#     x1 = tf.keras.layers.Activation('softmax')(x1)

#     model = tf.keras.models.Model(inputs=[input_ids, attention_masks, segment_ids], outputs=[x1])
#     optimizer = tf.keras.optimizers.Adam(learning_rate= LEARNING_RATE_1)
#     model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['acc'])
#     return model, tokenizer

# #training
# model, tokenizer = build_model()
# train_input_ids, train_segment_ids, train_attention_masks = preprocess_text(tokenizer, train_data['q1'].values, train_data['q2'].values)
# model.fit([train_input_ids, train_attention_masks, train_segment_ids], one_hot_label, epochs=EPOCH_NUM, batch_size=BATCH_SIZE, callbacks=[callback], verbose=1)

# def give_batch(i1, i2, i3, n=1):
#     l = len(i1)
#     for index in range(0, l, n):
#         yield [i1[index:min(index + n, l)], i2[index:min(index + n, l)], i3[index:min(index + n, l)]]

# #testing
# test_outputs = []
# for batch in give_batch(test_input_ids, test_attention_masks, test_segment_ids, 50):
#     test_outputs.extend(model(batch, training=False))

# df_test['positive_score'] = np.array(test_outputs)[:,1]

In [None]:
# """
#   MuRIL
# """

# MAX_LEN = 256
# LEARNING_RATE_1 = 2e-5
# LEARNING_RATE_2 = 2e-5
# EPOCH_NUM = 5
# BATCH_SIZE = config.BATCH_SIZE

# def build_model_muril():
    
#     inputs = dict(
#       input_word_ids=tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32),
#       input_mask=tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32),
#       input_type_ids=tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32),
#     )

#     model_url="https://tfhub.dev/google/MuRIL/1"

#     muril_layer = hub.KerasLayer(model_url, trainable=True)
#     outputs = muril_layer(inputs)

#     vocab_file = muril_layer.resolved_object.vocab_file.asset_path.numpy()
#     do_lower_case = muril_layer.resolved_object.do_lower_case.numpy()
#     tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

#     assert 'sequence_output' in outputs
#     assert 'pooled_output' in outputs
#     assert 'encoder_outputs' in outputs
#     assert 'default' in outputs

#     x = outputs["pooled_output"]
#     x1 = tf.keras.layers.Dropout(0.1)(x) 
#     x1 = tf.keras.layers.Dense(2)(x1)
#     x1 = tf.keras.layers.Activation('softmax')(x1)

#     model = tf.keras.models.Model(inputs=inputs, outputs=[x1])
#     optimizer = tf.keras.optimizers.Adam(learning_rate= LEARNING_RATE_1)
#     model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['acc'])


#     return model, tokenizer

# #training
# model, tokenizer = build_model_muril()
# train_input_ids, train_segment_ids, train_attention_masks = preprocess_text(tokenizer, train_data['q1'].values, train_data['q2'].values)
# inputs = dict(
#           input_word_ids=train_input_ids,
#           input_mask=train_attention_masks,
#           input_type_ids=train_segment_ids,
#         )
# model.fit(inputs, one_hot_label, epochs=EPOCH_NUM, batch_size=BATCH_SIZE, callbacks=[callback], verbose=1)

# def give_batch(i1, i2, i3, n=1):
#     l = len(i1)
#     for index in range(0, l, n):
#         yield dict(input_word_ids=i1[index:min(index + n, l)], input_mask=i2[index:min(index + n, l)], input_type_ids=i3[index:min(index + n, l)])

# #testing
# test_outputs = []
# for batch in give_batch(test_input_ids, test_attention_masks, test_segment_ids, 50):
#     test_outputs.extend(model(batch, training=False))
    
# df_test['positive_score'] = np.array(test_outputs)[:,1]

# Evaluation

In [None]:
"""
  The contents of this cell are copied from calculate_performance.py
  The original Python file imports the saved BERT model, predicts and evaluates. The model-loading and prediction parts are not copied here.
"""

import config

from IPython.display import display
# df_all_data column matched against the candidate question (q2) of each pair.
# This overrides the TRAIN_COLUMN value taken from config earlier in the notebook.
TRAIN_COLUMN = 'Caller query transcription'
# df_all_data column matched against the query question (q1) of each pair.
TEST_COLUMN = 'STT Transcript'
# df_all_data column holding the theme used for the theme-filtered evaluation.
BROAD_THEME = 'Broad theme'

def performance_metric(df):
  """Compute ranking metrics for one query's ranked candidate list.

  ``df`` must have columns 'q1' (query) and 'q2' (candidate), and its index
  values are used as 0-based ranks. A row counts as correct when the
  'Answer Index' found in ``df_all_data`` for the query (via TEST_COLUMN)
  equals the one found for the candidate (via TRAIN_COLUMN).

  Returns:
      dict with keys 'SR@1', 'SR@3', 'SR@5', 'P@1', 'P@3', 'P@5', 'MRR', 'MAP'.

  Raises:
      IndexError: if a question has no matching row in ``df_all_data``.
      ZeroDivisionError: if ``df`` is empty.
  """

  def first_answer_index(column, question):
    # 'Answer Index' of the first df_all_data row whose `column` equals `question`.
    matching = df_all_data[df_all_data[column] == question]['Answer Index']
    return list(matching)[0]

  top_k = 5
  success_rate = [0] * top_k
  precision = [0] * top_k
  hits = 0
  precision_sum = 0
  reciprocal_rank = 0

  for rank, row in df.iterrows():
    expected_answer = first_answer_index(TEST_COLUMN, row['q1'])
    predicted_answer = first_answer_index(TRAIN_COLUMN, row['q2'])
    if expected_answer != predicted_answer:
      continue

    hits += 1
    precision_sum += hits/(rank + 1)

    # A hit at this rank counts for every cutoff from this rank up to 5.
    for cutoff in range(rank, top_k):
      success_rate[cutoff] = 1
      precision[cutoff] += 1/(cutoff + 1)

    # Only the first hit defines the reciprocal rank.
    if reciprocal_rank == 0:
      reciprocal_rank = 1/(rank + 1)

  # NOTE(review): the precision sum is divided by the number of rows in df,
  # not by the number of correct rows — confirm this is the intended MAP.
  average_precision = precision_sum / len(df)

  return {'SR@1' : success_rate[0], 'SR@3' : success_rate[2], 'SR@5' : success_rate[4],
          'P@1' : precision[0], 'P@3' : precision[2], 'P@5' : precision[4],
          'MRR' : reciprocal_rank, 'MAP' : average_precision}
  
# Running totals over all query questions; they are turned into averages below.
calculated_metric = {'SR@1' : 0, 'SR@3' : 0, 'SR@5' : 0,
                      'P@1' : 0, 'P@3' : 0, 'P@5' : 0,
                      'MRR' : 0, 'MAP' : 0}

calculated_metric_with_themes = {'SR@1' : 0, 'SR@3' : 0, 'SR@5' : 0,
                      'P@1' : 0, 'P@3' : 0, 'P@5' : 0,
                      'MRR' : 0, 'MAP' : 0}


def _print_metrics(title, metrics, total_queries):
    """Print one results section (hit counts, SR, P, MAP and MRR) for `metrics`."""
    print(title)
    print("Hit@1 : {}, 3: {}, 5 : {}, all : {}".format(metrics['Hit@1'], metrics['Hit@3'], metrics['Hit@5'], total_queries))
    print("SR@1 : {:.3f}, 3: {:.3f}, 5 : {:.3f}".format(metrics['SR@1'], metrics['SR@3'], metrics['SR@5']))
    print("P@1 : {:.3f}, 3: {:.3f}, 5 : {:.3f}".format(metrics['P@1'], metrics['P@3'], metrics['P@5']))

    print("MAP : {:.3f}".format(metrics['MAP']), end=", ")
    print("MRR : {:.3f}".format(metrics['MRR']))


# Group by a scalar key (not a one-element list) so get_group() can be called
# with the plain question string on every pandas version.
query_question_groups = df_test.groupby('q1')

for query_question in df_test['q1'].unique():
    # Work on a copy: columns are added/overwritten below and must not be
    # written into a slice of df_test.
    group = query_question_groups.get_group(query_question).copy()
    group['ai'] = [list(df_all_data[df_all_data[TRAIN_COLUMN] == ri]['Answer Index'])[0] for ri in list(group['q2'])]

    # Every candidate that maps to the same answer gets that answer's best
    # (maximum) score; then only one row per answer is kept.
    group['positive_score'] = group.groupby('ai')['positive_score'].transform('max')
    group = group.drop_duplicates(subset=['ai'])

    # Theme-aware variant: candidates from a different broad theme than the
    # query get their score forced to 0.
    query_question_theme = list(df_all_data[df_all_data[TEST_COLUMN] == query_question][BROAD_THEME])[0]
    group_with_themes = group.copy()

    for index, row in group_with_themes.iterrows():
        if query_question_theme != list(df_all_data[df_all_data[TRAIN_COLUMN] == row['q2']][BROAD_THEME])[0]:
            group_with_themes.loc[index, 'positive_score'] = 0

    # Rank by score and keep the ten best; performance_metric reads the fresh
    # 0-based index as the rank.
    group = group.sort_values(by=['positive_score'], ascending = False).reset_index(drop = True).head(10)
    group_with_themes = group_with_themes.sort_values(by=['positive_score'], ascending = False).reset_index(drop = True).head(10)

    calculated_metric_for_group = performance_metric(group)
    calculated_metric_for_group_with_themes = performance_metric(group_with_themes)

    for key in calculated_metric_for_group:
      calculated_metric[key] += calculated_metric_for_group[key]
      calculated_metric_with_themes[key] += calculated_metric_for_group_with_themes[key]

# Hit@k is the raw number of queries with a correct answer in the top k, so it
# is captured before the totals are averaged.
for metrics in (calculated_metric, calculated_metric_with_themes):
    for cutoff in (1, 3, 5):
        metrics['Hit@{}'.format(cutoff)] = metrics['SR@{}'.format(cutoff)]

# Average everything except the Hit@k counts over the number of query questions.
for key in calculated_metric:
  if 'Hit' not in key:
    calculated_metric[key] /= len(query_question_groups)
    calculated_metric_with_themes[key] /= len(query_question_groups)

total_queries = len(df_test['q1'].unique())

_print_metrics("Results without theme information : ", calculated_metric, total_queries)
# print("NDCG : {:.3f}".format(MDCG/deno_dd["Exist"]))

_print_metrics("Results with theme information : ", calculated_metric_with_themes, total_queries)