# Mount and module/library setup

In this step, the shortcut you set up in your Google Drive is mounted in this Google Colab notebook so that the project files can be accessed. It is important that the steps in the user guide were followed correctly and that no file names were changed.

A pop-up window will appear in which Google Colab requests access to your Google Drive; this is normal and must be accepted to proceed.

In [None]:
#Import Google Colab drive support, mount the drive, then enter the project directory for file access.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/adamcao-1906735-project/

#Install required libraries
!pip install -q tensorflow-ranking
!pip install -U tensorflow_text
!pip install -q tf-models-official==2.4.0
!pip install -U nltk

#Import all required modules and libraries
import pathlib
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_text as tf_text
from google.protobuf import text_format
import bz2
import json
import pandas as pd
import re
import random
from official.modeling import tf_utils
from official import nlp
from official.nlp import bert
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize


# Load NTCIR Collection

Below, the data provided by the NTCIR-15 research task is loaded into the notebook. (Sometimes there is a delay in the drive being mounted; restart the runtime to resolve this.)

In [None]:
NTCIRCollectionPath = "NTCIRCollection/"

# Topic files are tab-separated (query id, query string) and qrels files are
# space-separated (query id, document id, relevance level). reset_index()
# additionally stores the row number in an "index" column.
trainTopics = pd.read_csv(
    NTCIRCollectionPath + "data_search_e_train_topics.tsv",
    sep='\t',
    names=["QUERY ID", "QUERY STRING"],
).reset_index()
trainQrels = pd.read_csv(
    NTCIRCollectionPath + "data_search_e_train_qrels.txt",
    sep=' ',
    header=None,
    names=["QUERY ID", "DOC_ID","REL_LEVEL"],
).reset_index()
testTopics = pd.read_csv(
    NTCIRCollectionPath + "data_search_e_test_topics.tsv",
    sep='\t',
    names=["QUERY ID", "QUERY STRING"],
).reset_index()
testQrels = pd.read_csv(
    NTCIRCollectionPath + "data_search_e_test_qrels.txt",
    sep=' ',
    header=None,
    names=["QUERY ID", "DOC_ID","REL_LEVEL"],
).reset_index()
collectionPath = NTCIRCollectionPath + "data_search_e_collection.jsonl.bz2"

# Load the collection into a list of parsed JSON objects, one per line of the
# compressed file.
collectionArr = []

with bz2.BZ2File(collectionPath) as file:
  for line in file:
    line = line.decode().strip()
    # Lines holding only a list bracket carry no record.
    if line == "[" or line == "]":
      continue
    # Drop a single trailing comma so the line parses as standalone JSON.
    line = line[:-1] if line.endswith(",") else line
    collectionArr.append(json.loads(line))

In [None]:
# Peek at the data: every field of the first collection entry (name, then
# value, each on its own line), followed by the first rows of each table.
for x, y in collectionArr[0].items():
  print(x, y, sep="\n")

for frame in (trainTopics, trainQrels, testTopics, testQrels):
  print(frame.head())

# Data Splitting

To prepare the data for model training, the training topics provided in the NTCIR-15 collection are split in a 90/10 ratio into training and validation data. The validation data consists of 'high value' topics, i.e. topics that contain all three of the different relevancy scores, on which the models are evaluated.

In [None]:
#Split training topics into 90/10 Training/Validation set

# Count the judged documents per relevance level for every training topic.
# The qrels are walked once and matched through the dictionary, instead of
# rescanning every qrel row for every topic.
queryDict = {}
for queryID in trainTopics['QUERY ID']:
  queryDict[queryID] = {'L0': 0, 'L1': 0, 'L2': 0}

for queryID, relLevel in zip(trainQrels['QUERY ID'], trainQrels['REL_LEVEL']):
  if queryID in queryDict:
    queryDict[queryID][relLevel] += 1

for levelCounts in queryDict.values():
  levelCounts['total'] = levelCounts['L0'] + levelCounts['L1'] + levelCounts['L2']

# A topic is a validation candidate when it has more than one judged document
# at each of the three relevance levels. Entries are (query id, counts) pairs.
goodQueries = [
    entry for entry in queryDict.items()
    if entry[1]['L2'] > 1 and entry[1]['L1'] > 1 and entry[1]['L0'] > 1
]

#Random 10% of Training set
# random.sample draws distinct topics without replacement. Capping the sample
# size at the number of candidates avoids the endless retry loop that picking
# 10 distinct ids would cause when fewer than 10 topics qualify.
valCount = min(10, len(goodQueries))
if valCount < 10:
  print("Warning: only " + str(valCount) + " topics qualify for the validation set")
valIDs = random.sample([queryID for queryID, _ in goodQueries], valCount)

print(valIDs)

trainTopicsSplitList = []
valTopicsSplitList = []

# Route every topic row to the validation or the training list.
for _, topicRow in trainTopics.iterrows():
  if topicRow['QUERY ID'] in valIDs:
    valTopicsSplitList.append(topicRow)
  else:
    trainTopicsSplitList.append(topicRow)

#Final Split loaded into Pandas DataFrames
valTopicsSplit = pd.DataFrame(valTopicsSplitList)
trainTopicsSplit = pd.DataFrame(trainTopicsSplitList)

# Summarizer Function

This function is used during the formatting process to shorten description text and retain only key information. Truncating text that provides no context lets the BERT model train on more context information, because more informative word tokens fit within the maximum BERT sequence length.

In [None]:
# Input text - to summarize
def summarizer(text):
  """Return an extractive summary of ``text``.

  Each non-stop-word is weighted by how often it occurs in ``text``. A
  sentence's score is the sum of the weights of all such words that occur in
  the lower-cased sentence (a substring test, so a word also counts when it
  appears inside a longer word). Sentences scoring above 1.2 times the
  integer average sentence score are kept in their original order, each
  prefixed with a space, and any text enclosed in round or square brackets is
  then removed.

  Args:
    text: The text to summarize.

  Returns:
    The summary string. It is empty when no sentence scores above the
    threshold, or when no sentence could be scored at all (for example empty
    text or text made up only of stop words).
  """
  stopWords = set(stopwords.words("english"))

  # Frequency table of the lower-cased, non-stop-word tokens.
  freqTable = dict()
  for word in word_tokenize(text):
    word = word.lower()
    if word in stopWords:
      continue
    freqTable[word] = freqTable.get(word, 0) + 1

  # Score of each sentence; identical sentences share one accumulated entry.
  sentences = sent_tokenize(text)
  sentenceValue = dict()
  for sentence in sentences:
    loweredSentence = sentence.lower()
    for word, freq in freqTable.items():
      if word in loweredSentence:
        sentenceValue[sentence] = sentenceValue.get(sentence, 0) + freq

  # Without any scored sentence the average below would divide by zero.
  if not sentenceValue:
    return ''

  # Average value of a sentence from the original text
  average = int(sum(sentenceValue.values()) / len(sentenceValue))

  # Storing sentences into our summary.
  summary = ''
  for sentence in sentences:
    if sentence in sentenceValue and sentenceValue[sentence] > (1.2 * average):
      summary += " " + sentence

  # Raw string: the same pattern without invalid-escape warnings.
  summary = re.sub(r"[\(\[].*?[\)\]]", "", summary)

  return summary

# Title Feature Selection Formatting

The titleFormat function is run to format the loaded data into JSON files in preparation to be converted into ELWC format as BERT inputs.

This function converts the data, selecting only the Title metadata of each dataset as the feature text for BERT to use. The documents of each ranking problem are also sorted in descending order of relevancy.

It results in three JSON files being produced:

* Training Set
* Validation Set
* Test Set

These files will be saved to the FormattedData folder.

A successful execution will print the following output:



```
Topic ID: DS1-E-0096 | 86 / 86 Topic Ranking Problems
Completed writing to FormattedData/TitleTrain.json
TitleTrain.json data formatting complete.


Topic ID: DS1-E-0080 | 10 / 10 Topic Ranking Problems
Completed writing to FormattedData/TitleVal.json
TitleVal.json data formatting complete.


Topic ID: DS1-E-1096 | 96 / 96 Topic Ranking Problems
Completed writing to FormattedData/TitleTest.json
TitleTest.json data formatting complete.
```




In [None]:
#Formatting into JSON using just Title in feature selection
def titleFormat(Topics, Qrels, folderOutput, filename):
  """Export the ranking problems for ``Topics`` as JSON, using document titles as text.

  For every row of ``Topics`` the judged documents of that query are looked up
  in ``Qrels`` and in the module-level ``collectionArr`` and listed in
  descending relevance order (2, then 1, then 0). Judgements with any other
  relevance level, documents that are not in the collection, and topics that
  end up without documents are left out.

  Args:
    Topics: DataFrame with 'QUERY ID' and 'QUERY STRING' columns.
    Qrels: DataFrame with 'QUERY ID', 'DOC_ID' and 'REL_LEVEL' columns.
      'REL_LEVEL' may be numeric or a string such as 'L2'.
    folderOutput: Output folder; it is concatenated directly with
      ``filename``, so it has to end with a path separator.
    filename: Name of the JSON file to write.
  """
  # Index the collection by document id once, so a judged document is found
  # by lookup instead of scanning the whole collection for every judgement.
  # Lists keep every entry in collection order should an id occur twice.
  docsById = {}
  for data in collectionArr:
    docsById.setdefault(data['id'], []).append(data)

  # Group the judgements by query in a single pass, preserving file order.
  qrelsByQuery = {}
  for queryID, docID, relLevel in zip(Qrels['QUERY ID'], Qrels['DOC_ID'], Qrels['REL_LEVEL']):
    qrelsByQuery.setdefault(queryID, []).append((docID, relLevel))

  correctFormat = []
  NoDocIDs = len(Topics)
  topicRows = zip(Topics['QUERY ID'], Topics['QUERY STRING'])
  for counter, (queryID, queryString) in enumerate(topicRows, start=1):
    print("\rTopic ID: " + queryID + " | " + str(counter) + " / " + str(NoDocIDs) + " Topic Ranking Problems", end="")
    documentsByLevel = {2: [], 1: [], 0: []}
    for docID, relLevel in qrelsByQuery.get(queryID, []):
      # Levels are parsed only for the topics being exported. int() also
      # turns numeric values into plain ints that json can serialise.
      if isinstance(relLevel, str):
        relevance = int(relLevel.replace('L', ''))
      else:
        relevance = int(relLevel)
      if relevance not in documentsByLevel:
        continue
      for data in docsById.get(docID, []):
        documentsByLevel[relevance].append({ "relevance" : relevance, "docText": data['title'] })

    documents = documentsByLevel[2] + documentsByLevel[1] + documentsByLevel[0]
    if documents:
      correctFormat.append({ "queryText" : queryString, "documents" : documents})

  #Export
  JsonDict = {'rankingProblems': correctFormat}
  Output = folderOutput + filename

  with open(Output, 'w', encoding='utf-8') as f:
    json.dump(JsonDict, f, ensure_ascii=False, indent=1)

  print("")
  print("Completed writing to " + Output)
  print(filename + " data formatting complete.")
  print("\n")

# Write the title-only JSON export for the training, validation and test
# splits. The validation topics are judged in the training qrels.
titleFormat(Topics=trainTopicsSplit, Qrels=trainQrels,
            folderOutput="FormattedData/", filename="TitleTrain.json")
titleFormat(Topics=valTopicsSplit, Qrels=trainQrels,
            folderOutput="FormattedData/", filename="TitleVal.json")
titleFormat(Topics=testTopics, Qrels=testQrels,
            folderOutput="FormattedData/", filename="TitleTest.json")

# Title + Desc Feature Selection Formatting

The titleDescFormat function is run to format the loaded data into JSON files in preparation to be converted into ELWC format as BERT inputs.

This function converts the data, selecting the Title and Description metadata of each dataset as the feature text for BERT to use. The documents of each ranking problem are also sorted in descending order of relevancy.

It results in three JSON files being produced:

* Training Set
* Validation Set
* Test Set

These files will be saved to the FormattedData folder.

A successful execution will print the following output:



```
Topic ID: DS1-E-0096 | 86 / 86 Topic Ranking Problems
Completed writing to FormattedData/TitleDescTrain.json
TitleDescTrain.json data formatting complete.


Topic ID: DS1-E-0080 | 10 / 10 Topic Ranking Problems
Completed writing to FormattedData/TitleDescVal.json
TitleDescVal.json data formatting complete.


Topic ID: DS1-E-1096 | 96 / 96 Topic Ranking Problems
Completed writing to FormattedData/TitleDescTest.json
TitleDescTest.json data formatting complete.
```




In [None]:
#Formatting into JSON using Title and Description in feature selection
def titleDescFormat(Topics, Qrels, folderOutput, filename):
  """Export the ranking problems for ``Topics`` as JSON, using title plus description as text.

  For every row of ``Topics`` the judged documents of that query are looked up
  in ``Qrels`` and in the module-level ``collectionArr`` and listed in
  descending relevance order (2, then 1, then 0). Judgements with any other
  relevance level, documents that are not in the collection, and topics that
  end up without documents are left out.

  A document's text is its title when the description is empty. Otherwise it
  is the title, " || ", and the result of ``summarizer`` applied to the
  description, falling back to the full description when that result is
  empty. Newlines are removed from the description part.

  Args:
    Topics: DataFrame with 'QUERY ID' and 'QUERY STRING' columns.
    Qrels: DataFrame with 'QUERY ID', 'DOC_ID' and 'REL_LEVEL' columns.
      'REL_LEVEL' may be numeric or a string such as 'L2'.
    folderOutput: Output folder; it is concatenated directly with
      ``filename``, so it has to end with a path separator.
    filename: Name of the JSON file to write.
  """
  def buildDocText(data):
    # Single definition of the feature text shared by all relevance levels.
    description = data['description']
    if len(description) == 0:
      return data['title']
    descText = summarizer(description).replace('\n','')
    if len(descText) < 1:
      return data['title'] + " || " + description.replace('\n','')
    return data['title'] + " || " + descText

  # Index the collection by document id once, so a judged document is found
  # by lookup instead of scanning the whole collection for every judgement.
  # Lists keep every entry in collection order should an id occur twice.
  docsById = {}
  for data in collectionArr:
    docsById.setdefault(data['id'], []).append(data)

  # Group the judgements by query in a single pass, preserving file order.
  qrelsByQuery = {}
  for queryID, docID, relLevel in zip(Qrels['QUERY ID'], Qrels['DOC_ID'], Qrels['REL_LEVEL']):
    qrelsByQuery.setdefault(queryID, []).append((docID, relLevel))

  correctFormat = []
  NoDocIDs = len(Topics)
  topicRows = zip(Topics['QUERY ID'], Topics['QUERY STRING'])
  for counter, (queryID, queryString) in enumerate(topicRows, start=1):
    print("\rTopic ID: " + queryID + " | " + str(counter) + " / " + str(NoDocIDs) + " Topic Ranking Problems", end="")
    documentsByLevel = {2: [], 1: [], 0: []}
    for docID, relLevel in qrelsByQuery.get(queryID, []):
      # Levels are parsed only for the topics being exported. int() also
      # turns numeric values into plain ints that json can serialise.
      if isinstance(relLevel, str):
        relevance = int(relLevel.replace('L', ''))
      else:
        relevance = int(relLevel)
      if relevance not in documentsByLevel:
        continue
      for data in docsById.get(docID, []):
        documentsByLevel[relevance].append({ "relevance" : relevance, "docText": buildDocText(data) })

    documents = documentsByLevel[2] + documentsByLevel[1] + documentsByLevel[0]
    if documents:
      correctFormat.append({ "queryText" : queryString, "documents" : documents})

  #Export
  JsonDict = {'rankingProblems': correctFormat}
  Output = folderOutput + filename

  with open(Output, 'w', encoding='utf-8') as f:
    json.dump(JsonDict, f, ensure_ascii=False, indent=1)

  print("")
  print("Completed writing to " + Output)
  print(filename + " data formatting complete.")
  print("\n")

# Write the title + description JSON export for the training, validation and
# test splits. The validation topics are judged in the training qrels.
titleDescFormat(Topics=trainTopicsSplit, Qrels=trainQrels,
                folderOutput="FormattedData/", filename="TitleDescTrain.json")
titleDescFormat(Topics=valTopicsSplit, Qrels=trainQrels,
                folderOutput="FormattedData/", filename="TitleDescVal.json")
titleDescFormat(Topics=testTopics, Qrels=testQrels,
                folderOutput="FormattedData/", filename="TitleDescTest.json")

# Title + Tags Feature Selection Formatting

The titleTagsFormat function is run to format the loaded data into JSON files in preparation to be converted into ELWC format as BERT inputs.

This function converts the data, selecting the Title and Tags metadata of each dataset as the feature text for BERT to use. The documents of each ranking problem are also sorted in descending order of relevancy.

It results in three JSON files being produced:

* Training Set
* Validation Set
* Test Set

These files will be saved to the FormattedData folder.

A successful execution will print the following output:



```
Topic ID: DS1-E-0096 | 86 / 86 Topic Ranking Problems
Completed writing to FormattedData/TitleTagsTrain.json
TitleTagsTrain.json data formatting complete.


Topic ID: DS1-E-0080 | 10 / 10 Topic Ranking Problems
Completed writing to FormattedData/TitleTagsVal.json
TitleTagsVal.json data formatting complete.


Topic ID: DS1-E-1096 | 96 / 96 Topic Ranking Problems
Completed writing to FormattedData/TitleTagsTest.json
TitleTagsTest.json data formatting complete.
```




In [None]:
#Formatting into JSON using Title and Tags in feature selection
def titleTagsFormat(Topics, Qrels, folderOutput, filename):
  """Export the ranking problems for ``Topics`` as JSON, using title plus tags as text.

  For every row of ``Topics`` the judged documents of that query are looked up
  in ``Qrels`` and in the module-level ``collectionArr`` and listed in
  descending relevance order (2, then 1, then 0). Judgements with any other
  relevance level, documents that are not in the collection, and topics that
  end up without documents are left out.

  A document's text is its title, ", ", and every entry of
  ``data['data_fields']['tags']`` followed by a single space.

  Args:
    Topics: DataFrame with 'QUERY ID' and 'QUERY STRING' columns.
    Qrels: DataFrame with 'QUERY ID', 'DOC_ID' and 'REL_LEVEL' columns.
      'REL_LEVEL' may be numeric or a string such as 'L2'.
    folderOutput: Output folder; it is concatenated directly with
      ``filename``, so it has to end with a path separator.
    filename: Name of the JSON file to write.
  """
  # Index the collection by document id once, so a judged document is found
  # by lookup instead of scanning the whole collection for every judgement.
  # Lists keep every entry in collection order should an id occur twice.
  docsById = {}
  for data in collectionArr:
    docsById.setdefault(data['id'], []).append(data)

  # Group the judgements by query in a single pass, preserving file order.
  qrelsByQuery = {}
  for queryID, docID, relLevel in zip(Qrels['QUERY ID'], Qrels['DOC_ID'], Qrels['REL_LEVEL']):
    qrelsByQuery.setdefault(queryID, []).append((docID, relLevel))

  correctFormat = []
  NoDocIDs = len(Topics)
  topicRows = zip(Topics['QUERY ID'], Topics['QUERY STRING'])
  for counter, (queryID, queryString) in enumerate(topicRows, start=1):
    print("\rTopic ID: " + queryID + " | " + str(counter) + " / " + str(NoDocIDs) + " Topic Ranking Problems", end="")
    documentsByLevel = {2: [], 1: [], 0: []}
    for docID, relLevel in qrelsByQuery.get(queryID, []):
      # Levels are parsed only for the topics being exported. int() also
      # turns numeric values into plain ints that json can serialise.
      if isinstance(relLevel, str):
        relevance = int(relLevel.replace('L', ''))
      else:
        relevance = int(relLevel)
      if relevance not in documentsByLevel:
        continue
      for data in docsById.get(docID, []):
        # Every tag keeps its trailing space, including the last one.
        tags = "".join(tag + " " for tag in data['data_fields']['tags'])
        docText = data['title'] + ", " + tags
        documentsByLevel[relevance].append({ "relevance" : relevance, "docText": docText })

    documents = documentsByLevel[2] + documentsByLevel[1] + documentsByLevel[0]
    if documents:
      correctFormat.append({ "queryText" : queryString, "documents" : documents})

  #Export
  JsonDict = {'rankingProblems': correctFormat}
  Output = folderOutput + filename

  with open(Output, 'w', encoding='utf-8') as f:
    json.dump(JsonDict, f, ensure_ascii=False, indent=1)

  print("")
  print("Completed writing to " + Output)
  print(filename + " data formatting complete.")
  print("\n")

# Write the title + tags JSON export for the training, validation and test
# splits. The validation topics are judged in the training qrels.
titleTagsFormat(Topics=trainTopicsSplit, Qrels=trainQrels,
                folderOutput="FormattedData/", filename="TitleTagsTrain.json")
titleTagsFormat(Topics=valTopicsSplit, Qrels=trainQrels,
                folderOutput="FormattedData/", filename="TitleTagsVal.json")
titleTagsFormat(Topics=testTopics, Qrels=testQrels,
                folderOutput="FormattedData/", filename="TitleTagsTest.json")

# View Max List Size

This function can be run to view the maximum list size of a ranking problem in the exported JSON files. This information is useful for setting the list size model training parameter.

In [None]:
def listSize(jsondir):
  """Return the largest number of documents in any ranking problem of a JSON file.

  Args:
    jsondir: Path of a JSON file whose top-level 'rankingProblems' list holds
      entries with a 'documents' list.

  Returns:
    The length of the longest 'documents' list, or 0 when the file contains
    no ranking problems.
  """
  # The with-block closes the file even if parsing fails. UTF-8 is given
  # explicitly so that non-ASCII text is read independently of the locale.
  with open(jsondir, encoding='utf-8') as f:
    Data = json.load(f)
  return max((len(problem['documents']) for problem in Data['rankingProblems']), default=0)

# Print the maximum list size of the Title + Tags training, validation and
# test exports, one value per line.
print(listSize(jsondir='FormattedData/TitleTagsTrain.json'))
print(listSize(jsondir='FormattedData/TitleTagsVal.json'))
print(listSize(jsondir='FormattedData/TitleTagsTest.json'))

# JSON to ELWC Conversion

This takes the three exported JSON files and converts them from JSON into ELWC format as .tfrecords.

The sequence length must be set here; it must correspond to the sequence length used when training BERT on this data.

Example Output:

```
Utility to convert between JSON and ELWC for TFR-Bert

Model Parameters: 
Vocabulary filename: cased_L-12_H-768_A-12/vocab.txt
sequence_length: 256
do_lower_case: False
Input file:  FormattedData/TitleTagsTrain.json
Output file: FormattedData/TitleTagsData/256TrainELWC.tfrecord

Success. 

Utility to convert between JSON and ELWC for TFR-Bert

Model Parameters: 
Vocabulary filename: cased_L-12_H-768_A-12/vocab.txt
sequence_length: 256
do_lower_case: False
Input file:  FormattedData/TitleTagsVal.json
Output file: FormattedData/TitleTagsData/256ValELWC.tfrecord

Success. 

Utility to convert between JSON and ELWC for TFR-Bert

Model Parameters: 
Vocabulary filename: cased_L-12_H-768_A-12/vocab.txt
sequence_length: 256
do_lower_case: False
Input file:  FormattedData/TitleTagsTest.json
Output file: FormattedData/TitleTagsData/256TestELWC.tfrecord

Success. 
```



In [None]:
#For Title only feature selection
# Each command runs bertPython/tfrbert_convert_json_to_elwc.py on one exported
# JSON split (train, validation, test) with the vocabulary file
# cased_L-12_H-768_A-12/vocab.txt and a sequence length of 256, writing a
# .tfrecord file under FormattedData/TitleData/.
!BERT_DIR="cased_L-12_H-768_A-12"  && \
python bertPython/tfrbert_convert_json_to_elwc.py \
    --vocab_file=${BERT_DIR}/vocab.txt \
    --sequence_length=256 \
    --input_file=FormattedData/TitleTrain.json \
    --output_file=FormattedData/TitleData/256TrainELWC.tfrecord

!BERT_DIR="cased_L-12_H-768_A-12"  && \
python bertPython/tfrbert_convert_json_to_elwc.py \
    --vocab_file=${BERT_DIR}/vocab.txt \
    --sequence_length=256 \
    --input_file=FormattedData/TitleVal.json \
    --output_file=FormattedData/TitleData/256ValELWC.tfrecord

!BERT_DIR="cased_L-12_H-768_A-12"  && \
python bertPython/tfrbert_convert_json_to_elwc.py \
    --vocab_file=${BERT_DIR}/vocab.txt \
    --sequence_length=256 \
    --input_file=FormattedData/TitleTest.json \
    --output_file=FormattedData/TitleData/256TestELWC.tfrecord

In [None]:
#For Title + Desc feature selection
# Each command runs bertPython/tfrbert_convert_json_to_elwc.py on one exported
# JSON split (train, validation, test) with the vocabulary file
# cased_L-12_H-768_A-12/vocab.txt and a sequence length of 256, writing a
# .tfrecord file under FormattedData/TitleDescData/.
!BERT_DIR="cased_L-12_H-768_A-12"  && \
python bertPython/tfrbert_convert_json_to_elwc.py \
    --vocab_file=${BERT_DIR}/vocab.txt \
    --sequence_length=256 \
    --input_file=FormattedData/TitleDescTrain.json \
    --output_file=FormattedData/TitleDescData/256TrainELWC.tfrecord

!BERT_DIR="cased_L-12_H-768_A-12"  && \
python bertPython/tfrbert_convert_json_to_elwc.py \
    --vocab_file=${BERT_DIR}/vocab.txt \
    --sequence_length=256 \
    --input_file=FormattedData/TitleDescVal.json \
    --output_file=FormattedData/TitleDescData/256ValELWC.tfrecord

!BERT_DIR="cased_L-12_H-768_A-12"  && \
python bertPython/tfrbert_convert_json_to_elwc.py \
    --vocab_file=${BERT_DIR}/vocab.txt \
    --sequence_length=256 \
    --input_file=FormattedData/TitleDescTest.json \
    --output_file=FormattedData/TitleDescData/256TestELWC.tfrecord

In [None]:
#For Title + Tags feature selection
# Each command runs bertPython/tfrbert_convert_json_to_elwc.py on one exported
# JSON split (train, validation, test) with the vocabulary file
# cased_L-12_H-768_A-12/vocab.txt and a sequence length of 256, writing a
# .tfrecord file under FormattedData/TitleTagsData/.
!BERT_DIR="cased_L-12_H-768_A-12"  && \
python bertPython/tfrbert_convert_json_to_elwc.py \
    --vocab_file=${BERT_DIR}/vocab.txt \
    --sequence_length=256 \
    --input_file=FormattedData/TitleTagsTrain.json \
    --output_file=FormattedData/TitleTagsData/256TrainELWC.tfrecord

!BERT_DIR="cased_L-12_H-768_A-12"  && \
python bertPython/tfrbert_convert_json_to_elwc.py \
    --vocab_file=${BERT_DIR}/vocab.txt \
    --sequence_length=256 \
    --input_file=FormattedData/TitleTagsVal.json \
    --output_file=FormattedData/TitleTagsData/256ValELWC.tfrecord

!BERT_DIR="cased_L-12_H-768_A-12"  && \
python bertPython/tfrbert_convert_json_to_elwc.py \
    --vocab_file=${BERT_DIR}/vocab.txt \
    --sequence_length=256 \
    --input_file=FormattedData/TitleTagsTest.json \
    --output_file=FormattedData/TitleTagsData/256TestELWC.tfrecord