In [None]:
import os

In [None]:
# Kaggle API credentials are taken from the environment. Anything missing is
# requested interactively so that real secrets never live in the notebook source
# (and therefore never end up in version control or shared copies).
from getpass import getpass

for _cred_var, _cred_prompt in (
    ("KAGGLE_USERNAME", "Kaggle username: "),
    ("KAGGLE_KEY", "Kaggle API key: "),
):
    # Only prompt when the variable is unset or empty; pre-configured values win.
    if not os.environ.get(_cred_var):
        os.environ[_cred_var] = getpass(_cred_prompt)

In [None]:
!kaggle competitions download -c tensorflow2-question-answering

Downloading tensorflow2-question-answering.zip to /content
100% 4.47G/4.47G [04:25<00:00, 22.0MB/s]
100% 4.47G/4.47G [04:25<00:00, 18.1MB/s]


In [None]:
!unzip tensorflow2-question-answering.zip

Archive:  tensorflow2-question-answering.zip
  inflating: sample_submission.csv   
  inflating: simplified-nq-test.jsonl  
  inflating: simplified-nq-train.jsonl  


In [None]:
!pip install simpletransformers

In [None]:
import pandas as pd
import json
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import islice
import re

import simpletransformers
from simpletransformers.classification import ClassificationModel
from simpletransformers.question_answering import QuestionAnsweringModel
from IPython.core.display import display, HTML

In [None]:

# Load the sample submission file and display 10 randomly chosen rows of it.
df = pd.read_csv('sample_submission.csv')
df.sample(10)

Unnamed: 0,example_id,PredictionString
262,-7430088119366486278_long,
106,-3375977262054555871_long,
668,8898704006223012351_long,
618,7826148994851450630_long,
244,-700658616128388515_long,
144,-4106074886122933774_long,
213,-6323397732231431355_short,
119,-3564167666631898036_short,
633,8241570808696998156_short,
540,5478445368933023882_long,


In [None]:
def line_to_json(line):
    """Decode one line of JSON text into the corresponding Python object."""
    parsed = json.loads(line)
    return parsed


def get_question_and_document(line):
    """Extract the pieces of a decoded record that the loaders need.

    Args:
        line: mapping with the keys "question_text", "document_text" and
            "annotations".

    Returns:
        A 3-tuple ``(question, text, annotations)`` where ``question`` is the
        value of "question_text", ``text`` is "document_text" split on single
        spaces, and ``annotations`` is the first element of "annotations".
    """
    question_text = line["question_text"]
    document_tokens = line["document_text"].split(" ")
    return question_text, document_tokens, line["annotations"][0]


def get_long_candidate(i, annotations, candidate):
    """Describe one long-answer candidate.

    Args:
        i: position of the candidate in the record's candidate list.
        annotations: mapping whose ``["long_answer"]["candidate_index"]`` is
            compared against ``i``.
        candidate: mapping with "start_token" and "end_token".

    Returns:
        ``(label, long_start, long_end)``: ``label`` is True when ``i`` equals
        the annotated candidate index, otherwise False; the other two values are
        the candidate's start and end token offsets.
    """
    annotated_index = annotations["long_answer"]["candidate_index"]
    is_answer = bool(i == annotated_index)
    return is_answer, candidate["start_token"], candidate["end_token"]


def form_data_row(question, label, text, long_start, long_end):
    """Build one dataset row for a (question, candidate) pair.

    The candidate text is the slice ``text[long_start:long_end]`` joined back
    together with single spaces.

    Returns:
        dict with the keys "question", "long_answer" and "is_long_answer".
    """
    candidate_tokens = text[long_start:long_end]
    return dict(
        question=question,
        long_answer=" ".join(candidate_tokens),
        is_long_answer=label,
    )


def load_data(file_path, questions_start, questions_end, sample_rate=None):
    """Build a (question, candidate, label) DataFrame from a JSON-lines file.

    Lines ``questions_start`` (inclusive) to ``questions_end`` (exclusive) of
    ``file_path`` are decoded. For each record, the candidate matching the
    annotated long answer is always kept; of the remaining candidates only those
    whose index is a multiple of ``sample_rate`` are kept.

    Args:
        file_path: path to the JSON-lines file.
        questions_start: index of the first line to read.
        questions_end: index one past the last line to read.
        sample_rate: keep every ``sample_rate``-th candidate that is not the
            answer. Defaults to the module-level ``SAMPLE_RATE`` so existing
            three-argument calls behave as before.

    Returns:
        pandas.DataFrame with the columns produced by ``form_data_row``
        ("question", "long_answer", "is_long_answer").
    """
    if sample_rate is None:
        sample_rate = SAMPLE_RATE

    rows = []

    # Explicit encoding: do not depend on the platform's default codec.
    with open(file_path, encoding="utf-8") as file:
        selected_lines = islice(file, questions_start, questions_end)
        expected = max(questions_end - questions_start, 0)

        for line in tqdm(selected_lines, total=expected):
            json_doc = line_to_json(line)
            question, text, annotations = get_question_and_document(json_doc)

            # Distinct name for the candidate index (the original reused `i`
            # for both the outer and the inner loop).
            for candidate_idx, candidate in enumerate(json_doc["long_answer_candidates"]):
                label, long_start, long_end = get_long_candidate(
                    candidate_idx, annotations, candidate
                )

                # Always keep the answer; subsample everything else.
                if label or candidate_idx % sample_rate == 0:
                    rows.append(
                        form_data_row(question, label, text, long_start, long_end)
                    )

    return pd.DataFrame(rows)

In [None]:
# Source file and split sizes (counted in questions, i.e. lines of the file).
TRAIN_PATH = "./simplified-nq-train.jsonl"
TRAINING_SIZE = 1000
VALIDATION_SIZE = 1000
# load_data keeps non-answer candidates only when their index is a multiple of this.
SAMPLE_RATE = 15

# Training rows come from the first TRAINING_SIZE lines; validation rows from
# the VALIDATION_SIZE lines that follow them.
train_df = load_data(TRAIN_PATH, 0, TRAINING_SIZE)
val_df = load_data(TRAIN_PATH, TRAINING_SIZE, TRAINING_SIZE + VALIDATION_SIZE)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
train_df['is_long_answer'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
is_long_answer,Unnamed: 1_level_1
False,0.949422
True,0.050578


In [None]:
val_df['is_long_answer'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
is_long_answer,Unnamed: 1_level_1
False,0.946577
True,0.053423


In [None]:
pd.set_option('display.max_colwidth', None)
train_df.sample(10)

Unnamed: 0,question,long_answer,is_long_answer
7856,who controled the house and senate in 2015,"<Li> December 18 , 2014 : Venezuela Defense of Human Rights and Civil Society Act of 2014 , Pub. L. 113 -- 278 </Li>",False
9729,who won the football league championship in 1968,<Tr> <Td> Crystal Palace ! Crystal Palace </Td> <Td> 0 -- 1 </Td> <Td> 0 -- 0 </Td> <Td> 1 -- 0 </Td> <Td> 3 -- 1 </Td> <Td> 0 -- 3 </Td> <Td> 2 -- 0 </Td> <Td> 2 -- 1 </Td> <Td> 1 -- 1 </Td> <Td> 3 -- 0 </Td> <Td> </Td> <Td> 1 -- 0 </Td> <Td> 0 -- 1 </Td> <Td> 0 -- 1 </Td> <Td> 1 -- 3 </Td> <Td> 1 -- 3 </Td> <Td> 2 -- 2 </Td> <Td> 6 -- 0 </Td> <Td> 5 -- 0 </Td> <Td> 2 -- 2 </Td> <Td> 2 -- 0 </Td> <Td> 1 -- 0 </Td> <Td> 1 -- 0 </Td> </Tr>,False
9106,who the girl in somebody that i used to know,"<Table> <Tr> <Th_colspan=""2""> `` Somebody That I Used to Know '' </Th> </Tr> <Tr> <Td_colspan=""2""> </Td> </Tr> <Tr> <Th_colspan=""2""> Single by Gotye featuring Kimbra </Th> </Tr> <Tr> <Th_colspan=""2""> from the album Making Mirrors </Th> </Tr> <Tr> <Th> Released </Th> <Td> 5 July 2011 ( 2011 - 07 - 05 ) </Td> </Tr> <Tr> <Th> Format </Th> <Td> CD single , digital download , 7 '' vinyl ( promotional only ) </Td> </Tr> <Tr> <Th> Recorded </Th> <Td> January -- May 2011 The Barn ( Merricks , Australia ) , Lucas Taranto 's lounge room ( Melbourne , Australia ) </Td> </Tr> <Tr> <Th> Genre </Th> <Td> Art pop </Td> </Tr> <Tr> <Th> Length </Th> <Td> 4 : 04 ( Album version ) 3 : 33 ( Radio mix ) </Td> </Tr> <Tr> <Th> Label </Th> <Td> Eleven </Td> </Tr> <Tr> <Th> Songwriter ( s ) </Th> <Td> Wally de Backer </Td> </Tr> <Tr> <Th> Producer ( s ) </Th> <Td> De Backer </Td> </Tr> <Tr> <Th_colspan=""2""> Gotye singles chronology </Th> </Tr> <Tr> <Td_colspan=""2""> <Table> <Tr> <Td> `` Eyes Wide Open '' ( 2010 ) </Td> <Td> `` Somebody That I Used to Know '' ( 2011 ) </Td> <Td> `` I Feel Better '' ( 2011 ) </Td> </Tr> </Table> </Td> </Tr> <Tr> <Td_colspan=""2""> <P> </P> <Table> <Tr> <Td> `` Eyes Wide Open '' ( 2010 ) </Td> <Td> `` Somebody That I Used to Know '' ( 2011 ) </Td> <Td> `` I Feel Better '' ( 2011 ) </Td> </Tr> </Table> </Td> </Tr> <Tr> <Td_colspan=""2""> </Td> </Tr> <Tr> <Th_colspan=""2""> Kimbra singles chronology </Th> </Tr> <Tr> <Td_colspan=""2""> <Table> <Tr> <Td> `` Cameo Lover '' ( 2011 ) Cameo Lover 2011 </Td> <Td> `` Somebody That I Used to Know '' ( 2011 ) Somebody That I Used to Know2011 </Td> <Td> `` Good Intent '' ( 2011 ) Good Intent 2011 </Td> </Tr> </Table> </Td> </Tr> <Tr> <Td_colspan=""2""> </Td> </Tr> </Table>",False
6982,who was the first indian who win world junior badminton championship,<Table> <Tr> <Td> <Table> <Tr> <Th> Singles </Th> <Th> Played </Th> <Th> Wins </Th> <Th> Losses </Th> <Th> Balance </Th> </Tr> <Tr> <Td> Total * </Td> <Td> 477 </Td> <Td> 339 </Td> <Td> 138 </Td> <Td> + 201 </Td> </Tr> <Tr> <Td> Current year ( 2016 ) * </Td> <Td> 24 </Td> <Td> 17 </Td> <Td> 8 </Td> <Td> + 10 </Td> </Tr> </Table> </Td> <Td> <Table> <Tr> <Th> Doubles </Th> <Th> Played </Th> <Th> Wins </Th> <Th> Losses </Th> <Th> Balance </Th> </Tr> <Tr> <Td> Total * </Td> <Td> 33 </Td> <Td> 9 </Td> <Td> 24 </Td> <Td> − 15 </Td> </Tr> <Tr> <Td> Current year ( 2016 ) * </Td> <Td> 0 </Td> <Td> 0 </Td> <Td> 0 </Td> <Td> 0 </Td> </Tr> </Table> </Td> </Tr> </Table>,False
245,when do the eclipse supposed to take place,"<Table> <Tr> <Th_colspan=""2""> Solar eclipse of August 21 , 2017 </Th> </Tr> <Tr> <Td_colspan=""2""> Totality as seen from Simpsonville , South Carolina </Td> </Tr> <Tr> <Td_colspan=""2""> Map </Td> </Tr> <Tr> <Th_colspan=""2""> Type of eclipse </Th> </Tr> <Tr> <Th> Nature </Th> <Td> Total </Td> </Tr> <Tr> <Th> Gamma </Th> <Td> 0.4367 </Td> </Tr> <Tr> <Th> Magnitude </Th> <Td> 1.0306 </Td> </Tr> <Tr> <Th_colspan=""2""> Maximum eclipse </Th> </Tr> <Tr> <Th> Duration </Th> <Td> 160 sec ( 2 m 40 s ) </Td> </Tr> <Tr> <Th> </Th> <Td> 37 ° 00 ′ N 87 ° 42 ′ W ﻿ / ﻿ 37 ° N 87.7 ° W ﻿ / 37 ; - 87.7 </Td> </Tr> <Tr> <Th> Max . width of band </Th> <Td> 115 km ( 71 mi ) </Td> </Tr> <Tr> <Th_colspan=""2""> Times ( UTC ) </Th> </Tr> <Tr> <Th> ( P1 ) Partial begin </Th> <Td> 15 : 46 : 48 </Td> </Tr> <Tr> <Th> ( U1 ) Total begin </Th> <Td> 16 : 48 : 32 </Td> </Tr> <Tr> <Th> Greatest eclipse </Th> <Td> 18 : 26 : 40 </Td> </Tr> <Tr> <Th> ( U4 ) Total end </Th> <Td> 20 : 01 : 35 </Td> </Tr> <Tr> <Th> ( P4 ) Partial end </Th> <Td> 21 : 04 : 19 </Td> </Tr> <Tr> <Th_colspan=""2""> References </Th> </Tr> <Tr> <Th> Saros </Th> <Td> 145 ( 22 of 77 ) </Td> </Tr> <Tr> <Th> Catalog # ( SE5000 ) </Th> <Td> 9546 </Td> </Tr> </Table>",True
3703,how much money does argentina make from tourism,"<P> As of June 2015 , the government said that inflation was at 15.3 % ; approximately half that of some independent estimates . Inflation remained at around 18.6 % in 2015 according to an IMF estimate ; but following a sharp devaluation enacted by the Mauricio Macri administration on December 17 , inflation reignited during the first half of 2016 - reaching 42 % according to the Finance Ministry . </P>",False
78,what indian tribe did the acadians form friendships and alliances with,"<Li> 1859 , the first history of Acadia is published in French by Edme Rameau de Saint - Père ; Acadians begin to become aware of their own existence </Li>",False
5898,picture of the presidents of the united states,<Ul> <Li> Death of Richard Nixon </Li> <Li> Death of Spiro Agnew </Li> </Ul>,False
4935,michael jackson the best artist of all time,<Tr> <Td> <Dl> <Dt> World Music Awards </Dt> </Dl> </Td> <Td> 16 </Td> </Tr>,False
5852,picture of the presidents of the united states,<Ul> <Li> − 05 </Li> <Li> + 06 </Li> </Ul>,False


In [None]:
train_df['question'].values[10]

'what type of fertilisation takes place in humans'

In [None]:
text = train_df['long_answer'].values[10]
text

'<Tr> <Td> <Ul> <Li> Attachment </Li> <Li> Ecological </Li> <Li> Psychosocial </Li> <Li> Psychosexual development </Li> <Li> Moral </Li> <Li> Cognitive </Li> <Li> Cultural - historical </Li> <Li> Evolutionary </Li> </Ul> </Td> </Tr>'

In [None]:
# Delete every <...> span from the sample text, trim the ends, and show the result.
cleaned_text = re.sub(r"<[^>]*>", "", text).strip()
print(cleaned_text)

Attachment   Ecological   Psychosocial   Psychosexual development   Moral   Cognitive   Cultural - historical   Evolutionary


In [None]:
# Same experiment with `+` instead of `*`: a tag must contain at least one character.
cleaned_text = re.sub(r"<[^>]+>", "", text)
print(cleaned_text.strip())  # prints the tag-free text with leading/trailing whitespace trimmed


Attachment   Ecological   Psychosocial   Psychosexual development   Moral   Cognitive   Cultural - historical   Evolutionary


In [None]:
def remove_html(text):
    """Remove HTML tags from a string and normalise the leftover whitespace.

    Every ``<...>`` span is deleted. Because tags in the source text are
    surrounded by spaces, deleting them leaves runs of spaces behind; those runs
    (and any other whitespace) are collapsed to single spaces, and leading and
    trailing whitespace is dropped.

    Args:
        text: string that may contain HTML tags.

    Returns:
        The tag-free text with single-space separation ("" if nothing is left).
    """
    without_tags = re.sub(r"<[^>]+>", "", text)
    # split() with no argument discards all whitespace runs, including the ends.
    return " ".join(without_tags.split())

In [None]:
train_df['long_answer'].apply(remove_html)

Unnamed: 0,long_answer
0,( hide ) This article has multiple issues . Please help improve it or discuss these issues on the talk page . ( Learn how and when to remove these template messages ) This article needs additional citations for verification . Please help improve this article by adding citations to reliable sources . Unsourced material may be challenged and removed . ( September 2014 ) ( Learn how and when to remove this template message ) This article possibly contains original research . Please improve it by verifying the claims made and adding inline citations . Statements consisting only of original research should be removed . ( January 2015 ) ( Learn how and when to remove this template message ) ( Learn how and when to remove this template message )
1,Pay - per - click Cost per impression Search analytics Web analytics
2,"Email marketing has evolved rapidly alongside the technological growth of the 21st century . Prior to this growth , when emails were novelties to the majority of customers , email marketing was not as effective . In 1978 , Gary Thuerk of Digital Equipment Corporation ( DEC ) sent out the first mass email to approximately 400 potential clients via the Advanced Research Projects Agency Network ( ARPANET ) . This email resulted in $13 million worth of sales in DEC products , and highlighted the potential of marketing through mass emails . However , as email marketing developed as an effective means of direct communication , users began blocking out content from emails with filters and blocking programs . In order to effectively communicate a message through email , marketers had to develop a way of pushing content through to the end user , without being cut out by automatic filters and spam removing software . This resulted in the birth of triggered marketing emails , which are sent to specific users based on their tracked online browsing patterns ."
3,"Advertisers can reach substantial numbers of email subscribers who have opted in ( i.e. , consented ) to receive the email ."
4,"A common example of permission marketing is a newsletter sent to an advertising firm 's customers . Such newsletters inform customers of upcoming events or promotions , or new products . In this type of advertising , a company that wants to send a newsletter to their customers may ask them at the point of purchase if they would like to receive the newsletter ."
...,...
10197,"`` I Wanna Talk About Me '' Single by Toby Keith from the album Pull My Chain Released August 20 , 2001 Format CD single Recorded 2000 Genre Country , country rap Length 3 : 04 Label DreamWorks Songwriter ( s ) Bobby Braddock Producer ( s ) James Stroud Toby Keith Toby Keith singles chronology `` I 'm Just Talkin ' About Tonight '' ( 2001 ) `` I Wanna Talk About Me '' ( 2001 ) `` My List '' ( 2002 ) `` I 'm Just Talkin ' About Tonight '' ( 2001 ) `` I Wanna Talk About Me '' ( 2001 ) `` My List '' ( 2002 )"
10198,`` I 'm Just Talkin ' About Tonight '' ( 2001 ) `` I Wanna Talk About Me '' ( 2001 ) `` My List '' ( 2002 )
10199,Chart ( 2001 ) Position US Country Songs ( Billboard ) 60
10200,`` An apple a day keeps the doctor away '' is a common English - language proverb of Welsh origin . It espouses the folk - wisdom that apple consumption ( or consumption of fruits and vegetables in general ) has identifiable health benefits .


In [None]:
train_df['long_answer_cleaned'] = train_df['long_answer'].apply(remove_html)
train_df.sample(3)

Unnamed: 0,question,long_answer,is_long_answer,long_answer_cleaned
9918,when is the station of the cross done,"<Li> <P> 1st Station , Jesus is condemned to death </P> </Li>",False,"1st Station , Jesus is condemned to death"
1340,who does oregon state play in the college world series,"<Table> <Tr> <Th_colspan=""2""> College World Series </Th> </Tr> <Tr> <Td_colspan=""2""> </Td> </Tr> <Tr> <Th> First played </Th> <Td> 1947 </Td> </Tr> <Tr> <Th> Most recently played </Th> <Td> 2018 </Td> </Tr> <Tr> <Th> Current champions </Th> <Td> Oregon State ( 3rd title ) </Td> </Tr> <Tr> <Th> Current runner - up </Th> <Td> Arkansas </Td> </Tr> <Tr> <Th> Most titles </Th> <Td> USC ( 12 ) </Td> </Tr> </Table>",False,College World Series First played 1947 Most recently played 2018 Current champions Oregon State ( 3rd title ) Current runner - up Arkansas Most titles USC ( 12 )
2255,who changed the title of his work after the inquisition deemed it offensive,"<P> If the sentence was condemnatory , this implied that the condemned had to participate in the ceremony of an auto de fe ( more commonly known in English as an auto - da - fé ) that solemnized their return to the Church ( in most cases ) , or punishment as an impenitent heretic . The autos - da - fé could be private ( auto particular ) or public ( auto publico or auto general ) . </P>",False,"If the sentence was condemnatory , this implied that the condemned had to participate in the ceremony of an auto de fe ( more commonly known in English as an auto - da - fé ) that solemnized their return to the Church ( in most cases ) , or punishment as an impenitent heretic . The autos - da - fé could be private ( auto particular ) or public ( auto publico or auto general ) ."


In [None]:
import torch

# Training options handed to the classifier: two epochs, batches of 16.
model_args = dict(num_train_epochs=2, train_batch_size=16)

# Run on the GPU only when torch reports that CUDA is available.
use_cuda = torch.cuda.is_available()

model = ClassificationModel("bert", "bert-base-cased", args=model_args, use_cuda=use_cuda)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
model.get_named_parameters()

['bert.embeddings.word_embeddings.weight',
 'bert.embeddings.position_embeddings.weight',
 'bert.embeddings.token_type_embeddings.weight',
 'bert.embeddings.LayerNorm.weight',
 'bert.embeddings.LayerNorm.bias',
 'bert.encoder.layer.0.attention.self.query.weight',
 'bert.encoder.layer.0.attention.self.query.bias',
 'bert.encoder.layer.0.attention.self.key.weight',
 'bert.encoder.layer.0.attention.self.key.bias',
 'bert.encoder.layer.0.attention.self.value.weight',
 'bert.encoder.layer.0.attention.self.value.bias',
 'bert.encoder.layer.0.attention.output.dense.weight',
 'bert.encoder.layer.0.attention.output.dense.bias',
 'bert.encoder.layer.0.attention.output.LayerNorm.weight',
 'bert.encoder.layer.0.attention.output.LayerNorm.bias',
 'bert.encoder.layer.0.intermediate.dense.weight',
 'bert.encoder.layer.0.intermediate.dense.bias',
 'bert.encoder.layer.0.output.dense.weight',
 'bert.encoder.layer.0.output.dense.bias',
 'bert.encoder.layer.0.output.LayerNorm.weight',
 'bert.encoder.layer

In [None]:
model.model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
train_df['text_to_classify'] = train_df['question'] + ' [SEP] ' + train_df['long_answer_cleaned']
train_df.sample(3)

Unnamed: 0,question,long_answer,is_long_answer,long_answer_cleaned,text_to_classify
7913,who controled the house and senate in 2015,"<Li> Competitiveness , Innovation , and Export Promotion : Amy Klobuchar , Roy Blunt </Li>",False,"Competitiveness , Innovation , and Export Promotion : Amy Klobuchar , Roy Blunt","who controled the house and senate in 2015 [SEP] Competitiveness , Innovation , and Export Promotion : Amy Klobuchar , Roy Blunt"
5873,picture of the presidents of the united states,<Ul> <Li> + 22 </Li> <Li> − 15 </Li> </Ul>,False,+ 22 − 15,picture of the presidents of the united states [SEP] + 22 − 15
2071,when did the song old red come out,"<Li> <P> The Flag of the People 's Republic of China . Red symbolizes revolution , the large star is the Communist Party , and the smaller stars represent the working class , the peasants , and the urban middle class , the rural middle class , as described by Mao Zedong . </P> </Li>",False,"The Flag of the People 's Republic of China . Red symbolizes revolution , the large star is the Communist Party , and the smaller stars represent the working class , the peasants , and the urban middle class , the rural middle class , as described by Mao Zedong .","when did the song old red come out [SEP] The Flag of the People 's Republic of China . Red symbolizes revolution , the large star is the Communist Party , and the smaller stars represent the working class , the peasants , and the urban middle class , the rural middle class , as described by Mao Zedong ."


In [None]:
# Convert the boolean labels to integers (False -> 0, True -> 1) with an explicit
# dtype cast rather than a value-by-value replace; re-running the cell is a no-op.
train_df['is_long_answer'] = train_df['is_long_answer'].astype(int)

In [None]:
train_df['is_long_answer'].value_counts()

Unnamed: 0_level_0,count
is_long_answer,Unnamed: 1_level_1
0,9686
1,516


In [None]:
train_df.columns

Index(['question', 'long_answer', 'is_long_answer', 'long_answer_cleaned',
       'text_to_classify'],
      dtype='object')

In [None]:
df_train = train_df[['text_to_classify', 'is_long_answer']]
df_train

Unnamed: 0,text_to_classify,is_long_answer
0,which is the most common use of opt-in e-mail marketing [SEP] ( hide ) This article has multiple issues . Please help improve it or discuss these issues on the talk page . ( Learn how and when to remove these template messages ) This article needs additional citations for verification . Please help improve this article by adding citations to reliable sources . Unsourced material may be challenged and removed . ( September 2014 ) ( Learn how and when to remove this template message ) This article possibly contains original research . Please improve it by verifying the claims made and adding inline citations . Statements consisting only of original research should be removed . ( January 2015 ) ( Learn how and when to remove this template message ) ( Learn how and when to remove this template message ),0
1,which is the most common use of opt-in e-mail marketing [SEP] Pay - per - click Cost per impression Search analytics Web analytics,0
2,"which is the most common use of opt-in e-mail marketing [SEP] Email marketing has evolved rapidly alongside the technological growth of the 21st century . Prior to this growth , when emails were novelties to the majority of customers , email marketing was not as effective . In 1978 , Gary Thuerk of Digital Equipment Corporation ( DEC ) sent out the first mass email to approximately 400 potential clients via the Advanced Research Projects Agency Network ( ARPANET ) . This email resulted in $13 million worth of sales in DEC products , and highlighted the potential of marketing through mass emails . However , as email marketing developed as an effective means of direct communication , users began blocking out content from emails with filters and blocking programs . In order to effectively communicate a message through email , marketers had to develop a way of pushing content through to the end user , without being cut out by automatic filters and spam removing software . This resulted in the birth of triggered marketing emails , which are sent to specific users based on their tracked online browsing patterns .",0
3,"which is the most common use of opt-in e-mail marketing [SEP] Advertisers can reach substantial numbers of email subscribers who have opted in ( i.e. , consented ) to receive the email .",0
4,"which is the most common use of opt-in e-mail marketing [SEP] A common example of permission marketing is a newsletter sent to an advertising firm 's customers . Such newsletters inform customers of upcoming events or promotions , or new products . In this type of advertising , a company that wants to send a newsletter to their customers may ask them at the point of purchase if they would like to receive the newsletter .",1
...,...,...
10197,"toby keith - i wanna talk about me [SEP] `` I Wanna Talk About Me '' Single by Toby Keith from the album Pull My Chain Released August 20 , 2001 Format CD single Recorded 2000 Genre Country , country rap Length 3 : 04 Label DreamWorks Songwriter ( s ) Bobby Braddock Producer ( s ) James Stroud Toby Keith Toby Keith singles chronology `` I 'm Just Talkin ' About Tonight '' ( 2001 ) `` I Wanna Talk About Me '' ( 2001 ) `` My List '' ( 2002 ) `` I 'm Just Talkin ' About Tonight '' ( 2001 ) `` I Wanna Talk About Me '' ( 2001 ) `` My List '' ( 2002 )",0
10198,toby keith - i wanna talk about me [SEP] `` I 'm Just Talkin ' About Tonight '' ( 2001 ) `` I Wanna Talk About Me '' ( 2001 ) `` My List '' ( 2002 ),0
10199,toby keith - i wanna talk about me [SEP] Chart ( 2001 ) Position US Country Songs ( Billboard ) 60,0
10200,where does an apple a day keeps the doctor away come from [SEP] `` An apple a day keeps the doctor away '' is a common English - language proverb of Welsh origin . It espouses the folk - wisdom that apple consumption ( or consumption of fruits and vegetables in general ) has identifiable health benefits .,0


In [None]:
# Prepare the validation split with the same steps used for train_df:
# strip HTML, build the "<question> [SEP] <candidate>" input text, cast labels.
# (The original cell first ran apply(remove_html) once more and discarded the
# result; that redundant pass is removed.)
val_df['long_answer_cleaned'] = val_df['long_answer'].apply(remove_html)

val_df['text_to_classify'] = val_df['question'] + ' [SEP] ' + val_df['long_answer_cleaned']

# Explicit bool -> int cast (False -> 0, True -> 1).
val_df['is_long_answer'] = val_df['is_long_answer'].astype(int)

df_val = val_df[['text_to_classify', 'is_long_answer']]
df_val.sample(3)

Unnamed: 0,text_to_classify,is_long_answer
3315,"the hawaiian pīʻāpā consists of the following letters of the alphabet [SEP] Hawaiian alphabet Type Alphabet Languages Hawaiian language Creator American Protestant missionaries Time period 1822 -- present This article contains IPA phonetic symbols . Without proper rendering support , you may see question marks , boxes , or other symbols instead of Unicode characters . For an introductory guide on IPA symbols , see Help : IPA .",0
5437,"when is season four of the 100 coming out on netflix [SEP] 10 10 `` I Am Become Death '' Omar Madha T.J. Brady & Rasheed Newson May 21 , 2014 ( 2014 - 05 - 21 ) 2J7060 1.46",0
2448,which constitutional amendment introduced the right of due process answers.com [SEP] Congressional Apportionment Titles of Nobility Corwin Child Labor Equal Rights D.C. Voting Rights,0


In [None]:
model.train_model(df_train)

  self.pid = os.fork()


  0%|          | 0/20 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/638 [00:00<?, ?it/s]

Running Epoch 2 of 2:   0%|          | 0/638 [00:00<?, ?it/s]

(1276, 0.15193941803841754)

In [None]:
# model.eval_model(df_val)

  self.pid = os.fork()


  0%|          | 0/19 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/98 [00:00<?, ?it/s]

({'mcc': 0.4129338283967381,
  'accuracy': 0.9429945757854876,
  'f1_score': 0.7062020765305024,
  'tp': 221,
  'tn': 8993,
  'fp': 256,
  'fn': 301,
  'auroc': 0.9059980182179787,
  'auprc': 0.44064075654762924,
  'eval_loss': 0.19986175046283372},
 array([[-1.75      ,  2.0546875 ],
        [ 3.58203125, -3.18359375],
        [ 0.60351562, -0.66064453],
        ...,
        [ 2.02148438, -1.6171875 ],
        [ 1.60351562, -1.3671875 ],
        [ 2.6484375 , -2.5625    ]]),
 [{'guid': 0, 'text_a': "what type of oxide are formed when metal combine with oxygen [SEP] An oxide / ˈɒksaɪd / is a chemical compound that contains at least one oxygen atom and one other element in its chemical formula . `` Oxide '' itself is the dianion of oxygen , an O atom . Metal oxides thus typically contain an anion of oxygen in the oxidation state of − 2 . Most of the Earth 's crust consists of solid oxides , the result of elements being oxidized by the oxygen in air or in water . Hydrocarbon combustion a

In [None]:
val_performance, model_outputs, wrong_predictions = model.eval_model(df_val)

  self.pid = os.fork()


  0%|          | 0/19 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/98 [00:00<?, ?it/s]

In [None]:
val_performance

{'mcc': 0.4054445713678824,
 'accuracy': 0.9491351959881281,
 'f1_score': 0.6967355677823923,
 'tp': 180,
 'tn': 9094,
 'fp': 155,
 'fn': 342,
 'auroc': 0.9094433528901749,
 'auprc': 0.4215133232102052,
 'eval_loss': 0.1549128293991089}

In [None]:
def get_line_of_data(file):
    """Read the next line from an open text file and decode it as JSON."""
    return json.loads(file.readline())

def load_test_data(file_path, questions_start, questions_end):
    """Load questions and their long-answer candidate texts from a JSON-lines file.

    Lines ``questions_start`` (inclusive) to ``questions_end`` (exclusive) are
    read. The previous implementation always started at the first line of the
    file regardless of ``questions_start`` and raised a JSON decoding error when
    the file had fewer lines than requested; this version skips to the requested
    offset and simply stops at end of file.

    Args:
        file_path: path to the JSON-lines file.
        questions_start: index of the first line to read.
        questions_end: index one past the last line to read.

    Returns:
        pandas.DataFrame with one row per question and the columns "question"
        (the question text) and "text" (a list with one string per long-answer
        candidate, in candidate order).
    """
    rows = []

    # Explicit encoding: do not depend on the platform's default codec.
    with open(file_path, encoding="utf-8") as file:
        selected_lines = islice(file, questions_start, questions_end)
        expected = max(questions_end - questions_start, 0)

        for raw_line in tqdm(selected_lines, total=expected):
            line = json.loads(raw_line)
            question = line['question_text']
            text = line['document_text'].split(' ')

            # One string per candidate: the document tokens in the candidate's span.
            paragraphs = [
                ' '.join(text[candidate['start_token']:candidate['end_token']])
                for candidate in line['long_answer_candidates']
            ]

            rows.append({'question': question, 'text': paragraphs})

    return pd.DataFrame(rows)

In [None]:
# Test-set configuration.
TEST_PATH = "./simplified-nq-test.jsonl"
# NOTE(review): TEST_SIZE is not used by the load_test_data call below, which
# passes the literal 10 — confirm the intended number of test questions.
TEST_SIZE = 1000
# Re-assigns the SAMPLE_RATE global read by load_data (same value as before).
SAMPLE_RATE = 15

# NOTE(review): this is identical to TEST_PATH — presumably a separate output
# file was intended; confirm before anything is written to this path.
SUBMISSION_PATH = "./simplified-nq-test.jsonl"
# Load only the first 10 test questions.
test_df = load_test_data(TEST_PATH, 0, 10)

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
def remove_html_tags(text):
    """Remove HTML tags and square brackets from a string and tidy whitespace.

    Args:
        text: String that may contain tags such as ``<Td>``, ``</Tr>`` or
            ``<Th_colspan="2">``.

    Returns:
        The text with every tag and every ``[`` / ``]`` removed, runs of
        whitespace collapsed to a single space, and no leading or trailing
        whitespace.
    """
    # Only "<" followed by an optional "/" and a letter starts a tag, so
    # free-standing comparison signs ("5 < 7 and 9 > 3") are kept instead of
    # having the text between them swallowed. Each tag becomes a space so the
    # words on either side of it are never glued together.
    without_tags = re.sub(r'</?[A-Za-z][^<>]*>', ' ', text)
    without_brackets = re.sub(r'[\[\]]', '', without_tags)
    # Removing tags leaves gaps behind; collapse them and trim the ends.
    return re.sub(r'\s+', ' ', without_brackets).strip()

In [None]:
# Inspect the candidate paragraphs (still containing HTML tags) of the first test question.
test_df['text'].values[0]

['<Table> <Tr> <Th_colspan="2"> High Commission of South Africa in London </Th> </Tr> <Tr> <Td_colspan="2"> </Td> </Tr> <Tr> <Th> Location </Th> <Td> Trafalgar Square , London </Td> </Tr> <Tr> <Th> Address </Th> <Td> Trafalgar Square , London , WC2N 5DP </Td> </Tr> <Tr> <Th> Coordinates </Th> <Td> 51 ° 30 ′ 30 \'\' N 0 ° 07 ′ 37 \'\' W \ufeff / \ufeff 51.5082 ° N 0.1269 ° W \ufeff / 51.5082 ; - 0.1269 Coordinates : 51 ° 30 ′ 30 \'\' N 0 ° 07 ′ 37 \'\' W \ufeff / \ufeff 51.5082 ° N 0.1269 ° W \ufeff / 51.5082 ; - 0.1269 </Td> </Tr> <Tr> <Th> High Commissioner </Th> <Td> Vacant </Td> </Tr> </Table>',
 '<Tr> <Th_colspan="2"> High Commission of South Africa in London </Th> </Tr>',
 '<Tr> <Th> Location </Th> <Td> Trafalgar Square , London </Td> </Tr>',
 '<Tr> <Th> Address </Th> <Td> Trafalgar Square , London , WC2N 5DP </Td> </Tr>',
 "<Tr> <Th> Coordinates </Th> <Td> 51 ° 30 ′ 30 '' N 0 ° 07 ′ 37 '' W \ufeff / \ufeff 51.5082 ° N 0.1269 ° W \ufeff / 51.5082 ; - 0.1269 Coordinates : 51 ° 30 ′

In [None]:
# Strip the markup from every candidate paragraph and merge all candidates of
# a question into one space-separated string.
test_df['text_cleaned'] = test_df['text'].apply(
    lambda candidates: ' '.join(remove_html_tags(candidate) for candidate in candidates)
)
test_df['text_cleaned'].loc[0]

"High Commission of South Africa in London         Location   Trafalgar Square , London     Address   Trafalgar Square , London , WC2N 5DP     Coordinates   51 ° 30 ′ 30 '' N 0 ° 07 ′ 37 '' W \ufeff / \ufeff 51.5082 ° N 0.1269 ° W \ufeff / 51.5082 ; - 0.1269 Coordinates : 51 ° 30 ′ 30 '' N 0 ° 07 ′ 37 '' W \ufeff / \ufeff 51.5082 ° N 0.1269 ° W \ufeff / 51.5082 ; - 0.1269     High Commissioner   Vacant High Commission of South Africa in London Location   Trafalgar Square , London Address   Trafalgar Square , London , WC2N 5DP Coordinates   51 ° 30 ′ 30 '' N 0 ° 07 ′ 37 '' W \ufeff / \ufeff 51.5082 ° N 0.1269 ° W \ufeff / 51.5082 ; - 0.1269 Coordinates : 51 ° 30 ′ 30 '' N 0 ° 07 ′ 37 '' W \ufeff / \ufeff 51.5082 ° N 0.1269 ° W \ufeff / 51.5082 ; - 0.1269 High Commissioner   Vacant The High Commission of South Africa in London is the diplomatic mission from South Africa to the United Kingdom . It is located at South Africa House , a building on Trafalgar Square , London . As well as cont

In [None]:
# Build the "<question> [SEP] <cleaned document text>" string for every test row.
test_df['text_to_classify'] = test_df['question'].str.cat(
    test_df['text_cleaned'], sep=' [SEP] '
)
df_test = test_df['text_to_classify']
df_test.sample(n=3)

Unnamed: 0,text_to_classify
6,"what are the minds two tracks and what is dual processing [SEP] In psychology , a dual process theory provides an account of how thought can arise in two different ways , or as a result of two different processes . Often , the two processes consist of an implicit ( automatic ) , unconscious process and an explicit ( controlled ) , conscious process . Verbalized explicit processes or attitudes and actions may change with persuasion or education ; though implicit process or attitudes usually take a long amount of time to change with the forming of new habits . Dual process theories can be found in social , personality , cognitive , and clinical psychology . It has also been linked with economics via prospect theory and behavioral economics , and increasingly in sociology through cultural analysis . The foundations of dual process theory likely comes from William James . He believed that there were two different kinds of thinking : associative and true reasoning . James theorized that..."
8,"when did they finish building the sydney opera house [SEP] Sydney Opera House Location of Sydney Opera House Sydney Opera House ( New South Wales ) Sydney Opera House ( Australia ) Show map of Sydney Show map of New South Wales Show map of Australia Show all General information Status Complete Type Performing arts centre Architectural style Expressionist Location Bennelong Point , Sydney Country Australia Coordinates 33 ° 51 ′ 31 '' S 151 ° 12 ′ 51 '' E ﻿ / ﻿ 33.85861 ° S 151.21417 ° E ﻿ / - 33.85861 ; 151.21417 Coordinates : 33 ° 51 ′ 31 '' S 151 ° 12 ′ 51 '' E ﻿ / ﻿ 33.85861 ° S 151.21417 ° E ﻿ / - 33.85861 ; 151.21417 Elevation 4 m ( 13 ft ) Current tenants Opera Australia The Australian Ballet Sydney Theatre Company Sydney Symphony Orchestra ( + others ) Groundbreaking 1 March 1959 Construction started 1 March 1959 Completed Opened 20 October 1973 Inaugurated 20 October 1973 ..."
1,"the office episode when they sing to michael [SEP] `` Michael 's Last Dundies '' The Office episode Episode no . Season 7 Episode 21 Directed by Mindy Kaling Written by Mindy Kaling Production code 7021 Original air date April 21 , 2011 Running time 22 minutes Guest appearance ( s ) Will Ferrell as Deangelo Vickers Jack Coleman as State Sen. Robert Lipton Episode chronology ← Previous `` Training Day '' Next → `` Goodbye , Michael '' The Office ( U.S. season 7 ) List of The Office ( U.S. TV series ) episodes `` Michael 's Last Dundies '' The Office episode Episode no . Season 7 Episode 21 Directed by Mindy Kaling Written by Mindy Kaling Production code 7021 Original air date April 21 , 2011 Running time 22 minutes Guest appearance ( s ) Will Ferrell as Deangelo Vickers Jack Coleman as State Sen. Robert Lipton Will Ferrell as Deangelo Vickers Jack Coleman as State Sen. Robert Lipton Will Ferrel..."


In [None]:
# Pick one loaded test question at random. The bounds cover every row of
# test_df: the previous randint(1, 10) could never select row 0 and silently
# assumed the frame holds exactly 10 rows.
IDX = np.random.randint(0, len(test_df))
selected_row = test_df.iloc[IDX]
question, paragraphs = selected_row['question'], selected_row['text']
question

'where was the first sample of ascorbic acid isolated from'

In [None]:
# Candidate paragraphs (still containing HTML tags) of the selected question.
paragraphs

['<Table> L - Ascorbic acid <Tr> <Td_colspan="2"> </Td> </Tr> <Tr> <Td_colspan="2"> </Td> </Tr> <Tr> <Th_colspan="2"> Names </Th> </Tr> <Tr> <Td_colspan="2"> IUPAC name ( 5R ) - ( ( 1S ) - 1 , 2 - Dihydroxyethyl ) - 3 , 4 - dihydroxyfuran - 2 ( 5H ) - one </Td> </Tr> <Tr> <Td_colspan="2"> Other names Vitamin C </Td> </Tr> <Tr> <Th_colspan="2"> Identifiers </Th> </Tr> <Tr> <Td> CAS Number </Td> <Td> <Ul> <Li> 50 - 81 - 7 </Li> </Ul> </Td> </Tr> <Tr> <Td> 3D model ( JSmol ) </Td> <Td> <Ul> <Li> Interactive image </Li> <Li> Interactive image </Li> </Ul> </Td> </Tr> <Tr> <Td> ChEBI </Td> <Td> <Ul> <Li> CHEBI : 29073 </Li> </Ul> </Td> </Tr> <Tr> <Td> ChEMBL </Td> <Td> <Ul> <Li> ChEMBL196 </Li> </Ul> </Td> </Tr> <Tr> <Td> ChemSpider </Td> <Td> <Ul> <Li> 10189562 </Li> </Ul> </Td> </Tr> <Tr> <Td> EC Number </Td> <Td> 200 - 066 - 2 </Td> </Tr> <Tr> <Td> IUPHAR / BPS </Td> <Td> <Ul> <Li> 4781 </Li> </Ul> </Td> </Tr> <Tr> <Td> KEGG </Td> <Td> <Ul> <Li> D00018 </Li> </Ul> </Td> </Tr> <Tr> <Td> Pu

In [None]:
# Build one classifier input per candidate paragraph of the selected question,
# so that answers[k] / output[k] describe paragraphs[k]. Predicting on a
# `text_to_classify` variable left over from another cell yields predictions
# that are not aligned with `paragraphs`, and the results cell then fails
# with an IndexError.
text_to_classify = [
    question + " [SEP] " + remove_html_tags(candidate) for candidate in paragraphs
]
answers, output = model.predict(text_to_classify)

  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Predicted labels returned by model.predict.
answers

array([0, 0, 0, ..., 1, 1, 1])

In [None]:
# Raw model outputs returned by model.predict.
output

array([[ 2.16406250e+000, -2.40625000e+000],
       [ 7.91992188e-001, -1.22558594e+000],
       [ 3.93798828e-001, -8.27148438e-001],
       ...,
       [ 5.04992300e-310,  6.61362428e-310],
       [ 5.04992300e-310,  6.61362428e-310],
       [ 5.04992300e-310,  6.61362428e-310]])

#### Finally some results

In [None]:
print("Question was: {}".format(question))
print("---")
print("Predicted long answer is:")
# `answers` must hold exactly one label per candidate paragraph; otherwise the
# positions below would index past the end of `paragraphs` (IndexError).
predicted_labels = np.asarray(answers)
assert len(predicted_labels) == len(paragraphs), (
    "answers ({}) and paragraphs ({}) are misaligned: re-run the prediction "
    "for the currently selected question".format(len(predicted_labels), len(paragraphs))
)
predicted_positions = np.where(predicted_labels == 1)[0]
if len(predicted_positions) == 0:
    print("(no candidate paragraph was classified as the long answer)")
for answer_idx in predicted_positions:
  display(HTML(paragraphs[answer_idx]))

Question was: where was the first sample of ascorbic acid isolated from
---
Predicted long answer is:


IndexError: list index out of range

---
### Now fine-tuning a pretrained BERT model for question answering (short answers)

In [None]:
def line_to_json(line):
    """Decode one line of a JSONL file.

    Args:
        line: String containing a single JSON document.

    Returns:
        The decoded Python object.
    """
    decoded = json.loads(line)
    return decoded

def get_short_answer(annotations, long_start, long_end):
    """Return the first short answer's token span relative to the long answer.

    Args:
        annotations: Dict with a ``short_answers`` list; each entry carries
            ``start_token`` and ``end_token``.
        long_start: Start token of the long answer; subtracted from both
            short-answer bounds.
        long_end: End token of the long answer (accepted but not used).

    Returns:
        ``(short_start, short_end)`` offsets relative to ``long_start`` for
        the first short answer, or ``(0, 0)`` when there is no short answer.
    """
    short_answers = annotations['short_answers']
    if len(short_answers) == 0:
        return 0, 0

    # Only the first annotated short answer is considered.
    first_answer = short_answers[0]
    relative_start = first_answer['start_token'] - long_start
    relative_end = first_answer['end_token'] - long_start
    return relative_start, relative_end


def form_short_data_row(question, text, long_start, long_end, short_start, short_end):
    """Assemble one short-answer record.

    Args:
        question: The question text.
        text: List of document tokens.
        long_start: First token of the long answer within ``text``.
        long_end: Token index one past the end of the long answer.
        short_start: Start of the short answer, counted in tokens from the
            beginning of the long answer.
        short_end: End of the short answer (exclusive), counted the same way.

    Returns:
        Dict with the keys ``question``, ``long_answer``, ``short_answer``,
        ``short_start`` and ``short_end``.
    """
    long_answer = ' '.join(text[long_start:long_end])
    # The short answer is cut out of the re-split long answer text.
    long_answer_tokens = long_answer.split(' ')
    short_answer = ' '.join(long_answer_tokens[short_start:short_end])

    return {
        'question': question,
        'long_answer': long_answer,
        'short_answer': short_answer,
        'short_start': short_start,
        'short_end': short_end
    }


def load_short_data(file_path, questions_start, questions_end):
    """Build the short-answer table for a window of lines in a JSONL file.

    For every line in ``[questions_start, questions_end)`` the long-answer
    candidates are scanned; each candidate that get_long_candidate labels as
    the correct long answer contributes one row built by form_short_data_row.

    Args:
        file_path: Path of the JSONL file to read.
        questions_start: Index of the first line to use (inclusive).
        questions_end: Index one past the last line to use (exclusive).

    Returns:
        A DataFrame built from the collected row dicts.
    """
    collected = []
    with open(file_path) as file:
        progress = tqdm(range(questions_start, questions_end))
        selected_lines = islice(file, questions_start, questions_end)
        for _, raw_line in zip(progress, selected_lines):
            record = line_to_json(raw_line)
            question, text, annotations = get_question_and_document(record)
            for position, candidate in enumerate(record['long_answer_candidates']):
                label, long_start, long_end = get_long_candidate(position, annotations, candidate)
                if not label:
                    continue
                short_start, short_end = get_short_answer(annotations, long_start, long_end)
                collected.append(
                    form_short_data_row(question, text, long_start, long_end, short_start, short_end)
                )
    return pd.DataFrame(collected)

In [None]:
# Relative to the working directory, like TEST_PATH and the other file reads
# in this notebook: the archive is unzipped into the working directory, so an
# absolute "/content/..." path only resolves on Colab.
TRAIN_PATH = "./simplified-nq-train.jsonl"
TRAINING_SIZE, VALIDATION_SIZE = 1000, 1000

# Consecutive, non-overlapping windows of the training file: the first
# TRAINING_SIZE lines for training, the next VALIDATION_SIZE for validation.
train_short_df = load_short_data(TRAIN_PATH, 0, TRAINING_SIZE)
val_short_df = load_short_data(TRAIN_PATH, TRAINING_SIZE, TRAINING_SIZE+VALIDATION_SIZE)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
def reformat_qa_data(data):
  """Convert the short-answer table into SQuAD-style examples.

  Args:
    data: DataFrame with the columns ``question``, ``long_answer``,
      ``short_answer`` and ``short_end``. A ``short_end`` of 0 marks a row
      without a short answer. The frame is not modified.

  Returns:
    A list with one dict per kept row, shaped as
    ``{"context": str, "qas": [{"id", "is_impossible", "question",
    "answers"}]}``. ``id`` is the row's index label as a string. ``answers``
    is empty for rows without a short answer and otherwise holds one
    ``{"text", "answer_start"}`` dict, where ``answer_start`` is the character
    offset of the answer inside the context. Rows whose short answer cannot be
    located in the cleaned context are skipped, because they would otherwise
    carry an ``answer_start`` of -1.
  """
  qa_examples = []
  for idx, row in data.iterrows():
    context = remove_html(row["long_answer"])
    has_short_answer = row["short_end"] != 0

    answers = []
    if has_short_answer:
      # Clean the answer the same way as the context, so that markup inside
      # the answer span does not prevent the lookup from matching.
      answer_text = remove_html(row["short_answer"])
      answer_start = context.find(answer_text) if answer_text else -1
      if answer_start < 0:
        continue
      answers.append({"text": answer_text, "answer_start": answer_start})

    qa_examples.append(
        {
            "context": context,
            "qas": [
                {
                    "id": str(idx),
                    # Plain Python bool rather than a numpy bool.
                    "is_impossible": not has_short_answer,
                    "question": row["question"],
                    "answers": answers,
                }
            ],
        }
    )

  return qa_examples

In [None]:
# Replace the short-answer DataFrames by their list-of-dicts QA form. The
# names are reused, so the isinstance guards keep this cell re-runnable:
# without them a second run would pass the already converted lists back into
# reformat_qa_data and fail.
if isinstance(train_short_df, pd.DataFrame):
    train_short_df = reformat_qa_data(train_short_df)
if isinstance(val_short_df, pd.DataFrame):
    val_short_df = reformat_qa_data(val_short_df)

In [None]:
# Settings handed to the question-answering model.
model_args = dict(
    num_train_epochs=4,
    train_batch_size=16,
    n_best_size=1,
    # To overwrite classification model saved in the output directory
    overwrite_output_dir=True,
)

In [None]:
# Build a BERT question-answering model from the "bert-base-cased" checkpoint,
# configured with `model_args`. The loading message reports the QA head
# ('qa_outputs') as newly initialised, so the model has to be fine-tuned
# before its predictions mean anything.
# NOTE(review): use_cuda=True presumably requires a CUDA-capable GPU — confirm
# the runtime provides one.
model_QA = QuestionAnsweringModel(
    "bert",
    "bert-base-cased",
    args=model_args,
    use_cuda=True
)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune the QA model on the reformatted training examples.
model_QA.train_model(train_short_df)

convert squad examples to features: 100%|██████████| 516/516 [00:07<00:00, 67.77it/s]
add example index and unique id: 100%|██████████| 516/516 [00:00<00:00, 405748.19it/s]


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/45 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/45 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/45 [00:00<?, ?it/s]

Running Epoch 4 of 4:   0%|          | 0/45 [00:00<?, ?it/s]

(180, 0.019081601617775352)

In [None]:
# Evaluate the QA model on the reformatted validation examples. `result` is
# the summary dict displayed in the next cell; `texts` is the second return
# value (presumably the compared answer texts — confirm).
result, texts = model_QA.eval_model(val_short_df)

In [None]:
# Show the QA evaluation summary.
result

{'correct': 134,
 'similar': 223,
 'incorrect': 165,
 'eval_loss': -9.252604166666666}

In [None]:
# Pick one loaded test question at random. The bounds cover every row of
# test_df: the previous randint(1, 10) could never select row 0 and silently
# assumed the frame holds exactly 10 rows.
IDX = np.random.randint(0, len(test_df))
question, paragraphs = test_df.iloc[IDX]['question'], test_df.iloc[IDX]['text']

In [None]:
# Show the selected question followed by its candidate paragraphs rendered as HTML.
selected_row = test_df.iloc[IDX]
question, paragraphs = selected_row.question, selected_row.text
print("Question:", "---", question, "Article:", "---", sep="\n")
display(HTML(" ".join(paragraphs)))
for _ in range(2):
    print("xxxxxx")

Question:
---
when did they finish building the sydney opera house
Article:
---


xxxxxx
xxxxxx


In [None]:
# One classifier input per candidate paragraph: the question, the " [SEP] "
# separator and the candidate passed through remove_html.
candidate_inputs = [
    " [SEP] ".join((question, remove_html(candidate))) for candidate in paragraphs
]
answers, output = model.predict(candidate_inputs)

  self.pid = os.fork()


0it [00:00, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Make predictions with the question-answering model
# `answers` only holds hard 0/1 labels, so answers.argmax() returned the first
# paragraph labelled 1 (or paragraph 0 when none was) rather than the most
# likely one. Rank the candidates by the classifier's raw score for label 1
# instead (column 1 of `output`, which has one row per input). Reading from
# `output` also keeps this cell re-runnable after `answers` is overwritten below.
candidate_scores = np.asarray(output)[:len(paragraphs), 1]
best_candidate_idx = int(candidate_scores.argmax())

to_predict = [
    {
        "context": remove_html(paragraphs[best_candidate_idx]),
        "qas": [
            {
                "question": question,
                "id": "0",
            }
        ],
    }
]

answers, probabilities = model_QA.predict(to_predict)

  self.pid = os.fork()
convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 306.29it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 10565.00it/s]


Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Show the QA model's predicted answer(s) together with their probabilities.
answers, probabilities

([{'id': '0', 'answer': ['The Sydney Opera House']}],
 [{'id': '0', 'probability': [0.9999621423877982]}])