In [1]:
# !pip install --quiet  datasets #to access squad dataset
# !pip install --quiet pyarrow   #to deal with parquet files for saving dataset if required
# !pip install --quiet  tqdm     #for progress bars
# !pip install --quiet transformers # for t5 model
# !pip install --quiet tokenizers  #tokenizers from HuggingFace
# !pip install --quiet sentencepiece #subword tokenizer used by T5
# !pip install --quiet pytorch-lightning # pytorch wrapper 
# !pip install --quiet torchtext # text utilities

# Fetching Datasets

In [2]:
#imports
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from pprint import pprint
import copy
import numpy as np
from collections import defaultdict
import ipdb

# pd.options.display.max_rows , pd.options.display.max_columns  = 100,100  

In [3]:
device  = 'cuda' if torch.cuda.is_available() else "cpu"
device

'cpu'

In [4]:
# path_to_source = f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_full_5_10_10000_clone_latex_compare/10Neg10000unk/twofoldwithunk"
path_to_csv = f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_ibm_150_5_10_10000/10Neg10000unk/twofoldwithunk"

fold1 = "fold1"
train_f1_pd = pd.read_csv(f"{path_to_csv}/{fold1}/train.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])
dev_f1_pd = pd.read_csv(f"{path_to_csv}/{fold1}/dev.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

fold2 = "fold2"
train_f2_pd = pd.read_csv(f"{path_to_csv}/{fold2}/train.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])
dev_f2_pd = pd.read_csv(f"{path_to_csv}/{fold2}/dev.tsv", 
                    sep="\t", names=["label", "title", "TDM", "Context"])

In [5]:
# with open(f'/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_latex_text/arxiv_txt/1805.04554v4.txt', 'r') as file:
#     # Read the file
#     data = file.read()
    
# len(data)

In [6]:
# data[:1000]

In [7]:
# no_leaderboard_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/T5-Leaderboard-QA/data_proccess/arxiv_no_leaderboard_links_pdf_short/DocTAET_full.tsv", 
#                     sep="\t", names=["title", "Context"])

no_leaderboard_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/T5-Leaderboard-QA/data_proccess/arxiv_no_leaderboard_links_pdf_short/DocTAET_150.tsv", 
                    sep="\t", names=["title", "Context"])

no_leaderboard_pd.describe()

Unnamed: 0,title,Context
count,4369,4369
unique,4369,4365
top,0912.4438.pdf,! !
freq,1,2


In [8]:
resultsAnnotation_pd = pd.read_csv(f"/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/annotations_final/resultsAnnotation.tsv",
                                   sep="\t", names=["Title", "TDMSs"])
resultsAnnotation_pd = resultsAnnotation_pd.fillna("NAN")
resultsAnnotation_pd

Unnamed: 0,Title,TDMSs
0,1704.03549v4.pdf,Optical Character Recognition#FSNS - Test#Sequ...
1,1712.05404.pdf,Optical Character Recognition#FSNS - Test#Sequ...
2,1702.03970v1.pdf,Optical Character Recognition#FSNS - Test#Sequ...
3,2104.02324v1.pdf,"Active Object Detection#COCO#AP#(7.3, 13.8, 16..."
4,2008.12995v3.pdf,Handwriting Recognition#BanglaLekha Isolated D...
...,...,...
5724,2104.01378v1.pdf,Phone-level pronunciation scoring#speechocean7...
5725,2104.10283v1.pdf,Graph Question Answering#GQA#Accuracy#96.30
5726,2104.11980v1.pdf,Trajectory Modeling#NBA SportVU#1x1 NLL#0.472
5727,1704.00077v1.pdf,Video Segmentation#SegTrack v2#Accuracy#86.86


In [9]:
"""
This will take care of papers with more than one learderboard 
"""
records = resultsAnnotation_pd.to_dict("records")

title_to_tdms_dict = defaultdict(
    lambda : 
        list()
    )

for i, row in tqdm(enumerate(records), total = len(records)):
    if row['TDMSs'] == 'NAN':
        continue

    for tdms in row['TDMSs'].split("$"):
        if len(tdms.split("#")) != 4:
            # ipdb.set_trace()
            continue 
        t, d, m, s = tdms.split("#")
        title_to_tdms_dict[row['Title']].append(
            {
                "leaderboard": {
                    "Task": t,
                    "Dataset": d,
                    "Metric": m,
                    "Score": s,
                }
            }            
        )

 35%|███▌      | 2006/5729 [00:00<00:00, 13643.55it/s]

100%|██████████| 5729/5729 [00:00<00:00, 29172.76it/s]


In [10]:
len(title_to_tdms_dict)

5725

In [11]:
# title_to_tdms_dict

In [12]:
# Update to create data for Train and Dev 
# train_pd = train_pd
# train_pd = dev_pd

train_f1_pd.describe()

Unnamed: 0,label,title,TDM,Context
count,50143,50143,50143,50143
unique,2,3753,1851,3747
top,False,1803.00933v1.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,37530,68,932,68


In [13]:
dev_f1_pd.describe()

Unnamed: 0,label,title,TDM,Context
count,21552,21552,21552,21552
unique,2,1608,1851,1606
top,False,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,16080,68,382,68


In [14]:
# No need for negative instances, but will still have 'duplicate' for paper with more than one leaderboard
train_f1_pd = train_f1_pd[train_f1_pd.label==True]
print("train_f1_pd")
display(train_f1_pd.describe())

dev_f1_pd = dev_f1_pd[dev_f1_pd.label==True]
print("dev_f1_pd")
display(dev_f1_pd.describe())

train_f2_pd = train_f2_pd[train_f2_pd.label==True]
print("train_f2_pd")
display(train_f2_pd.describe())

dev_f2_pd = dev_f2_pd[dev_f2_pd.label==True]
print("dev_f2_pd")
display(dev_f2_pd.describe())

train_f1_pd


Unnamed: 0,label,title,TDM,Context
count,12613,12613,12613,12613
unique,1,3753,1792,3747
top,True,1803.00933v1.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,12613,58,923,58


dev_f1_pd


Unnamed: 0,label,title,TDM,Context
count,5472,5472,5472,5472
unique,1,1608,1557,1606
top,True,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,5472,58,378,58


train_f2_pd


Unnamed: 0,label,title,TDM,Context
count,12677,12677,12677,12677
unique,1,3753,1821,3749
top,True,1911.08265v2.pdf,unknown,"Mastering Atari, Go, Chess and Shogi by Planni..."
freq,12677,58,920,58


dev_f2_pd


Unnamed: 0,label,title,TDM,Context
count,5408,5408,5408,5408
unique,1,1608,1542,1608
top,True,1802.01561v3.pdf,unknown,IMPALA: Scalable Distributed Deep-RL with Impo...
freq,5408,58,381,58


In [15]:
# len(train_pd.title.unique())

In [18]:
records = train_pd.to_dict("records")
# title_to_tdms_dict = defaultdict(lambda : defaultdict(lambda : str("| ")))
title_to_content = {}

for i, row in tqdm(enumerate(records), total = len(records)):
    
    title_id = row['title'].split(".pdf")[0]
    
    try:
        with open(f'/nfs/home/kabenamualus/Research/task-dataset-metric-nli-extraction/data/pwc_latex_text/arxiv_txt/{title_id}.txt', 'r') as file:
            # Read the file
            data = file.read()
        
    except :
#         print(f"Error on file {row['title']}")
        continue 

    if row['title'] in title_to_content:
        continue 
    else:
#         title_to_content[row['title']] = row['Context']
        title_to_content[row['title']] = data

100%|██████████| 12613/12613 [00:06<00:00, 1968.32it/s]


In [19]:
no_leaderboard_pourcentage = int(len(train_pd.title.unique())*50/100)
no_leaderboard_pourcentage

1876

In [52]:
# len(no_lead_papers)

In [53]:
records = no_leaderboard_pd.to_dict("records")

# # For train only
# no_lead_papers = []

already_seen = no_lead_papers
no_lead_papers = []
for i, row in tqdm(enumerate(records), total = len(records)):
    
    if row['title'] in already_seen:
        continue 
        
    if i >= no_leaderboard_pourcentage:
        break 
    
    title_to_content[row['title']] = row['Context']
    no_lead_papers.append(row['title'])


 43%|████▎     | 1876/4369 [00:00<00:00, 41568.79it/s]


In [54]:
len(title_to_content)

1608

In [55]:
# list(title_to_latex_content.keys())[:5]

In [56]:
# title_to_latex_content['1801.01315v1.pdf'][:1000]

In [57]:
train_pd["Lenght context"] = train_pd.Context.apply(lambda x: len(x.split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [58]:
# train_pd[train_pd["Lenght context"] < 400]

In [59]:
# train_pd = train_pd[train_pd["Lenght context"] < 400]

In [60]:
train_pd.describe()

Unnamed: 0,Lenght context
count,5472.0
mean,383.148757
std,114.608898
min,31.0
25%,316.0
50%,374.0
75%,448.0
max,1750.0


In [61]:
# records = train_pd.to_dict("records")
# db_dict = defaultdict(lambda : defaultdict(lambda : str("| ")))
# TDMs_dict = defaultdict(lambda : defaultdict(lambda : set()))
# tasks = set()
# datasets = set()
# metrics = set()
# for i, row in tqdm(enumerate(records), total = len(records)):
#     if len(row['TDM'].split(";")) != 3:
#         # case of unknown TDM
#         # ipdb.set_trace()    
#         continue 
    
#     t, d, m = row['TDM'].split(";")
    
#     TDMs_dict[row['title']]['Tasks'].add(t)
#     TDMs_dict[row['title']]['Datasets'].add(d)
#     TDMs_dict[row['title']]['Metrics'].add(m)

#     db_dict[row['title']] = {
#         "TDMs": db_dict[row['title']]['TDMs']+row['TDM']+" | ",
#         "Context": row['Context']      
#         }

In [62]:
df = pd.DataFrame(columns = ["Title", "TDMSs", "Context"])

for i, title in tqdm(enumerate(title_to_content.keys()), total = len(title_to_content.keys())):
    
    if (len(title_to_content[title]) < 10):
        continue 
    
    
    if (title not in no_lead_papers) :
        if (title_to_tdms_dict[title] == []):
             continue
          
    
    # title_id = row['title'].split(".pdf")[0]    
    df = pd.concat([df, pd.DataFrame.from_records(
        [
            {
                'Title' : title, 
                'TDMSs' : title_to_tdms_dict[title] if title in title_to_tdms_dict.keys() else [
                    {
                        "leaderboard": {
                            "Task": "",
                            "Dataset": "",
                            "Metric": "",
                            "Score": "",
                        }
                    }],
                'Context' : title_to_content[title]
            }
        ])], ignore_index = True)
df.head()

100%|██████████| 1608/1608 [00:00<00:00, 2107.81it/s]


Unnamed: 0,Title,TDMSs,Context
0,1810.02575v1.pdf,[{'leaderboard': {'Task': 'Semantic Segmentati...,Dark Model Adaptation: Semantic Image Segmenta...
1,1909.00794v1.pdf,[{'leaderboard': {'Task': 'Scene Text Detectio...,Geometry Normalization Networks for Accurate S...
2,1807.10066v1.pdf,[{'leaderboard': {'Task': 'Action Recognition'...,A Better Baseline for AVA We introduce a simpl...
3,1805.04554v4.pdf,[{'leaderboard': {'Task': 'Semantic Segmentati...,ContextNet: Exploring Context and Detail for S...
4,1504.01013v4.pdf,[{'leaderboard': {'Task': 'Semantic Segmentati...,Efficient Piecewise Training of Deep Structure...


In [63]:
# df.loc[1, "TDMSs"]

In [64]:
df["Lenght context"] = df.Context.apply(lambda x: len(x.split()))
df["Lenght TDMSs"] = df.TDMSs.apply(lambda x: len(x))


df.describe()

Unnamed: 0,Lenght context,Lenght TDMSs
count,1549.0,1549.0
mean,383.714655,5.745642
std,119.065173,8.353842
min,31.0,1.0
25%,312.0,2.0
50%,380.0,3.0
75%,454.0,7.0
max,1750.0,170.0


In [65]:
df[df["Lenght TDMSs"]>200]

Unnamed: 0,Title,TDMSs,Context,Lenght context,Lenght TDMSs


In [66]:
len(df)

1549

In [67]:
df.loc[4, "TDMSs"]

[{'leaderboard': {'Task': 'Semantic Segmentation',
   'Dataset': 'Cityscapes test',
   'Metric': 'Mean IoU (class)',
   'Score': '71.6%'}},
 {'leaderboard': {'Task': 'Semantic Segmentation',
   'Dataset': 'PASCAL Context',
   'Metric': 'mIoU',
   'Score': '43.3'}}]

In [68]:
template = ['Please answer a question about this article. If the question is unanswerable, say \"unanswerable\"',
'Read this and answer the question. If the question is unanswerable, say \"unanswerable\".',
'If the question is unanswerable, say \"unanswerable\"',
'Try to answer this question if possible (otherwise reply \"unanswerable\"',
'If it is possible to answer this question, answer it for me (else, reply \"unanswerable\"',
'Answer this question, if possible (if impossible, reply \"unanswerable\"',
'Read this: What is the answer? (If it cannot be answered, return \"unanswerable\"',
'Read this: Now answer this question, if there is an answer (If it cannot be answered, return \"unanswerable\"',
'Answer based on context:',
'Answer this question based on the article:',
# ("{context}\n\n{question}", "{answer}"),
'Answer this question:',
'Read this article and answer this question',
'Based on the above article, answer a question.',
# 'Context: {context}\n\nQuestion: {question}\n\nAnswer:", "{answer}"),
]

template

['Please answer a question about this article. If the question is unanswerable, say "unanswerable"',
 'Read this and answer the question. If the question is unanswerable, say "unanswerable".',
 'If the question is unanswerable, say "unanswerable"',
 'Try to answer this question if possible (otherwise reply "unanswerable"',
 'If it is possible to answer this question, answer it for me (else, reply "unanswerable"',
 'Answer this question, if possible (if impossible, reply "unanswerable"',
 'Read this: What is the answer? (If it cannot be answered, return "unanswerable"',
 'Read this: Now answer this question, if there is an answer (If it cannot be answered, return "unanswerable"',
 'Answer based on context:',
 'Answer this question based on the article:',
 'Answer this question:',
 'Read this article and answer this question',
 'Based on the above article, answer a question.']

In [69]:
df.head(2)

Unnamed: 0,Title,TDMSs,Context,Lenght context,Lenght TDMSs
0,1810.02575v1.pdf,[{'leaderboard': {'Task': 'Semantic Segmentati...,Dark Model Adaptation: Semantic Image Segmenta...,315,1
1,1909.00794v1.pdf,[{'leaderboard': {'Task': 'Scene Text Detectio...,Geometry Normalization Networks for Accurate S...,403,6


In [70]:
def create_pandas_dataset_from_pandas(df,
                          answer_threshold=7,
                          verbose = False):

  ''' Create a Pandas Dataframe from pandas.
  Params:
        answer_threshold: Only consider those Question Answer pairs where the Answer is short.
  '''
  count_index = 0
  result_df  = pd.DataFrame(columns = ['template_question', 'answer'])   
  # q_type_1 = "Which Tasks are addressed in this article"
  # q_type_2 = "Which Datasets are addressed in this article"
  # q_type_3 = "Which Metrics are addressed in this article"
  # q_type_4 = "Which Tasks, Datasets, Metrics are addressed in this article"
  # q_type_5 = "Which Tasks, Datasets, Metrics and Scores are addressed in this article" 
  
  # q_types = [
  #   {"q": "Which Tasks are addressed in this article", "a_key": "Tasks"}, 
  #   {"q": "Which Datasets are addressed in this article", "a_key": "Datasets"}, 
  #   {"q": "Which Metrics are addressed in this article", "a_key": "Metrics"},
  #   {"q": "Which Tasks, Datasets, Metrics are addressed in this article", "a_key": "TDMs"},
  #   {"q": "Which Tasks, Datasets, Metrics and Scores are addressed in this article", "a_key": "TDMSs"}
  #   ]
  
  q_types = [
    # {"q": "What are the values for the following properties to construct a Leaderboard for the model introduced in this article: task, dataset, and metric?", "a_key": "TDMSs"},
    {"q": "What are the values for the following properties to construct a Leaderboard for the model introduced in this article: task, dataset, metric, and score?", "a_key": "TDMSs"},
    ]
  
  records = df.to_dict("records")
  # db_dict = defaultdict(lambda : list())
  for i, row in tqdm(enumerate(records), total = len(records)):        
      for q_type in q_types:
        
        # Squad_v2
        result_df.loc[count_index] = [f'{row["Context"]}\n\nPlease answer a question about this article. If the question is unanswerable, say \"unanswerable\". {q_type["q"]}'] \
          + [row[q_type["a_key"]]] 
        count_index += 1
        result_df.loc[count_index] = [f'Read this and answer the question. If the question is unanswerable, say \"unanswerable\".\n\n{row["Context"]}\n\n{q_type["q"]}'
] \
          + [row[q_type["a_key"]]] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\n{q_type["q"]} (If the question is unanswerable, say \"unanswerable\"'] \
          + [row[q_type["a_key"]]] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\nTry to answer this question if possible (otherwise reply \"unanswerable\"): {q_type["q"]}'] \
          + [row[q_type["a_key"]]] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\nIf it is possible to answer this question, answer it for me (else, reply \"unanswerable\"): {q_type["q"]}'] \
          + [row[q_type["a_key"]]] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\n\nAnswer this question, if possible (if impossible, reply \"unanswerable\"): {q_type["q"]}'] \
          + [row[q_type["a_key"]]] 
        count_index += 1
        result_df.loc[count_index] = [f'Read this: {row["Context"]}\n\n{q_type["q"]}\nWhat is the answer? (If it cannot be answered, return \"unanswerable\")'] \
          + [row[q_type["a_key"]]] 
        count_index += 1
        result_df.loc[count_index] = [f'Read this: {row["Context"]}\nNow answer this question, if there is an answer (If it cannot be answered, return \"unanswerable\"): {q_type["q"]}'] \
          + [row[q_type["a_key"]]] 
        count_index += 1
        
        # Drop
        result_df.loc[count_index] = [f'Answer based on context:\n\n{row["Context"]}\n\n{q_type["q"]}'] \
          + [row[q_type["a_key"]]] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\n\nAnswer this question based on the article: {q_type["q"]}'] \
          + [row[q_type["a_key"]]] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\n\n{q_type["q"]}'] \
          + [row[q_type["a_key"]]] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\nAnswer this question: {q_type["q"]}'] \
          + [row[q_type["a_key"]]] 
        count_index += 1
        result_df.loc[count_index] = [f'Read this article and answer this question {row["Context"]}\n{q_type["q"]}'] \
          + [row[q_type["a_key"]]] 
        count_index += 1
        result_df.loc[count_index] = [f'{row["Context"]}\n\nBased on the above article, answer a question. {q_type["q"]}'] \
          + [row[q_type["a_key"]]] 
        count_index += 1
        result_df.loc[count_index] = [f'Context: {row["Context"]}\n\nQuestion: {q_type["q"]}\n\nAnswer:'] \
          + [row[q_type["a_key"]]] 
        count_index += 1
         
            
  if verbose:
    # return (result_df,
    #         count_long,
    #         count_short)
    return (result_df)
  else:
    return result_df

In [71]:
df_train  = create_pandas_dataset_from_pandas(df) 
# df_validation = create_pandas_dataset_from_pandas(df)


  1%|          | 12/1549 [00:00<00:26, 57.84it/s]100%|██████████| 1549/1549 [00:32<00:00, 48.02it/s]


In [72]:
df_train.tail()

Unnamed: 0,template_question,answer
23230,EfficientNet: Rethinking Model Scaling for Con...,[{'leaderboard': {'Task': 'Image Classificatio...
23231,EfficientNet: Rethinking Model Scaling for Con...,[{'leaderboard': {'Task': 'Image Classificatio...
23232,Read this article and answer this question Eff...,[{'leaderboard': {'Task': 'Image Classificatio...
23233,EfficientNet: Rethinking Model Scaling for Con...,[{'leaderboard': {'Task': 'Image Classificatio...
23234,Context: EfficientNet: Rethinking Model Scalin...,[{'leaderboard': {'Task': 'Image Classificatio...


In [73]:
len(df_train)

23235

In [74]:
df_train.describe()

# df_validation.describe()

Unnamed: 0,template_question,answer
count,23235,23235
unique,23220,1548
top,Read this article and answer this question PAN...,[{'leaderboard': {'Task': 'Monocular Depth Est...
freq,2,30


In [75]:
print(15*3640)

54600


In [76]:
len(df_train)

23235

In [77]:
# len(df_validation)

In [78]:
# Saving data for future use

# v2 include no_leaderboard 
# df_train.to_parquet('data/train_tdms_f1_v2_short.parquet')
# df_train.to_parquet('data/dev_tdms_f1_v2_short.parquet')

# df_train.to_parquet('data/train_tdms_f2_v2_short.parquet')
# df_train.to_parquet('data/dev_tdms_f2_v2_short.parquet')



df_train.to_parquet('data/train_tdm_f1_v2_short.parquet')
# df_train.to_parquet('data/dev_tdm_f1_v2_short.parquet')

# df_train.to_parquet('data/train_tdm_f2_v2_short.parquet')
# df_train.to_parquet('data/dev_tdm_f2_v2_short.parquet')



# ==================================================================================================== #


# mix latex (when available) and docteat content 
# df_train.to_parquet('train_tdm_f1_v1_long.parquet')
# df_train.to_parquet('dev_tdm_f1_v1_long.parquet')