In [1]:
import pandas as pd
import sys
import os
import numpy as np
import re 
from re import split
import torch
from transformers import AutoTokenizer
import csv

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_json('./datafinder/datafinder_dataset/train_data.jsonl', lines=True)

df_dataset_information = pd.read_json('./datafinder/datafinder_dataset/dataset_search_collection.jsonl', lines=True)
df.drop_duplicates(subset=['paper_id'], inplace=True)

In [3]:
pattern = (
    r'\\\(.*?\\\)|'  # LaTeX inline math (\(...\))
    r'\r\n|'  # Line breaks
    r'\*\*|'  # Asterisks
    r'\$.*?\$|'  # LaTeX inline math ($...$)
    r'\\\[.*?\\\]|'  # LaTeX display math (\[...\])
    r'https?://\S+|'  # URLs starting with http:// or https://
    r'www\.\S+|'  # URLs starting with www.
    r'ftp://\S+|'  # URLs starting with ftp://
    r'\\begin\{equation\}.*?\\end\{equation\}|'  # LaTeX display math (\begin{equation}...\end{equation})
    r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])?(?:\{[^}]*\})?|' # LaTeX commands
    r'\\langle.*?\\rangle|'  # LaTeX angle brackets (\langle...\rangle)
    r'https?://[^\s]+(?:[\s\.,]|$)|'  # Match http or https URLs, followed by space, dot, or end of string
    r'www\.[^\s]+(?:[\s\.,]|$)'  # Match URLs starting with www., followed by space, dot, or end of string
    r'\[Image Source\: \[.*?\]|'
    r'\(Image Source\: \[|'
    r'\[Image Source\: \[|'
    r'\(Source\: \[.*?\]|'
    r'Source\: \[|'  
    r'\(\s*/paper/[^)]+\s*\)'
    
)  

df['abstract'] = df['abstract'].apply(lambda x: re.sub(pattern, '', x))

# df_dataset_information['contents'] = df_dataset_information['contents'].apply(lambda x: re.sub(pattern, '', x))

df_graph = df[['paper_id', 'outbound_citations', 'positives', 'negatives']]

df_paper = df[['title', 'abstract']]

In [4]:
pprIdx = {paper_id: idx for idx, paper_id in enumerate(df['paper_id'])}

print(len(pprIdx))

17397


In [None]:
dataset_to_label = {} 
label_counter = 0

for datasets in df_graph['positives']:
    for dataset in datasets:
        if dataset not in dataset_to_label:
            dataset_to_label[dataset] = label_counter
            label_counter += 1

print(dataset_to_label)
print(label_counter)

{'CIFAR-10': 0, 'ImageNet': 1, 'CAT2000': 2, 'Hopkins155': 3, 'VRD': 4, 'Middlebury': 5, 'KITTI': 6, 'LAMA': 7, 'COCO': 8, 'Jester': 9, 'JFLEG': 10, 'IAM': 11, 'Set5': 12, 'AFW': 13, 'FRGC': 14, 'WikiBio': 15, 'Cora': 16, 'BSDS500': 17, 'MovieLens': 18, 'COMA': 19, 'UCF101': 20, 'CARLA': 21, 'Birdsnap': 22, 'PACS': 23, 'VCR': 24, 'ARC': 25, 'MultiRC': 26, 'AudioSet': 27, 'Flickr30k': 28, 'CelebA': 29, 'MuJoCo': 30, 'ReferItGame': 31, 'WebText': 32, 'WikiText-103': 33, 'DeepFashion': 34, 'DRCD': 35, 'NewsQA': 36, 'SQuAD': 37, 'WN18': 38, 'AFLW': 39, 'Helen': 40, 'SumMe': 41, 'Django': 42, 'CompCars': 43, 'ETH': 44, 'Caltech-101': 45, 'LFPW': 46, 'Cityscapes': 47, 'RaFD': 48, 'ECSSD': 49, 'SimpleQuestions': 50, 'QNLI': 51, 'MRPC': 52, 'RACE': 53, 'GLUE': 54, 'SNLI': 55, 'DRIVE': 56, 'ShapeNet': 57, 'MultiNLI': 58, 'DAQUAR': 59, 'IEMOCAP': 60, 'VIPeR': 61, 'STARE': 62, 'ShanghaiTech': 63, 'Pix3D': 64, 'CAD-120': 65, 'HandNet': 66, 'CoQA': 67, 'HMDB51': 68, 'FaceForensics': 69, 'DAVIS': 70

In [6]:
dataset_idcontent_all = []

for _, row in df_dataset_information.iterrows():
    dataset_name = row['id']
    dataset_content = row['contents']
    if dataset_name in dataset_to_label.keys():
        dataset_target_id = dataset_to_label[dataset_name]
        dataset_idcontent_all.append((dataset_target_id, dataset_name, dataset_content))

df_dataset_idcontent = pd.DataFrame(dataset_idcontent_all, columns=['dataset_target_id', 'dataset_name', 'dataset_content'])

df_dataset_idcontent.sort_values(by='dataset_target_id', inplace=True, ignore_index=True)
df_dataset_idcontent.drop_duplicates(subset=['dataset_target_id'], inplace=True, ignore_index=True)
print(df_dataset_idcontent)

     dataset_target_id dataset_name  \
0                    0     CIFAR-10   
1                    1     ImageNet   
2                    2      CAT2000   
3                    3   Hopkins155   
4                    4          VRD   
..                 ...          ...   
456                456     WildDash   
457                457    Video2GIF   
458                458     FigureQA   
459                459      PHM2017   
460                460    ContactDB   

                                       dataset_content  
0    The **CIFAR-10** dataset (Canadian Institute f...  
1    The **ImageNet** dataset contains 14,197,122 a...  
2    Includes 4000 images; 200 from each of 20 cate...  
3    The Hopkins 155 dataset consists of 156 video ...  
4    The Visual Relationship Dataset (**VRD**) cont...  
..                                                 ...  
456  WildDash is a benchmark evaluation method is p...  
457  The **Video2GIF** dataset contains over 100,00...  
458  FigureQA is a

In [7]:
pattern_dataset = (
    r'\\\(.*?\\\)|'  # LaTeX inline math (\(...\))
    r'\r\n|'  # Line breaks
    r'\*\*|'  # Asterisks
    r'\*|'  # Asterisks
    r'\$.*?\$|'  # LaTeX inline math ($...$)
    r'\\\[.*?\\\]|'  # LaTeX display math (\[...\])
    r'\\[a-zA-Z]+\*?(?:\[[^\]]*\])?(?:\{[^}]*\})?|' # LaTeX commands
    r'\n|'
    r'\n+|'

    r'\(https?://\S+\)|'  # URLs starting with http:// or https://
    r'(Source:|Image Source:|Image:|NOTE: ).*'
    
)  

def clean_text(text):
    cleaned = re.sub(pattern_dataset, '', text)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = cleaned.strip()
    return cleaned

df_dataset_idcontent['dataset_content'] = df_dataset_idcontent['dataset_content'].apply(clean_text)

In [None]:
class Textual_Feature:
    def __init__(self, checkpoint= 'allenai/scibert_scivocab_uncased'):
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def encode(self, paper_text):
        tok_text = self.tokenizer(paper_text,
                             truncation = True,
                             max_length = 512, 
                             padding = 'max_length',
                             return_tensors='pt')
        
        if 'token_type_ids' in tok_text:
            del tok_text['token_type_ids']
        return tok_text['input_ids'], tok_text['attention_mask']

In [9]:
tqdm.pandas()
textual_feature = Textual_Feature()
df_paper['abstract'] =  df_paper['abstract'].str.lower()
df_paper['title'] =  df_paper['title'].str.lower()
paper_texts = df_paper['title'] + ' ' + df_paper['abstract']

dataset_text = df_dataset_idcontent['dataset_content']


token_dataset = dataset_text.progress_apply(lambda text: textual_feature.encode(text))
token_paper = paper_texts.progress_apply(lambda text: textual_feature.encode(text))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_paper['abstract'] =  df_paper['abstract'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_paper['title'] =  df_paper['title'].str.lower()
100%|██████████| 461/461 [00:00<00:00, 4236.31it/s]
100%|██████████| 17397/17397 [00:06<00:00, 2680.40it/s]


In [10]:
'''for dataset'''
input_ids_list_dataset = [result[0] for result in token_dataset]
attention_mask_list_dataset = [result[1] for result in token_dataset]

input_ids_tensor_dataset = torch.cat(input_ids_list_dataset, dim=0)
attention_mask_tensor_dataset = torch.cat(attention_mask_list_dataset, dim=0)

'''for paper'''

input_ids_list_paper = [result[0] for result in token_paper]
attention_mask_list_paper = [result[1] for result in token_paper]

input_ids_tensor_paper = torch.cat(input_ids_list_paper, dim=0)
attention_mask_tensor_paper = torch.cat(attention_mask_list_paper, dim=0)

In [11]:
file = './datafinder/train_structural.txt'
output_csv = './datafinder/train.csv'

with open(file, 'r') as f:
    data = []
    for line in f:
        items = split(' ', line.strip())
        paper_id = int(items[0])
        dataset_id = int(items[1])
        weight = items[2]
        input_ids_p = input_ids_tensor_paper[paper_id]
        attention_mask_p = attention_mask_tensor_paper[paper_id]
        input_ids_d = input_ids_tensor_dataset[dataset_id]
        attention_mask_d = attention_mask_tensor_dataset[dataset_id]

        row = {
            'paper_id': paper_id,
            'dataset_id': dataset_id,
            'weight': float(weight),
            'input_ids_p': f'input_ids_p: {input_ids_p}',
            'attention_mask_p': f'attention_mask_p: {attention_mask_p}',
            'input_ids_d': f'input_ids_d: {input_ids_d}',
            'attention_mask_d': f'attention_mask_d: {attention_mask_d}'
        }

        data.append(row)

with open(output_csv, 'w', newline='') as csvfile:
    fieldnames = ['paper_id', 'dataset_id', 'weight', 'input_ids_p', 'attention_mask_p', 'input_ids_d', 'attention_mask_d']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(data)
        

In [12]:
file = './datafinder/test_structural.txt'
output_csv = './datafinder/test.csv'

with open(file, 'r') as f:
    data = []
    for line in f:
        items = split(' ', line.strip())
        paper_id = int(items[0])
        dataset_id = int(items[1])
        weight = items[2]
        input_ids_p = input_ids_tensor_paper[paper_id]
        attention_mask_p = attention_mask_tensor_paper[paper_id]
        input_ids_d = input_ids_tensor_dataset[dataset_id]
        attention_mask_d = attention_mask_tensor_dataset[dataset_id]

        row = {
            'paper_id': paper_id,
            'dataset_id': dataset_id,
            'weight': float(weight),
            'input_ids_p': f'input_ids_p: {input_ids_p}',
            'attention_mask_p': f'attention_mask_p: {attention_mask_p}',
            'input_ids_d': f'input_ids_d: {input_ids_d}',
            'attention_mask_d': f'attention_mask_d: {attention_mask_d}'
        }

        data.append(row)

with open(output_csv, 'w', newline='') as csvfile:
    fieldnames = ['paper_id', 'dataset_id', 'weight', 'input_ids_p', 'attention_mask_p', 'input_ids_d', 'attention_mask_d']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(data)