In [73]:
import pandas as pd
import json
import os
import torch
from torch.utils.data import DataLoader,Dataset
from transformers import AutoModel,AutoTokenizer


In [74]:


# Set the directory path
data_dir = "data/onto5/"

# Find all train*.json files in the specified directory
json_files = glob.glob(os.path.join(data_dir, "train*.json"))

# Initialize an empty list to store dataframes
dfs = []

# Process each file
for file_path in json_files:
    # Initialize a list to store records from this file
    records = []
    
    # Read the JSONL file
    with open(file_path, 'r') as file:
        for line in file:
            if line.strip():  # Skip empty lines
                # Parse each line as a JSON object
                record = json.loads(line)
                records.append(record)
    
    # Convert records to a dataframe
    if records:
        df = pd.DataFrame(records)
        dfs.append(df)
        print(f"Processed {file_path} with {len(records)} records")

# Concatenate all dataframes
if dfs:
    train_df = pd.concat(dfs, ignore_index=True)
    print(f"Combined DataFrame with {len(train_df)} rows")
    print(train_df.head())
else:
    print("No data found in the JSON files")

Processed data/onto5/train00.json with 15000 records
Processed data/onto5/train01.json with 15000 records
Processed data/onto5/train02.json with 15000 records
Processed data/onto5/train03.json with 14924 records
Combined DataFrame with 59924 rows
                                                tags  \
0                        [0, 0, 0, 0, 0, 0, 0, 0, 0]   
1   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]   
2                        [0, 0, 0, 0, 0, 0, 0, 0, 0]   
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   

                                              tokens  
0  [People, start, their, own, businesses, for, m...  
1  [But, a, chance, to, fill, out, sales, -, tax,...  
2  [Red, tape, is, the, bugaboo, of, small, busin...  
3  [Ironically, ,, the, person, who, wants, to, r...  
4  [Yet, every, business, owner, has, to, face, t...  


In [75]:
def get_sentence(x):
    return " ".join(x)

In [76]:
train_df["sentence"] = train_df["tokens"].apply(get_sentence)
train_df.shape


(59924, 3)

In [77]:
train_df.head()


Unnamed: 0,tags,tokens,sentence
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[People, start, their, own, businesses, for, m...",People start their own businesses for many rea...
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[But, a, chance, to, fill, out, sales, -, tax,...",But a chance to fill out sales - tax records i...
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[Red, tape, is, the, bugaboo, of, small, busin...",Red tape is the bugaboo of small business .
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Ironically, ,, the, person, who, wants, to, r...","Ironically , the person who wants to run his o..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Yet, every, business, owner, has, to, face, t...",Yet every business owner has to face the mound...


In [78]:
with open("data/onto5/label.json","r") as file:
    tag_to_idx = json.load(file)
    
idx_to_idx = {j:i for i,j in tag_to_idx.items()}

In [79]:
print(tag_to_idx)
print(idx_to_idx)

{'O': 0, 'B-CARDINAL': 1, 'B-DATE': 2, 'I-DATE': 3, 'B-PERSON': 4, 'I-PERSON': 5, 'B-NORP': 6, 'B-GPE': 7, 'I-GPE': 8, 'B-LAW': 9, 'I-LAW': 10, 'B-ORG': 11, 'I-ORG': 12, 'B-PERCENT': 13, 'I-PERCENT': 14, 'B-ORDINAL': 15, 'B-MONEY': 16, 'I-MONEY': 17, 'B-WORK_OF_ART': 18, 'I-WORK_OF_ART': 19, 'B-FAC': 20, 'B-TIME': 21, 'I-CARDINAL': 22, 'B-LOC': 23, 'B-QUANTITY': 24, 'I-QUANTITY': 25, 'I-NORP': 26, 'I-LOC': 27, 'B-PRODUCT': 28, 'I-TIME': 29, 'B-EVENT': 30, 'I-EVENT': 31, 'I-FAC': 32, 'B-LANGUAGE': 33, 'I-PRODUCT': 34, 'I-ORDINAL': 35, 'I-LANGUAGE': 36}
{0: 'O', 1: 'B-CARDINAL', 2: 'B-DATE', 3: 'I-DATE', 4: 'B-PERSON', 5: 'I-PERSON', 6: 'B-NORP', 7: 'B-GPE', 8: 'I-GPE', 9: 'B-LAW', 10: 'I-LAW', 11: 'B-ORG', 12: 'I-ORG', 13: 'B-PERCENT', 14: 'I-PERCENT', 15: 'B-ORDINAL', 16: 'B-MONEY', 17: 'I-MONEY', 18: 'B-WORK_OF_ART', 19: 'I-WORK_OF_ART', 20: 'B-FAC', 21: 'B-TIME', 22: 'I-CARDINAL', 23: 'B-LOC', 24: 'B-QUANTITY', 25: 'I-QUANTITY', 26: 'I-NORP', 27: 'I-LOC', 28: 'B-PRODUCT', 29: 'I-TIME

In [80]:
# Read the JSONL file

test_data = []
with open("data/onto5/test.json", 'r') as file:
    for line in file:
        if line.strip():  # Skip empty lines
            # Parse each line as a JSON object
            record = json.loads(line)
            test_data.append(record)


In [81]:
test_df = pd.DataFrame(test_data)
test_df.head()

Unnamed: 0,tags,tokens
0,"[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 7, 0, 7, 0, ...","[The, following, were, among, Friday, 's, offe..."
1,"[11, 12, 12, 12]","[Dow, Chemical, Co., --]"
2,"[16, 17, 17, 0, 13, 14, 0, 0, 0, 2, 3, 3, 3, 0...","[$, 150, million, of, 8.55, %, senior, notes, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, ...","[The, issue, ,, which, is, puttable, back, to,..."
4,"[0, 0, 0, 0, 0, 1, 0, 11, 12, 12, 12, 12, 0, 0...","[Rated, single, -, A, -, 1, by, Moody, 's, Inv..."


In [82]:
test_df["sentence"] = test_df["tokens"].apply(get_sentence)
test_df.shape


(8262, 3)

In [83]:
len(test_df)


8262

In [109]:
test_df.head(2).to_clipboard()



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [96]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
tokenizer("I love Indian")


{'input_ids': [101, 1045, 2293, 2796, 102], 'attention_mask': [1, 1, 1, 1, 1]}

In [102]:
class CustomDataSet(Dataset):
    def __init__(self,df,tokenizer,max_length=512):
        self.tokenizer = tokenizer
        self.max_length=max_length
        self.data = df

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        row = self.data.iloc[idx]
        input_tokenize = self.tokenizer(row['sentence'],
                                        truncation=True,
                                        padding="max_length",
            max_length=self.max_length,
            return_tensors="pt")
        input_ids = torch.tensor(input_tokenize['input_ids']).squeeze()
        attention_mask = input_tokenize['attention_mask'].squeeze()
        label = row["tags"].copy()
        label.insert(0, 100) 
        label.append(-100)
        # Pad or truncate labels to match input_ids length
        if len(label) < self.max_length:
            label.extend([-100] * (self.max_length - len(label)))
        elif len(label) > self.max_length:
            label = label[:self.max_length]
            
        label = torch.tensor(label)
        return input_ids,attention_mask,label

In [105]:
train_dataSet = CustomDataSet(train_df,tokenizer,max_length=300)

In [106]:
print(len(train_dataSet.__getitem__(1)[0]))
print(len(train_dataSet.__getitem__(1)[1]))
print(len(train_dataSet.__getitem__(1)[2]))



300
300
300


  input_ids = torch.tensor(input_tokenize['input_ids']).squeeze()


In [107]:
print(train_dataSet.__getitem__(1))


(tensor([ 101, 2021, 1037, 3382, 2000, 6039, 2041, 4341, 1011, 4171, 2636, 2003,
        6524, 2028, 1997, 2068, 1012,  102,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,  

  input_ids = torch.tensor(input_tokenize['input_ids']).squeeze()
