### Importing Libraries

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score
import random
import pickle

  from .autonotebook import tqdm as notebook_tqdm


### Load the Data

In [17]:
def load_data(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

# train_data = load_data('/Users/nikhilanu/Desktop/text_mining/final_project/data/train/train0.json')
# val_data = load_data('/Users/nikhilanu/Desktop/text_mining/final_project/data/dev/dev0.json')

In [32]:
def select_stratified_subset(data, subset_size):
    # Determine the distribution of relations
    relation_counts = {}
    for item in data:
        relation = item['relation']
        if relation not in relation_counts:
            relation_counts[relation] = []
        relation_counts[relation].append(item)

    # Calculate the number of items to sample per relation
    total_items = sum(len(items) for items in relation_counts.values())
    subset_counts = {relation: int(len(items) / total_items * subset_size) for relation, items in relation_counts.items()}

    # Ensure at least one sample per relation if subset size allows
    for relation in subset_counts:
        if subset_counts[relation] == 0 and subset_size > 0:
            subset_counts[relation] = 1
            subset_size -= 1  # Adjust subset_size for added samples

    # Sample items from each relation
    subset = []
    for relation, items in relation_counts.items():
        if subset_counts[relation] > 0:
            sampled_items = random.sample(items, subset_counts[relation])
            subset.extend(sampled_items)

    return subset


# Define your subset sizes
train_subset_size = 4000  # Adjust based on your dataset size and requirements
dev_subset_size = 2000
test_subset_size = 2000

train_data = load_data('/Users/nikhilanu/Desktop/text_mining/final_project/data/train.json')
dev_data = load_data('/Users/nikhilanu/Desktop/text_mining/final_project/data/dev.json')
test_data = load_data('/Users/nikhilanu/Desktop/text_mining/final_project/data/test.json')

# Sample a stratified subset from each dataset
stratified_train_data = select_stratified_subset(train_data, train_subset_size)
stratified_dev_data = select_stratified_subset(dev_data, dev_subset_size)
stratified_test_data = select_stratified_subset(test_data, test_subset_size)

# print(len(stratified_dev_data))
train_data = stratified_train_data
dev_data = stratified_dev_data
# test_data = stratified_test_data

In [33]:
for item in test_data:
    print(item['relation'])

no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
per:title
no_relation
no_relation
per:title
no_relation
per:title
no_relation
no_relation
org:top_members/employees
no_relation
no_relation
no_relation
no_relation
org:country_of_headquarters
no_relation
org:top_members/employees
org:top_members/employees
per:parents
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
per:age
no_relation
per:countries_of_residence
no_relation
no_relation
no_relation
no_relation
org:top_members/employees
per:title
no_relation
no_relation
no_relation
no_relation
no_relation
org:top_members/employees
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
per:children
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relation
no_relatio

### Preprocess the Dataset

In [34]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Encode relations
relation_types = ['org:city_of_headquarters', 'per:cities_of_residence', 'org:website', 'org:country_of_headquarters', 'per:origin', 'per:charges', 'org:parents', 'org:alternate_names', 'per:religion', 'per:stateorprovince_of_birth', 'per:other_family', 'org:members', 'org:shareholders', 'per:alternate_names', 'per:children', 'org:member_of', 'per:spouse', 'per:stateorprovinces_of_residence','per:title', 'per:city_of_death', 'per:age', 'per:date_of_death', 'per:country_of_birth', 'no_relation', 'org:number_of_employees/members','per:country_of_death', 'org:political/religious_affiliation', 'per:cause_of_death', 'per:city_of_birth', 'per:employee_of', 'org:dissolved', 'per:siblings', 'org:subsidiaries', 'per:schools_attended','per:date_of_birth', 'per:parents', 'org:top_members/employees', 'org:founded', 'per:stateorprovince_of_death', 'org:stateorprovince_of_headquarters', 'per:countries_of_residence', 'org:founded_by']  # Your list of relation types
label_encoder = LabelEncoder()
label_encoder.fit(relation_types)

    
def preprocess_data(data_item):
    # Tokenize the sentence
    tokens = data_item['token']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
  
    
    # Create attention masks (assuming all tokens are relevant)
    attention_mask = [1] * len(input_ids)
    
    # Identify entity positions and create entity masks
    subj_positions = [0] * len(input_ids)
    obj_positions = [0] * len(input_ids)
    
    # Adjust for [CLS] token
    subj_start = data_item['subj_start'] 
    subj_end = data_item['subj_end'] 
    obj_start = data_item['obj_start'] 
    obj_end = data_item['obj_end'] 
    
    for i in range(subj_start, subj_end + 1):
        subj_positions[i] = 1
    for i in range(obj_start, obj_end + 1):
        obj_positions[i] = 1
    
    data_item['encoded_relation'] = label_encoder.transform([data_item['relation']])[0]

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'subj_positions': subj_positions,
        'obj_positions': obj_positions,
        'labels': data_item['encoded_relation']  # You might need to convert this to a numerical ID
    }

processed_data = [preprocess_data(item) for item in train_data]
val_processed_data = [preprocess_data(item) for item in dev_data]
test_processed_data = [preprocess_data(item) for item in test_data]

### Create a Custom Dataset

In [35]:
class TACREDDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        return {
            'input_ids': torch.tensor(item['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(item['attention_mask'], dtype=torch.long),
            'subj_positions': torch.tensor(item['subj_positions'], dtype=torch.long),
            'obj_positions': torch.tensor(item['obj_positions'], dtype=torch.long),
            'labels': torch.tensor(item['labels'], dtype=torch.long)  # Adjust as necessary
        }



### Create a Data Loader


In [36]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    # Extracting input_ids, attention_mask, subj_positions, obj_positions, and labels
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    subj_positions = [item['subj_positions'] for item in batch]
    obj_positions = [item['obj_positions'] for item in batch]
    labels = [item['labels'] for item in batch]

    # Padding sequences so they are all the same length
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_mask_padded = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    subj_positions_padded = pad_sequence(subj_positions, batch_first=True, padding_value=-100)  # Use an appropriate padding value for positions
    obj_positions_padded = pad_sequence(obj_positions, batch_first=True, padding_value=-100)
    labels = torch.stack(labels)  # Assuming labels can be directly stacked

    return {
        'input_ids': input_ids_padded,
        'attention_mask': attention_mask_padded,
        'subj_positions': subj_positions_padded,
        'obj_positions': obj_positions_padded,
        'labels': labels
    }

# Assuming dataset is an instance of TACREDDataset
dataset = TACREDDataset(processed_data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)


### Verifying the DataLoader

In [37]:
# Fetch the first batch
batch = next(iter(dataloader))

# Inspect the keys and shapes of the batch
print("Batch keys:", batch.keys())
for key, value in batch.items():
    print(f"{key}: Shape = {value.shape}")

# Optionally, inspect the contents more closely for a specific key
print("Input IDs:", batch['input_ids'])
print("Attention Masks:", batch['attention_mask'])
print("Subject Positions:", batch['subj_positions'])
print("Object Positions:", batch['obj_positions'])
print("Labels:", batch['labels'])


Batch keys: dict_keys(['input_ids', 'attention_mask', 'subj_positions', 'obj_positions', 'labels'])
input_ids: Shape = torch.Size([32, 143])
attention_mask: Shape = torch.Size([32, 143])
subj_positions: Shape = torch.Size([32, 143])
obj_positions: Shape = torch.Size([32, 143])
labels: Shape = torch.Size([32])
Input IDs: tensor([[ 101,  100, 1010,  ...,    0,    0,    0],
        [ 101,  100, 1012,  ...,    0,    0,    0],
        [ 101,  100, 2079,  ...,    0,    0,    0],
        ...,
        [ 101,  100,  100,  ...,    0,    0,    0],
        [ 101,  100, 2278,  ...,    0,    0,    0],
        [ 101,  100, 2084,  ...,    0,    0,    0]])
Attention Masks: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
Subject Positions: tensor([[   0,    0,    0,  ..., -100, -100, -100],
        [   0,    0,    0,  ..., -100, -100, -

### Creating the Model

In [24]:
import torch
import torch.nn as nn
from transformers import BertModel

class SubjectObjectAwareModel(nn.Module):
    def __init__(self, num_labels):
        super(SubjectObjectAwareModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        
        # Assuming the hidden size of BERT base model is 768
        self.hidden_size = 768
        self.num_labels = num_labels
        
        # Linear layers for subject and object
        self.subject_linear = nn.Linear(self.hidden_size, self.hidden_size)
        self.object_linear = nn.Linear(self.hidden_size, self.hidden_size)
        
        # Final classifier
        self.classifier = nn.Linear(self.hidden_size * 3, num_labels)  # *3 for [CLS], subject, and object
        
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask, token_type_ids, subject_positions, object_positions):
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)

        sequence_output = outputs.last_hidden_state
        pooled_output = outputs.pooler_output

        # Apply position masks to get subject and object representations
        subject_output = (sequence_output * subject_positions.unsqueeze(-1)).sum(1) / subject_positions.sum(1, keepdim=True)
        object_output = (sequence_output * object_positions.unsqueeze(-1)).sum(1) / object_positions.sum(1, keepdim=True)

        # Pass through the respective linear layers
        subject_output = self.subject_linear(subject_output)
        object_output = self.object_linear(object_output)

        # Concatenate pooled output (CLS token) with subject and object representations
        concat_output = torch.cat((pooled_output, subject_output, object_output), dim=1)
        concat_output = self.dropout(concat_output)

        logits = self.classifier(concat_output)

        return logits


### Training the model

In [25]:

# Model Initialization
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_labels = 42 # Define the number of unique labels in your dataset
model = SubjectObjectAwareModel(num_labels=num_labels)
model.to(device)

# Optimizer and Loss Function
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()

# For illustration, assuming a fixed number of epochs
epochs = 5

# Training Loop
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch_num,batch in enumerate(dataloader, 1):
        batch = {k: v.to(device) for k, v in batch.items()}
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        subject_positions = batch['subj_positions'].to(device)
        object_positions = batch['obj_positions'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask, None, subject_positions, object_positions)
        loss = loss_fn(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if batch_num % 10 == 0:
            print(f'Epoch {epoch+1}/{epochs} | Batch {batch_num}/{len(dataloader)} | Loss: {loss.item()}')


    # Logging
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader)}")




Epoch 1/5 | Batch 10/125 | Loss: 1.1537340879440308
Epoch 1/5 | Batch 20/125 | Loss: 1.4025262594223022
Epoch 1/5 | Batch 30/125 | Loss: 0.5828226804733276
Epoch 1/5 | Batch 40/125 | Loss: 1.1595330238342285
Epoch 1/5 | Batch 50/125 | Loss: 1.417641520500183
Epoch 1/5 | Batch 60/125 | Loss: 0.7839060425758362
Epoch 1/5 | Batch 70/125 | Loss: 1.2582026720046997
Epoch 1/5 | Batch 80/125 | Loss: 1.378350019454956
Epoch 1/5 | Batch 90/125 | Loss: 1.9779138565063477
Epoch 1/5 | Batch 100/125 | Loss: 0.7217606902122498
Epoch 1/5 | Batch 110/125 | Loss: 0.3512522280216217
Epoch 1/5 | Batch 120/125 | Loss: 1.0636122226715088
Epoch 1/5, Loss: 1.1239779334068298
Epoch 2/5 | Batch 10/125 | Loss: 1.9580652713775635
Epoch 2/5 | Batch 20/125 | Loss: 1.1204661130905151
Epoch 2/5 | Batch 30/125 | Loss: 0.6420814990997314
Epoch 2/5 | Batch 40/125 | Loss: 0.9311279058456421
Epoch 2/5 | Batch 50/125 | Loss: 1.0991325378417969
Epoch 2/5 | Batch 60/125 | Loss: 1.2907793521881104
Epoch 2/5 | Batch 70/125 | 

In [None]:
with open('model1.pkl', 'wb') as file:
    pickle.dump(model, file)

### Evaluating the Model

In [27]:
with open('model_filename.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [38]:
# Assuming validation_data is preprocessed
validation_dataset = TACREDDataset(val_processed_data)
validation_dataloader = DataLoader(validation_dataset, batch_size=32, collate_fn=collate_fn)


test_dataset = TACREDDataset(test_processed_data)
test_dataloader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)

def evaluate_model(model, dataloader, device):
    model.eval()  # Set the model to evaluation mode
    predictions = []
    true_labels = []
    total_eval_loss = 0

    with torch.no_grad():  # Deactivate autograd for evaluation
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            subject_positions = batch['subj_positions'].to(device)
            object_positions = batch['obj_positions'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask, None, subject_positions, object_positions)
            
            # Calculate loss
            loss = loss_fn(outputs, labels)
            total_eval_loss += loss.item()
            
            # Convert logits to predictions
            _, preds = torch.max(outputs, dim=1)
            
            # Move preds and labels to CPU
            predictions.extend(preds.detach().cpu().numpy())
            true_labels.extend(labels.detach().cpu().numpy())

    # Calculate the average loss and accuracy
    avg_loss = total_eval_loss / len(dataloader)
    accuracy = accuracy_score(true_labels, predictions)

    print(f"Validation Loss: {avg_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")

    return true_labels,predictions
    
# Assuming you have a validation dataloader named `val_dataloader`
true_labels,predictions = evaluate_model(loaded_model, test_dataloader, device)


Validation Loss: 1.0427
Validation Accuracy: 0.7783


In [39]:
f = open("predictions.txt","w")
for label in predictions:
    decoded_relation = label_encoder.inverse_transform([label])[0]
    f.write(decoded_relation+"\n")
f.close()

f = open("true_labels.txt","w")
for label in true_labels:
    decoded_relation = label_encoder.inverse_transform([label])[0]
    f.write(decoded_relation+"\n")
f.close()

### Verifying Model

In [30]:
from transformers import BertTokenizer
import torch

# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = SubjectObjectAwareModel(num_labels=2)  # Adjust `num_labels` as per your task

# Create a synthetic example
text = "Hello, my name is John."
inputs = tokenizer(text, return_tensors="pt")
subject_positions = torch.tensor([[0, 0, 1, 0, 0, 0, 0, 0, 0]])  # Example positions
object_positions = torch.tensor([[0, 0, 0, 0, 1, 0, 0, 0, 0]])

# Forward pass
with torch.no_grad():
    logits = model(**inputs, subject_positions=subject_positions, object_positions=object_positions)

print("Logits:", logits)


Logits: tensor([[ 0.1709, -0.1999]])


In [31]:
# Dummy dataset
inputs = tokenizer(["Hello, my name is John.", "Hi, I'm Jane."], padding=True, return_tensors="pt")
subject_positions = torch.tensor([[0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0]])
object_positions = torch.tensor([[0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0]])
labels = torch.tensor([0, 1])  # Example labels

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

# Training step
model.train()
for epoch in range(3):  # A few epochs for demonstration
    optimizer.zero_grad()
    logits = model(**inputs, subject_positions=subject_positions, object_positions=object_positions)
    loss = torch.nn.functional.cross_entropy(logits, labels)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 0.693174421787262
Epoch 2, Loss: 0.4813253879547119
Epoch 3, Loss: 0.3820960521697998
