In [6]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoConfig, AutoTokenizer

HASH_NAME = "kmyc4k23vc47"

# Run-wide settings shared by the cells below.
# NOTE(review): in the cells shown here only a few keys (e.g. "model_name",
# "num_classes") are actually read; the optimiser/scheduler/fold keys look like
# leftovers from the training run -- confirm before relying on them.
CONFIG = {
    # Run identity / training-loop settings
    "seed": 42,
    "epochs": 3,
    "model_name": "microsoft/deberta-v3-base",
    "train_batch_size": 8,
    "valid_batch_size": 16,
    "max_length": 512,
    # Optimiser / scheduler settings
    "learning_rate": 1e-5,
    "scheduler": "CosineAnnealingLR",
    "min_lr": 1e-6,
    "T_max": 500,
    "weight_decay": 1e-6,
    # Cross-validation / accumulation / output size
    "n_fold": 3,
    "n_accumulate": 1,
    "num_classes": 2,
    # First CUDA device when one is visible, otherwise the CPU
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    # Experiment bookkeeping
    "hash_name": HASH_NAME,
    "_wandb_kernel": "deb",
}

# Entries derived from the values above: the tokenizer for the configured
# checkpoint and a group label built from the run hash.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG["model_name"])
CONFIG["group"] = f"{HASH_NAME}-Baseline"

# Mean pooling class definition
class MeanPooling(nn.Module):
    """Masked mean over dimension 1 of a hidden-state tensor.

    Positions whose attention-mask value is 0 contribute nothing to the sum,
    and the divisor is the per-row sum of the mask instead of the full length.
    """

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted average of ``last_hidden_state`` along dim 1.

        Args:
            last_hidden_state: tensor to pool (typically batch x tokens x hidden
                -- the code only requires that dim 1 is the axis to reduce).
            attention_mask: tensor with one entry per (row, token); it gets a
                trailing axis and is expanded to ``last_hidden_state``'s shape.

        Returns:
            ``last_hidden_state`` with dimension 1 reduced away.
        """
        # Broadcast the mask across the feature axis and switch to float so it
        # can be used as a multiplicative weight.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * weights).sum(dim=1)
        # A fully masked row would divide by zero; the floor keeps the result
        # finite (the numerator is 0 in that case, so the mean comes out as 0).
        token_counts = weights.sum(dim=1).clamp(min=1e-9)
        return masked_total / token_counts

# Model class definition
class HP_Model(nn.Module):
    """Pretrained transformer encoder followed by masked mean pooling,
    dropout and a linear classification head.

    The head's output width is read from the module-level ``CONFIG`` dict
    (``CONFIG['num_classes']``), so ``CONFIG`` must exist before instantiation.
    """

    def __init__(self, model_name):
        """Build the network for the checkpoint identified by ``model_name``."""
        super().__init__()
        # Attribute names are deliberately unchanged: they determine the
        # state_dict keys that a saved checkpoint is matched against.
        self.model = AutoModel.from_pretrained(model_name)
        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, CONFIG['num_classes'])

    def forward(self, ids, mask):
        """Encode ``ids``, pool over tokens using ``mask`` and return the head's raw outputs."""
        encoded = self.model(input_ids=ids, attention_mask=mask, output_hidden_states=False)
        pooled = self.pooler(encoded.last_hidden_state, mask)
        # Dropout is only active in training mode; it is a no-op after .eval().
        return self.fc(self.drop(pooled))

# Build the network from the checkpoint name declared in CONFIG so the model
# and the tokenizer stored in CONFIG cannot drift apart.
model_name = CONFIG["model_name"]
model = HP_Model(model_name)

# Load the fine-tuned weights saved under this run's hash (the file name
# suggests the fold-0 checkpoint selected on loss -- confirm against training).
model_path = f"{HASH_NAME}-Loss-Fold-0.bin"
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints you produced yourself; on PyTorch versions that
# support it, consider passing weights_only=True.
# map_location keeps the tensors on the CPU regardless of where they were saved.
model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))

# Set the model to evaluation mode (disables dropout)
model.eval()

print("Model loaded and ready for inference.")




Model loaded and ready for inference.


In [7]:
import pandas as pd
td_file_path = "/fp/projects01/ec307/ec-krimhau/TD_dataset"
td_file_name = "TD_Chromium_dataset_clean.csv"
TD_dataset = pd.read_csv(f"{td_file_path}/{td_file_name}", index_col = 0)
!ls /fp/projects01/ec307/ec-krimhau/TD_dataset

TD_Chromium_dataset_clean.csv  TD_dataset_clean.csv  TD_jira_TD_dataset.csv


In [8]:
# TD_dataset was already read from disk in the previous cell; re-reading the
# same CSV here was redundant I/O, so this cell only displays the frame.
TD_dataset

Unnamed: 0,text,label
0,network throttling does not throttle uploads u...,1
1,parallelize test execution to speed up buildbo...,1
2,netfilter switch to pure pullbased filter api ...,1
3,huge animated gifs can lead to scroll jank use...,1
4,issues with pdf viewer and iframe useragent w...,1
...,...,...
1495,wtperf multi btree wtperf dump core mac issues...,0
1496,scrub dirty page rather evicting single page r...,1
1497,include src include wiredtigerext h problemati...,0
1498,java freed memory overwrite handle close cause...,1


In [None]:
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer


# Alias, not a copy: `df` and `TD_dataset` refer to the same DataFrame object,
# so in-place changes made through either name are visible through the other.
df = TD_dataset

# Dataset class definition
class TDDataset(Dataset):
    """Map-style dataset that tokenizes the ``text`` column of a DataFrame on access.

    Args:
        dataframe: DataFrame with a ``text`` column and a ``label`` column.
        tokenizer: callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns
            a mapping with ``input_ids`` and ``attention_mask``.
        max_len: every sample is truncated/padded to exactly this many tokens.
    """

    def __init__(self, dataframe, tokenizer, max_len=512):
        self.tokenizer = tokenizer
        self.texts = dataframe['text'].values
        # Labels are stored for reference only; __getitem__ does not return them.
        self.labels = dataframe['label'].values
        self.max_len = max_len

    def __len__(self):
        """Number of rows (texts) in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text.

        Returns:
            dict with ``'ids'`` (token ids) and ``'mask'`` (attention mask),
            both 1-D ``torch.long`` tensors built from the tokenizer output.
        """
        # Call the tokenizer directly: `encode_plus` is deprecated in
        # transformers in favour of `__call__`, which takes the same options.
        encoded = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',      # fixed-length output so batches stack
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
        )
        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
        }

# Reuse the tokenizer already loaded into CONFIG (built from the same
# checkpoint name as the model) instead of loading a second copy.
tokenizer = CONFIG["tokenizer"]

# Create dataset; the sequence length comes from the shared configuration.
test_dataset = TDDataset(df, tokenizer, max_len=CONFIG["max_length"])
# shuffle=False keeps the batches in the row order of `df`.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the prediction function
def predict(model, data_loader):
    """Run ``model`` over every batch of ``data_loader`` without gradients.

    The model is switched to evaluation mode (and left in it). Batches are
    moved to whatever device the model's parameters live on, so the function
    works for CPU and accelerator models alike.

    Args:
        model: module called as ``model(ids, mask)`` and returning a tensor.
        data_loader: iterable of dicts with tensor entries ``'ids'`` and ``'mask'``.

    Returns:
        list with one output tensor per batch, in iteration order, each on the CPU.
    """
    model.eval()

    # Follow the model's own device instead of assuming CPU; a parameter-free
    # model falls back to the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['ids'].to(device)
            mask = batch['mask'].to(device)
            outputs = model(ids, mask)
            # Keep collected results on the CPU so accelerator memory is not
            # held for the whole run (no-op when already on the CPU).
            predictions.append(outputs.cpu())

    return predictions

# Perform inference (the model is placed on the CPU for this run)
model.to(torch.device('cpu'))
test_predictions = predict(model, test_loader)

# Stitch the per-batch outputs into a single (rows, num_classes) tensor and
# take the highest-scoring class per row.
# NOTE(review): argmax-as-prediction assumes the head was trained as a
# standard multi-class classifier -- confirm against the training code.
test_logits = torch.cat(test_predictions)
test_labels = test_logits.argmax(dim=1)

# Show a compact summary instead of printing every raw batch tensor.
print(f"logits shape: {tuple(test_logits.shape)}")
print(f"predicted class counts: {torch.bincount(test_labels, minlength=CONFIG['num_classes']).tolist()}")
print(test_logits[:5])
