In [1]:
import json
import pandas as pd
import numpy as np
import torch
import random

from torch import nn
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
from torch.utils.data import Dataset as TorchDataset

from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset, Dataset
from typing import List, Dict
from sklearn.metrics import pairwise_distances
from dataclasses import dataclass
from typing import Optional, Tuple



In [2]:

# 加载数据
with open('all_bug_report.json', 'r') as f:
    data = json.load(f)

# 转换数据为 DataFrame
bug_report = pd.DataFrame(data)


In [3]:
bug_report.head(5)

Unnamed: 0,ID,nonbug/bug,root cause,impact,title,summary,comments
0,16,bug,semantic,unk,Comparison operators,The front end doesn t understand etc. Hilariou...,I got lost somewhere inside semantic checking ...
1,22,nonbug,,,porting the examples in python using the provi...,As in the title do you have any plan to provid...,Hi mscipio thank you for the query. We will pr...
2,29,bug,document,warning style error,in Start example typo,in Docs Let s start with a simple example is a...,in docs source introduction.rst not fix yet.\n
3,34,invalid,cannot reproduce,,Determine whether a loop should be parallelize...,ISL internally dictates if loops should be run...,can you add some description for this wsmoses ...
4,35,bug,document,operation failure,Investigate install TC instructions in docker,there are permission denied issues with git cl...,fixed by 33 \n \nasking users to do ssh keys s...


In [4]:
bug_report['nonbug/bug'] = bug_report['nonbug/bug'].fillna('invalid')
keep_causes = [
    'semantic', 'question', 'enhancement', 'feature request', 'compatibility', 'document',
    'new function', 'environment', 'memory'
]
# 使用 apply 方法检查并设置 NaN
bug_report['root cause'] = bug_report['root cause'].apply(lambda x: x if x in keep_causes else np.nan)
keep_causes = [
    'semantic', 'question', 'enhancement', 'feature request', 'compatibility', 'document',
    'new function', 'environment', 'memory'
]

# 使用 apply 方法检查并设置 NaN
bug_report['root cause'] = bug_report['root cause'].apply(lambda x: x if x in keep_causes else np.nan)
bug_report['root cause'].value_counts()



root cause
semantic           488
question           227
enhancement        116
feature request    105
compatibility       78
document            60
new function        60
environment         36
memory              14
Name: count, dtype: int64

In [5]:
keep_impacts = [
    'crash/exception','build/compilation error','wrong output',
    'operation failure','warning style error'
]
bug_report['impact'] = bug_report['impact'].replace({
    'build error': 'build/compilation error',
    'compilation error': 'build/compilation error', 
    'build/compilation failure':'build/compilation error',
    'warningstyleerror': 'warning style error',
    'warningstylerrot': 'warning style error'  ,
    'operationfailure': 'operation failure',
    'wrongoutput': 'wrong output',
})
bug_report['impact'] = bug_report['impact'].apply(lambda x: np.nan if x == '' else (x if x in keep_impacts else 'others'))
bug_report['impact'].value_counts()


impact
crash/exception            308
build/compilation error    108
others                      82
wrong output                79
operation failure           34
Name: count, dtype: int64

In [6]:
bug_report= pd.get_dummies(bug_report, columns=['impact', 'root cause', 'nonbug/bug'], dummy_na=True)
bug_report['merged_text'] = bug_report['title'] + " " + bug_report['summary']  
bug_report.head(5)


Unnamed: 0,ID,title,summary,comments,impact_build/compilation error,impact_crash/exception,impact_operation failure,impact_others,impact_warning style error,impact_wrong output,...,root cause_memory,root cause_new function,root cause_question,root cause_semantic,root cause_nan,nonbug/bug_bug,nonbug/bug_invalid,nonbug/bug_nonbug,nonbug/bug_nan,merged_text
0,16,Comparison operators,The front end doesn t understand etc. Hilariou...,I got lost somewhere inside semantic checking ...,False,False,False,True,False,False,...,False,False,False,True,False,True,False,False,False,Comparison operators The front end doesn t und...
1,22,porting the examples in python using the provi...,As in the title do you have any plan to provid...,Hi mscipio thank you for the query. We will pr...,False,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,porting the examples in python using the provi...
2,29,in Start example typo,in Docs Let s start with a simple example is a...,in docs source introduction.rst not fix yet.\n,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,in Start example typo in Docs Let s start with...
3,34,Determine whether a loop should be parallelize...,ISL internally dictates if loops should be run...,can you add some description for this wsmoses ...,False,False,False,False,False,False,...,False,False,False,False,True,False,True,False,False,Determine whether a loop should be parallelize...
4,35,Investigate install TC instructions in docker,there are permission denied issues with git cl...,fixed by 33 \n \nasking users to do ssh keys s...,False,False,True,False,False,False,...,False,False,False,False,False,True,False,False,False,Investigate install TC instructions in docker ...


In [7]:


global_model = None
global_tokenizer = None

#加载高频掩码词
def load_professional_vocab(json_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        vocab = json.load(f)
    # Assuming vocab is a list of words
    return set(vocab)

# 自定义分词器
class BugReportDataset(Dataset):
    def __init__(self, dataframe: pd.DataFrame, tokenizer: RobertaTokenizer,
                 max_length: int, professional_vocab: set, mask_prob=0.15):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.professional_vocab = professional_vocab
        self.mask_prob = mask_prob

    def __len__(self):
        return len(self.data)

    def mask_tokens(self, inputs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:


        labels = inputs.clone()
        probability_matrix = torch.full(labels.shape, self.mask_prob)
    
        # 获取专业词汇的 token 索引
        vocab_indices = []
        for word in self.professional_vocab:
            token = self.tokenizer.convert_tokens_to_ids(word)
            if token is not None:
                vocab_indices.append(token)
        if vocab_indices:
            mask_indices = torch.zeros(labels.shape, dtype=torch.bool)
            for idx in vocab_indices:
                mask_indices = mask_indices | (labels == idx)
            # 增加专业词汇的掩码概率
            probability_matrix = torch.where(mask_indices, 0.5, probability_matrix)
        
        # 不掩码特殊 token
        special_tokens_mask = self.tokenizer.get_special_tokens_mask(labels.tolist(), already_has_special_tokens=True)
        special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # 仅对掩码的 token 计算损失

      
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_tokens = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_tokens[indices_random]

        return inputs, labels


    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        title = row['merged_text']
        comments = row['comments']

       
        title_enc = self.tokenizer(title,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=self.max_length,
                                   return_tensors='pt')
    
        comments_enc = self.tokenizer(comments,
                                      truncation=True,
                                      padding='max_length',
                                      max_length=self.max_length,
                                      return_tensors='pt')
        
    
        title_input_ids, title_labels = self.mask_tokens(title_enc['input_ids'].squeeze())
        comments_input_ids, comments_labels = self.mask_tokens(comments_enc['input_ids'].squeeze())

        return {
            'title_input_ids': title_input_ids,
            'title_attention_mask': title_enc['attention_mask'].squeeze(),
            'title_labels': title_labels,
            'comments_input_ids': comments_input_ids,
            'comments_attention_mask': comments_enc['attention_mask'].squeeze(),
            'comments_labels': comments_labels
        }


class RobertaForMLMAndContrastive(nn.Module):
    def __init__(self, model_name='roberta-base', temperature=0.07):
        super(RobertaForMLMAndContrastive, self).__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.mlm_head = RobertaMLMHead(self.roberta.config)  # 已修改
        self.temperature = temperature
        self.cosine_similarity = nn.CosineSimilarity(dim=-1)

    def forward(self, title_input_ids, title_attention_mask, title_labels,
                comments_input_ids, comments_attention_mask, comments_labels):
       
        title_outputs = self.roberta(input_ids=title_input_ids, attention_mask=title_attention_mask)
        title_hidden_states = title_outputs.last_hidden_state
        title_mlm_logits = self.mlm_head(title_hidden_states)

        
        comments_outputs = self.roberta(input_ids=comments_input_ids, attention_mask=comments_attention_mask)
        comments_hidden_states = comments_outputs.last_hidden_state
        comments_mlm_logits = self.mlm_head(comments_hidden_states)

        
        loss_fct = nn.CrossEntropyLoss()
        title_mlm_loss = loss_fct(title_mlm_logits.view(-1, self.roberta.config.vocab_size),
                                  title_labels.view(-1))
        comments_mlm_loss = loss_fct(comments_mlm_logits.view(-1, self.roberta.config.vocab_size),
                                     comments_labels.view(-1))
        mlm_loss = (title_mlm_loss + comments_mlm_loss) / 2

       
        title_cls = title_hidden_states[:, 0, :]  # (batch_size, hidden_size)
        comments_cls = comments_hidden_states[:, 0, :]  # (batch_size, hidden_size)

        
        title_norm = title_cls / title_cls.norm(dim=1)[:, None]
        comments_norm = comments_cls / comments_cls.norm(dim=1)[:, None]

      
        similarity_matrix = torch.matmul(title_norm, comments_norm.T) / self.temperature
        batch_size = title_input_ids.size(0)
        labels = torch.arange(batch_size).to(title_input_ids.device)

        contrastive_loss_fct = nn.CrossEntropyLoss()
        contrastive_loss_title = contrastive_loss_fct(similarity_matrix, labels)
        contrastive_loss_comments = contrastive_loss_fct(similarity_matrix.T, labels)
        contrastive_loss = (contrastive_loss_title + contrastive_loss_comments) / 2

        
        total_loss = (mlm_loss + contrastive_loss) / 2

        return total_loss, mlm_loss, contrastive_loss

class RobertaMLMHead(nn.Module):
    def __init__(self, config):
        super(RobertaMLMHead, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.GELU()
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=True)  

       
        self.decoder.bias.data.zero_()  
        self.decoder.weight = self.decoder.weight  

    def forward(self, hidden_states):
        x = self.dense(hidden_states)
        x = self.activation(x)
        x = self.LayerNorm(x)
        x = self.decoder(x)
        return x


def train(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()

        batch = {k: v.to(device) for k, v in batch.items()}
        loss, mlm_loss, contrastive_loss = model(
            title_input_ids=batch['title_input_ids'],
            title_attention_mask=batch['title_attention_mask'],
            title_labels=batch['title_labels'],
            comments_input_ids=batch['comments_input_ids'],
            comments_attention_mask=batch['comments_attention_mask'],
            comments_labels=batch['comments_labels']
        )
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    return avg_loss

def main():
    #一些设置 超参数 高频词 分词器
    MODEL_NAME = 'roberta-base'
    MAX_LENGTH = 128
    BATCH_SIZE = 16
    EPOCHS = 8
    LEARNING_RATE = 5e-5
    PROFESSIONAL_VOCAB_JSON = 'frequent_words.json'  

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
    professional_vocab = load_professional_vocab(PROFESSIONAL_VOCAB_JSON)

    #数据加载
    dataset = BugReportDataset(bug_report, tokenizer, MAX_LENGTH, professional_vocab)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    # 模型
    model = RobertaForMLMAndContrastive(model_name=MODEL_NAME)
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = len(dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=int(0.1 * total_steps),
                                                num_training_steps=total_steps)

    # 训练
    for epoch in range(EPOCHS):
        avg_loss = train(model, dataloader, optimizer, scheduler, device)
        print(f"Epoch {epoch+1}/{EPOCHS} - Average Loss: {avg_loss:.4f}")



    torch.save(model.state_dict(), 'roberta_bug_report_model.pt')
    tokenizer.save_pretrained('roberta_bug_report_model')

    global_model = model 
    global_tokenizer = tokenizer


if __name__ == "__main__":
    main()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/8 - Average Loss: 5.6267
Epoch 2/8 - Average Loss: 3.8556
Epoch 3/8 - Average Loss: 3.3625
Epoch 4/8 - Average Loss: 2.9597
Epoch 5/8 - Average Loss: 2.7481
Epoch 6/8 - Average Loss: 2.6038
Epoch 7/8 - Average Loss: 2.4849
Epoch 8/8 - Average Loss: 2.4415


In [8]:
file_path = 'bug_report.csv'
bug_report.to_csv(file_path, index=False)