# Project: BERT classifier for whether a sentence contains software-related words

In [None]:
import json
import xmltodict
import re
import os
from transformers import AutoTokenizer, AutoModel,AdamW
import numpy as np
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader,WeightedRandomSampler
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
import random

In [None]:
from sklearn.metrics import precision_score, recall_score,f1_score
from ignite.engine import Engine, Events
from ignite.metrics import Accuracy, Loss, RunningAverage, Precision, Recall
from ignite.handlers import ModelCheckpoint, EarlyStopping
from ignite.contrib.handlers import ProgressBar
from ignite.utils import manual_seed
from ignite import metrics
from ignite.metrics import precision

In [None]:
# Unpack the bundled archive into the current working directory.
import zipfile
with zipfile.ZipFile('unlabeled_data.zip', 'r') as archive:
    archive.extractall('./')

# Read File

In [None]:
# Folder holding the annotated XMI documents, and the names of everything inside it.
xmipath='./train/'
xmifilelist=os.listdir(xmipath)

In [None]:
# Keep only the directory entries whose name contains ".xmi".
Cleanxmifilelist=[entry for entry in xmifilelist if ".xmi" in entry]

In [None]:
# Convert every XMI annotation file into a JSON file with the same base name.
temppath="./newTrain_json/"
# Create the output folder up front: open(..., "w") does not create missing directories.
os.makedirs(temppath, exist_ok=True)
for xmi_name in Cleanxmifilelist:
    # The with-blocks close both files, so no explicit close() calls are needed.
    with open(xmipath+xmi_name) as xml_file:
        data_dict = xmltodict.parse(xml_file.read())
    # Swap only the trailing extension. The previous regex ".xmi" treated the dot
    # as a wildcard and rewrote any "<char>xmi" sequence anywhere in the name.
    base_name, _ = os.path.splitext(xmi_name)
    jsonpath=temppath+base_name+".json"
    with open(jsonpath, "w") as json_file:
        json_file.write(json.dumps(data_dict))

In [None]:
#Token tool
from nltk import word_tokenize
from nltk.tokenize import MWETokenizer

def multiword_tokenize(text, mwe):
    """Tokenize ``text`` while keeping each expression in ``mwe`` as one token.

    Args:
        text: the string to tokenize.
        mwe: list of expressions (possibly multi-word) that must survive
            tokenization as single tokens.

    Returns:
        A list of tokens in which every protected expression appears with its
        original spelling from ``mwe``.
    """
    # Each protected expression is word-tokenized so MWETokenizer can find it
    # in the word-tokenized text.
    expression_parts = [word_tokenize(expression) for expression in mwe]
    merged_forms = ['_'.join(parts) for parts in expression_parts]
    tokens = MWETokenizer(expression_parts).tokenize(word_tokenize(text))
    # Merged tokens come back underscore-joined; map them back to the original
    # expression (the first matching expression wins).
    return [
        mwe[merged_forms.index(token)] if token in merged_forms else token
        for token in tokens
    ]

# Smoke test: '20-30' and 'my bike' should each come back as a single token.
mwe = ['20-30', 'my bike']
a=multiword_tokenize('Yes 20-30 minutes a day on my bike, it works great!!', mwe)
print(a)

In [None]:
jsonpath='./newTrain_json/'
# Skip Jupyter's checkpoint folder when it exists. Filtering (rather than
# list.remove) avoids a ValueError on machines where the folder is absent.
jsonfilelist=[name for name in os.listdir(jsonpath) if name != '.ipynb_checkpoints']

In [None]:
# Prefix every JSON file name with its folder to get a usable path.
jsonpathlist=[jsonpath+name for name in jsonfilelist]

In [None]:
#test all
import pandas as pd

# Column layout of the per-sentence table built below.
SENTENCE_COLUMNS = ['Docid', 'Sentencenumber', 'Sentence', 'Bilabel', 'SFname', 'original_sent']


def _software_mentions(jdata):
    """Return the annotated mention strings of one parsed XMI document.

    The annotations live under 'typesystem:ClampNameEntityUIMA'. The value is
    a single mapping when there is one annotation and a list when there are
    several; a document without that key yields no mentions.
    """
    xmi = jdata['xmi:XMI']
    text = xmi['cas:Sofa']["@sofaString"]
    entities = xmi.get('typesystem:ClampNameEntityUIMA')
    if isinstance(entities, dict):
        # isinstance also accepts dict subclasses, which type(x)==dict rejected.
        entities = [entities]
    elif not isinstance(entities, list):
        return []
    return [text[int(entity['@begin']):int(entity['@end'])] for entity in entities]


def _sentence_frame(path):
    """Build the per-sentence rows for the JSON document stored at ``path``.

    Each sentence (split on ".\\n") is tokenized with the document's mentions
    protected as single tokens, then labelled "software" when at least one
    mention appears among its tokens. Sentences that tokenize to nothing are
    skipped.
    """
    with open(path) as jsonfile:
        jdata = json.load(jsonfile)
    sfword = _software_mentions(jdata)
    mention_set = set(sfword)
    # Document id = file name without folder and extension. The previous
    # regexes used an unescaped "." and could strip text from inside the name.
    docid = os.path.splitext(os.path.basename(path))[0]
    sentences = jdata['xmi:XMI']['cas:Sofa']["@sofaString"].split(".\n")
    rows = []
    for number, sentence in enumerate(sentences):
        tksentence = multiword_tokenize(sentence, sfword)
        tkstring = ",".join(tksentence)
        if tkstring == '':
            continue
        # Sorted so the SFname string is deterministic across runs.
        found = sorted(set(tksentence) & mention_set)
        rows.append({
            'Docid': docid,
            'Sentencenumber': str(number),
            'Sentence': tkstring,
            'Bilabel': "software" if found else "not software",
            'SFname': ",".join(found),
            'original_sent': " ".join(tksentence),
        })
    return pd.DataFrame(rows, columns=SENTENCE_COLUMNS)


# DataFrame.append was removed in pandas 2.0 and copied the whole table on
# every call; collect the per-document frames and concatenate them once.
frames = [_sentence_frame(path) for path in jsonpathlist]
if frames:
    Alldf = pd.concat(frames, ignore_index=True)
else:
    Alldf = pd.DataFrame(columns=SENTENCE_COLUMNS)

# NOTE(review): the id is a plain concatenation, so ("1", "23") and ("12", "3")
# produce the same new_index -- confirm document ids cannot collide this way.
Alldf['new_index']=Alldf['Docid']+Alldf['Sentencenumber']

In [None]:
# Binary target: 1 where Bilabel is 'software', otherwise 0.
Alldf['label']=np.where(Alldf['Bilabel']=='software',1,0)

In [None]:
# Preview the first rows of the assembled sentence table.
Alldf.head()

# Train and Validation Dataset

In [None]:
# Hold out 10% of the sentences for validation; random_state makes the split repeatable.
train_inputs,validation_inputs= train_test_split(Alldf,random_state=999,test_size=0.1)

In [None]:
# Keep only id, sentence text and binary label, in this order: the simpletrain
# dataset reads these columns by position (0, 1, 2).
traindf=train_inputs[['new_index','original_sent','label']]
validationdf=validation_inputs[['new_index','original_sent','label']]

In [None]:
#create train label

train_label={}

for i in range(len(train_inputs)):
    label=[]
    index=train_inputs.new_index.values[i]
    sfname=train_inputs.SFname.values[i]
    if len(sfname)>0:
        newsfname=sfname.split()
        sfname=train_inputs.SFname.values[i].split()
        tempsent=train_inputs.original_sent.values[i].split()
        intlist=[0]*len(tempsent)
        both = list(set(sfname).intersection(tempsent))
        indices_B = [tempsent.index(x) for x in both]
        for i in indices_B:
            intlist[i]=1
        label=intlist
    elif len(sfname)==0:
        label=[0]*len(train_inputs.original_sent.values[i].split())
    train_label[index]=label

In [None]:
#create train label

train_label={}

for i in range(len(train_inputs)):
    label=[]
    index=train_inputs.new_index.values[i]
    sfname=train_inputs.SFname.values[i]
    if len(sfname)>0:
        newsfname=sfname.split()
        sfname=train_inputs.SFname.values[i].split()
        tempsent=train_inputs.original_sent.values[i].split()
        intlist=[0]*len(tempsent)
        both = list(set(sfname).intersection(tempsent))
        indices_B = [tempsent.index(x) for x in both]
        for i in indices_B:
            intlist[i]=1
        label=intlist
    elif len(sfname)==0:
        label=[0]*len(train_inputs.original_sent.values[i].split())
    train_label[index]=label

# Padding and Mask

In [None]:
def paddingdict(input_dict,padingnumber,druglist):
    """Pad every sequence to the longest length found in ``input_dict``.

    Args:
        input_dict: mapping from key to a list of label values.
        padingnumber: value appended to sequences shorter than the maximum.
        druglist: keys to emit. A key that is missing from ``input_dict`` or
            maps to an empty sequence yields an all-padding row with an
            all-zero mask.

    Returns:
        ``(padded, mask)``: two dicts keyed like ``druglist``. Both hold 1-D
        numpy arrays of the same length; the mask is 1 on real positions and
        0 on padding.
    """
    # default=0 keeps an empty input_dict from raising.
    maxnu = max((len(seq) for seq in input_dict.values()), default=0)
    print(maxnu)
    new_dict = {}
    new_dict_mask = {}
    for key in druglist:
        value = input_dict.get(key)
        # Treat a missing key like an empty sequence instead of relying on a
        # bare except, which never triggered for the mask computation.
        value = [] if value is None else list(value)
        pad = maxnu - len(value)
        new_dict[key] = np.array(value + [padingnumber] * pad)
        new_dict_mask[key] = np.array([1] * len(value) + [0] * pad)
    return new_dict, new_dict_mask


In [None]:
def Vpaddingdict(input_dict,padingnumber,druglist,maxlen=384):
    """Pad or truncate every sequence to a fixed length.

    Args:
        input_dict: mapping from key to a list of label values.
        padingnumber: value appended to sequences shorter than ``maxlen``.
        druglist: keys to emit. A key that is missing from ``input_dict`` or
            maps to an empty sequence yields an all-padding row with an
            all-zero mask.
        maxlen: target length; the default of 384 equals the tokenizer
            ``max_length`` used by the dataset class in this notebook.

    Returns:
        ``(padded, mask)``: two dicts keyed like ``druglist``, each holding
        1-D numpy arrays of exactly ``maxlen`` entries; the mask is 1 on real
        positions and 0 on padding.
    """
    maxnu = maxlen
    print(maxnu)
    new_dict = {}
    new_dict_mask = {}
    for key in druglist:
        value = input_dict.get(key)
        # Truncate over-long sequences so every row has exactly maxnu entries;
        # previously they produced longer arrays and a negative pad count.
        value = ([] if value is None else list(value))[:maxnu]
        pad = maxnu - len(value)
        new_dict[key] = np.array(value + [padingnumber] * pad)
        new_dict_mask[key] = np.array([1] * len(value) + [0] * pad)
    return new_dict, new_dict_mask

In [None]:
# Train: pad the per-word training labels with 0 and build the matching masks.
new_train_label_dict, new_train_label_dict_mask=paddingdict(train_label,0,list(train_label.keys()))

In [None]:
# Validation: pad the per-word validation labels with 0 and build the matching masks.
new_validation_label_dict, new_validation_label_mask=Vpaddingdict(validation_label,0,list(validation_label.keys()))

# Model

In [None]:
class simpletrain(Dataset):
    """Sentence dataset that tokenizes rows of a DataFrame on access.

    Rows are read by position: column 1 is the sentence text and column 2 is
    its label. The label dictionaries are stored but not read by this class.
    """

    def __init__(self, dataset,label_dict,label_dict_mask):
        """Keep the frame and label dictionaries and load the BERT tokenizer."""
        self.data = dataset
        self.label_dict = label_dict
        self.label_dict_mask = label_dict_mask
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(tokenizer output, label)`` for row ``idx``.

        The sentence is padded/truncated to 384 tokens and returned as
        PyTorch tensors.
        """
        row_text = self.data.iloc[idx, 1]
        row_label = self.data.iloc[idx, 2]
        encoded = self.tokenizer(
            row_text,
            padding='max_length',
            truncation=True,
            max_length=384,
            return_tensors="pt",
        )
        return (encoded, row_label)

In [None]:
class bert_rnn(nn.Module):
    """Pretrained encoder with a per-token head and a sentence-level head.

    ``args`` is a dict with the keys ``modelname`` (checkpoint passed to
    ``AutoModel.from_pretrained``), ``num_labels`` (output size of the
    sentence head) and ``drop_out`` (dropout probability).
    """

    def __init__(self, args):
        super().__init__()
        self.args = args

        self.emb = AutoModel.from_pretrained(self.args['modelname'])
        self.emb_size = self.emb.config.hidden_size
        # Per-token head (two classes) and sentence head.
        self.lin1 = nn.Linear(self.emb_size, 2)
        self.lin2 = nn.Linear(self.emb_size, self.args['num_labels'])
        self.dropout = nn.Dropout(self.args['drop_out'])

    def forward(self,data1,mask1):
        """Encode a batch and return ``(token_classes, sentence_logits)``.

        Args:
            data1: token ids passed to the encoder as ``input_ids``.
            mask1: attention mask passed to the encoder.

        Returns:
            ner: argmax over the per-token head's last dimension.
            seq_class: raw, unnormalised scores of the sentence head.
        """
        emb1 = self.emb(input_ids=data1, attention_mask=mask1)
        token_scores = self.lin1(torch.relu(self.dropout(emb1['last_hidden_state'])))
        ner = torch.argmax(token_scores, 2)
        # Return raw logits: the training step feeds these to
        # nn.CrossEntropyLoss, which applies log-softmax itself. Squashing
        # them through a sigmoid first caps the loss's confidence, while
        # argmax-based evaluation gives the same result either way.
        seq_class = self.lin2(torch.relu(self.dropout(emb1['pooler_output'])))

        return ner,seq_class

In [None]:
# Settings read by bert_rnn: checkpoint name, sentence-head output size, dropout probability.
config={'modelname':'bert-base-uncased',
        'num_labels': 2,
        'drop_out':0.2,
    }

In [None]:
def count_parameters(model):
    """Return the total number of elements in the parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Ignite Early stopping and check point

In [None]:
# Fix the random seed through ignite's manual_seed helper.
SEED = 1234
manual_seed(SEED)

In [None]:
# Use CUDA when available, otherwise the CPU. This name is reassigned in a later cell before the model is built.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Training dataset and loader: shuffled mini-batches of 5 sentences.
train=simpletrain(traindf,new_train_label_dict, new_train_label_dict_mask)
train_loader = DataLoader(train, batch_size=5, shuffle=True)

In [None]:
# Validation dataset and loader: mini-batches of 2 sentences in fixed order.
validation=simpletrain(validationdf,new_validation_label_dict, new_validation_label_mask)
validation_loader = DataLoader(validation, batch_size=2, shuffle=False)

In [None]:
# Prefer GPU index 3 (the device this notebook was written for), but fall back
# to whichever CUDA device is current, or to the CPU, instead of crashing on
# machines with fewer than four GPUs.
cuda=torch.cuda.is_available()
preferred_gpu=3
if cuda and torch.cuda.device_count() > preferred_gpu:
    device=torch.device('cuda', preferred_gpu)
    torch.cuda.set_device(device)
elif cuda:
    device=torch.device('cuda', torch.cuda.current_device())
else:
    device=torch.device('cpu')
model=bert_rnn(config)
model.to(device)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(count_parameters(model))
params=model.parameters()
optimizer = AdamW(params,lr = 2e-5, eps = 1e-8 )

In [None]:
# loss2 (cross-entropy) is the loss used by process_function; loss1 is not
# referenced anywhere else in the visible notebook.
loss1 = nn.BCELoss()
loss2= nn.CrossEntropyLoss()

In [None]:
#early stoping
def process_function(engine,batch):
  
    model.train()
    
    optimizer.zero_grad()

    sent,sent_label=batch
   
    sent_label=sent_label.to(device)
    

    
    _,sent_pred=model(torch.squeeze(sent['input_ids']).to(device),torch.squeeze(sent['attention_mask']).to(device))
    
    
    alloss=loss2(sent_pred,sent_label.long())
    
    
    alloss.backward()
    
    optimizer.step()
    
    return alloss.item()

In [None]:
#Evaluation for Early stopping
def eval_function(engine,batch):
    """Predict sentence classes for ``batch`` without tracking gradients.

    Args:
        engine: the ignite engine calling this step (not used here).
        batch: ``(sent, sent_label)`` as produced by the data loader, where
            ``sent`` holds 'input_ids' and 'attention_mask' tensors.

    Returns:
        ``(sent_pred, sent_label)``: flattened predicted class indices and
        flattened labels.
    """
    model.eval()

    with torch.no_grad():
        sent, sent_label = batch
        sent_label = sent_label.to(device)

        # Presumably each tensor arrives as (batch, 1, seq_len) -- TODO
        # confirm. squeeze(1) removes only that axis; a bare squeeze() would
        # also drop the batch axis whenever a batch holds a single sample.
        input_ids = sent['input_ids'].squeeze(1).to(device)
        attention_mask = sent['attention_mask'].squeeze(1).to(device)

        _, sent_pred = model(input_ids, attention_mask)

        sent_pred = torch.argmax(sent_pred, 1).flatten()
        sent_label = sent_label.flatten()

        return sent_pred, sent_label
            


In [None]:
# Wrap the two step functions in ignite engines: one for training, one for validation.
trainer = Engine(process_function)
validation_evaluator = Engine(eval_function)

In [None]:
# Expose a running average of the trainer's output (the step loss) under the metric name 'loss'.
RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')

In [None]:
def thresholded_output_transform_1(output):
    """Unpack a two-element evaluator output and return it as a tuple.

    Despite the name, no thresholding is applied; the pair is passed through
    unchanged.
    """
    predictions, targets = output
    return (predictions, targets)

In [None]:
# Attach precision and recall of the sentence-level predictions to the validation engine.
Precision(output_transform=thresholded_output_transform_1).attach(validation_evaluator, 'sen_precision')
Recall(output_transform=thresholded_output_transform_1).attach(validation_evaluator, 'sen_recall')

In [None]:
def score_function(engine):
    """Return the engine's 'sen_precision' metric, used as the early-stopping score."""
    return engine.state.metrics['sen_precision']
# Early stopping with patience 5, scored by score_function and checked each
# time the validation engine completes a run; it stops `trainer`.
handler = EarlyStopping(patience=5, score_function=score_function, trainer=trainer)
validation_evaluator.add_event_handler(Events.COMPLETED, handler)

In [None]:
@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_results(engine):
    """Run validation after each training epoch and report precision/recall.

    Args:
        engine: the training engine; only its current epoch number is read.
    """
    validation_evaluator.run(validation_loader)
    results = validation_evaluator.state.metrics
    sen_precision = results['sen_precision']
    sen_recall = results['sen_recall']
    print(sen_precision, sen_recall)
    # pbar is created in a later cell; it exists by the time training runs.
    pbar.log_message(
        "Validation Results - Epoch: {}  Sen_precision: {:.2f} Sen_Recall : {:.2f} ".format(
            engine.state.epoch, sen_precision, sen_recall))
    pbar.n = pbar.last_print_n = 0

# The decorator above already registers the handler. Registering it again with
# trainer.add_event_handler made validation run twice per epoch, which also
# advanced the early-stopping check twice per epoch.

In [None]:
# Progress bar for the training engine that also shows the running 'loss' metric.
pbar = ProgressBar(persist=True, bar_format="")
pbar.attach(trainer, ['loss'])

In [None]:
# NOTE(review): same statements as the previous cell; running both attaches a
# second progress bar to the trainer -- confirm whether this cell is needed.
pbar = ProgressBar(persist=True, bar_format="")
pbar.attach(trainer, ['loss'])

In [None]:
# Train for at most 100 epochs; the early-stopping handler may end the run sooner.
trainer.run(train_loader, max_epochs=100)

In [None]:
# One more pass over the validation set with the model as it stands after training.
validation_evaluator.run(validation_loader)