In [25]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from crf import CRF

In [26]:
import pandas as pd
import numpy as np

class NERDataset(Dataset):
    
    def __init__(self,dataframe,word_to_idx,tag_to_idx,max_length):
        
        self.sentences = []
        self.labels = []
        self.tag_to_idx = tag_to_idx
        self.data = dataframe

        
        grouped = self.data.groupby("id")
        
        for _,grouped in grouped:
            
            sentence = [word_to_idx[word] for word in grouped['word'].tolist()]
            labels_ = []
            for tag in grouped['tag'].tolist():
                labels_.append(tag_to_idx[tag])
                
            remaining_len = max_length - len(sentence)
            # print(remaining_len,max_length,len(sentence))
            for i in range(remaining_len):
                sentence.append(word_to_idx["<E>"])
                labels_.append(tag_to_idx["O"])
            self.sentences.append(sentence)
            # print(np.unique(self.sentences))
            self.labels.append(labels_)
        # print(np.unique([ len(sent) for sent in self.sentences]))
            
    def __len__(self):
        
        return len(self.sentences)
    
    def __getitem__(self,idx):
        
        return (self.sentences[idx],self.labels[idx])
    
    def collate_fn(self,batch):
        
        sentences,labels = zip(*batch)
        max_len = max(len(s) for s in sentences)
        
        padded_sentences = []
        padded_labels = []
        
        for sentence,label in zip(sentences,labels):
            
            padded_sentences.append(sentence+[0]*(max_len-len(sentence)))
            padded_labels.append(label +[0]*(max_len-len(label)))
            
        padded_sentences = torch.tensor(padded_sentences,dtype=torch.long)
        padded_labels = torch.tensor(padded_labels,dtype=torch.long)
        
        return (padded_sentences,padded_labels)
        

In [36]:
df = pd.read_csv("final_data_v1.csv",index_col=None)
unique_words = sorted(list(df['word'].unique()))

word_to_idx = {word:i for i,word in enumerate(unique_words)}
idx_to_word = {i:word for i,word in enumerate(unique_words)}

# for appending <E> : extra
word_to_idx["<E>"]=len(word_to_idx)
idx_to_word[len(word_to_idx)]="<E>"

group_counts = df.groupby('id').size()
# print(group_counts)
# max sentence length
max_sentence_len = group_counts.max()
print(max_sentence_len)

size = df.shape[0]
split = int(size*0.8)

tag_to_idx = {
    
    "O":0,
    "B-ro":1,
    "I-ro":2,
    "B-cm":3,
    "I-cm":4,
    "B-sk":5,
    "I-sk":6
}
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

train_dataset = NERDataset(df[:split],word_to_idx,tag_to_idx,max_sentence_len)
test_dataset = NERDataset(df[split:],word_to_idx,tag_to_idx,max_sentence_len)

train_dataloader = DataLoader(train_dataset,32,shuffle=True,collate_fn=train_dataset.collate_fn)
test_dataloader = DataLoader(test_dataset,32,shuffle=True,collate_fn=test_dataset.collate_fn)

553


In [37]:
df

Unnamed: 0.1,Unnamed: 0,id,word,tag
0,0,0,Title,O
1,1,0,:,O
2,2,0,ai,B-ro
3,3,0,researcher,I-ro
4,4,0,Company,O
...,...,...,...,...
9491471,9491471,29999,big,O
9491472,9491472,29999,?,O
9491473,9491473,29999,Apply,O
9491474,9491474,29999,today,O


In [27]:
# from TorchCRF import CRF


class BILSTM_CRF(nn.Module):
    def __init__(self,vocab_size,tagset_size,num_layers=2,embedding_dim=100,hidden_dim=128,verbose=False,device='cuda'):
        super(BILSTM_CRF,self).__init__()
        
        self.verbose = verbose
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tagset_size = tagset_size
        self.num_layers = num_layers
        
        self.embedding_layer = nn.Embedding(vocab_size,embedding_dim)
        self.lstm = nn.LSTM(embedding_dim,hidden_dim//2,num_layers=1,bidirectional=True)
        
        self.hidden2tag = nn.Linear(hidden_dim,tagset_size)
        if device=='cuda':
            flag=True
        else:
            flag=False
            
        self.crf = CRF(tagset_size,use_gpu=flag)
        
        
    def forward(self,sentences,tags=None):
        
        embeds = self.embedding_layer(sentences)
        if self.verbose:
            print(f"embedding layer shape: {embeds.shape}")
            
        mask = sentences != 0  # Assuming padding index is zero
        
        batch_size = sentences.size(1)
        h_0 = torch.zeros(self.num_layers,batch_size,self.hidden_dim//2).to(sentences.device)
        c_0 = torch.zeros(self.num_layers,batch_size,self.hidden_dim//2).to(sentences.device)
        
        lstm_out,_ = self.lstm(embeds,(h_0,c_0))
        
        if self.verbose:
            print(f"lstm layer shape: {lstm_out.shape}")
            
        emissions = self.hidden2tag(lstm_out)
        if self.verbose:
            print(f"emissions layer shape: {emissions.shape}")
        
        if tags is not None:

            loss = -self.crf(emissions,tags,mask=mask.byte())
            tag_seq = self.crf.viterbi_decode(emissions,mask=mask.byte())
            
            if self.verbose:
                print(f"crf emission layer shape: {loss.shape}")
            return (loss,tag_seq)
        else:
            tag_seq = self.crf.viterbi_decode(emissions,mask=mask.byte())
            if self.verbose:
                print(f"crf emission layer shape: {len(tag_seq)}")
            return tag_seq

In [16]:
vocab_size = len(word_to_idx)
taget_size = len(tag_to_idx)
embedding_dim = 100
hidden_dim = 128
num_layers = 2

model = BILSTM_CRF(vocab_size,taget_size,num_layers,embedding_dim,hidden_dim)

# sentence,label = next(iter(train_dataloader))
# # sentence
# res = model(sentence[6].unsqueeze(0))

# decoded_sentence = ""
# decoded_labels = ""

# for idx in sentence[6].numpy():
#     decoded_sentence+=idx_to_word[idx]+" "
    
# for idx in res[0]:
#     decoded_labels+=idx_to_tag[idx]+" "
    
# print(decoded_sentence)
# print(decoded_labels)

In [17]:
def train_step(model,train_dataloader,optimizer,device='cuda'):
    model.train()
    train_loss=0
    predictions = []
    ground_truth = []
    total_batches = len(train_dataloader)
    for batch,(sentence,labels) in enumerate(train_dataloader):
        # print(sentence.shape)
        sentence = sentence.to(device)
        labels = labels.to(device)

        loss,output = model(sentence,labels)
        predictions.append(output)
        ground_truth.append(labels.tolist())
#         train_loss+=loss.mean().item()
        optimizer.zero_grad()
        loss = loss.mean()
        loss.backward()
        optimizer.step()

        if(batch+1)%50 == 0:
            print(f"  Batch {batch+1} / {total_batches}; Loss: {loss.item():.4f}")
#     print(predictions[0],ground_truth)
    train_acc = compute_accuracy(predictions,ground_truth)
    
    return train_acc,train_loss

# def compute_accuracy(predictions, ground_truth):
#     predictions = np.array(predictions[0])
#     ground_truth = np.array(ground_truth[0])

#     correct = predictions == ground_truth
#     average_accuracy = np.mean(correct)

#     return average_accuracy


                
# def compute_accuracy(predictions, ground_truth):
#     correct = 0
#     total_acc = 0
#     total = len(ground_truth[0])
#     print(ground_truth[0])
#     for preds, targets in zip(predictions, ground_truth):
# #         print(preds,targets)
#         temp_total= len(targets[0])
#         temp_acc = 0
#         correct=0
#         for pred, target in zip(preds[0], targets[0]):
#             if pred == target:
#                 correct += 1
        
#         total_acc += (correct/temp_total)
# #         print(total_acc,correct,temp_total)
# #     print(total)
#     return total_acc / total

def compute_accuracy(predictions, ground_truth):
    # Ensuring all elements are numpy arrays of consistent shape
    accuracies = []
    for pred, ground in zip(predictions, ground_truth):
        pred = np.array(pred[0])
        ground = np.array(ground[0])
        if pred.shape == ground.shape:
            # Perform element-wise comparison and calculate accuracy
            correct = np.sum(pred == ground)
            accuracy = correct / len(pred)
            accuracies.append(accuracy)
        else:
            # print(pred.shape,ground.shape)
            # print("error")
            pass

    average_accuracy = np.mean(accuracies) if accuracies else 0
    return average_accuracy
                
                
def test_step(model,test_dataloader,optimizer,device='cuda'):
    
    model.eval()
    total_batches = len(test_dataloader)
    
    predictions = []
    ground_truth = []
    
    with torch.no_grad():
        for sentence,labels in test_dataloader: #for each batch

            sentence = sentence.to(device)
            ground_truth.append(labels.tolist())
            labels = labels.to(device)

            output = model(sentence)
            
            predictions.append(output)
    
    acc = compute_accuracy(predictions,ground_truth)
            
#     print(f"   Test Accuracy: {acc*100:.2f}")
    return acc
    
    
def train(model,train_dataloader,test_dataloader,optimizer,epochs,device='cuda'):
    
    results = {"train_loss": [],
        "train_acc": [],
        "test_loss": [],
        "test_acc": []
    }
    
            
    for epoch in range(epochs):
        print(f"Epoch {epoch+1} :")
        train_acc,train_loss = train_step(model,train_dataloader,optimizer,device=device)
        test_acc = test_step(model,test_dataloader,optimizer,device=device)
        
        results["train_acc"] = train_acc
        results["train_loss"]=train_loss
        results["test_acc"] = test_acc
        #| Train Loss: {train_loss*100:.2f}%
        print(f"Epoch: {epoch+1} |  Train Acc: {train_acc*100:.2f}% | Test Acc: {test_acc*100:.2f}%")
        print("---------------------------------------------------------------------------------------")
        
    return results
    

In [23]:
torch.manual_seed(42) 
torch.cuda.manual_seed(42)
vocab_size = len(word_to_idx)
taget_size = len(tag_to_idx)
embedding_dim = 250
hidden_dim = 164
num_layers = 2
epochs = 10

device = "cuda" if torch.cuda.is_available() else 'cpu'
print(f"Training on {device}")

model = BILSTM_CRF(vocab_size,taget_size,num_layers,embedding_dim,hidden_dim).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)


from timeit import default_timer as timer 
start_time = timer()

model_results = train(model,train_dataloader,test_dataloader,optimizer,epochs,device)

end_time = timer()
print(f"Total training time: {end_time-start_time:.3f} seconds")


Training on cuda
Epoch 1 :
  Batch 50 / 752; Loss: 157.5845
  Batch 100 / 752; Loss: 94.4996
  Batch 150 / 752; Loss: 58.9334
  Batch 200 / 752; Loss: 48.0856
  Batch 250 / 752; Loss: 30.4945
  Batch 300 / 752; Loss: 21.0713
  Batch 350 / 752; Loss: 16.2867
  Batch 400 / 752; Loss: 19.9682
  Batch 450 / 752; Loss: 13.6588
  Batch 500 / 752; Loss: 14.0016
  Batch 550 / 752; Loss: 14.2538
  Batch 600 / 752; Loss: 13.5047
  Batch 650 / 752; Loss: 12.0423
  Batch 700 / 752; Loss: 10.8470
  Batch 750 / 752; Loss: 11.2211
Epoch: 1 |  Train Acc: 97.72% | Test Acc: 99.25%
---------------------------------------------------------------------------------------
Epoch 2 :
  Batch 50 / 752; Loss: 11.2557
  Batch 100 / 752; Loss: 8.6513
  Batch 150 / 752; Loss: 10.2185
  Batch 200 / 752; Loss: 10.0833
  Batch 250 / 752; Loss: 8.8912
  Batch 300 / 752; Loss: 8.8662
  Batch 350 / 752; Loss: 7.3540
  Batch 400 / 752; Loss: 11.4080
  Batch 450 / 752; Loss: 9.8568
  Batch 500 / 752; Loss: 7.6368
  Batch 

In [24]:
torch.save(model.state_dict(),"model_0.pt")

In [30]:
vocab_size = len(word_to_idx)
taget_size = len(tag_to_idx)
embedding_dim = 250
hidden_dim = 164
num_layers = 2
device = 'cuda'

model = BILSTM_CRF(vocab_size,taget_size,num_layers,embedding_dim,hidden_dim).to(device)

model.load_state_dict(torch.load("model_0.pt"))

sentence,labelss = next(iter(test_dataloader))
# sentence

model.eval()
with torch.no_grad():
    print(sentence[6].unsqueeze(0).shape)
    res = model(sentence[6].unsqueeze(0).to('cuda'))

    decoded_sentence = []
    decoded_labels = []
    original_labels = []
    sent = sentence[6].numpy()
    labelss = labelss[6].numpy()
    
    for idx in range(len(sent)):
        try:
            decoded_sentence.append(idx_to_word[sent[idx]])
            original_labels.append(idx_to_tag[labelss[idx]])
        except:
            pass
        # print(decoded_sentence)

    for idx in res[0]:
        decoded_labels.append(idx_to_tag[idx])
        
    for word,label,org_label in zip(decoded_sentence,decoded_labels,original_labels):
        
        if label!='O':
            
            print(word,label,org_label)

    # print(decoded_sentence)
    # print(decoded_labels)

torch.Size([1, 553])
support B-ro B-ro
engineer I-ro I-ro
tesla B-cm B-cm
tesla B-cm B-cm
tesla B-cm B-cm
support B-ro B-ro
engineer I-ro I-ro
dynamodb B-sk B-sk
support B-ro B-ro
engineer I-ro I-ro
Firebase B-sk B-sk
support B-ro B-ro
engineer I-ro I-ro
support B-ro B-ro
engineer I-ro I-ro
hypertable B-sk B-sk
support B-ro B-ro
engineer I-ro I-ro
multivariate B-sk B-sk
testing I-sk I-sk
Simulink B-sk B-sk
groovy B-sk B-sk
Microsoft B-sk B-sk
Dynamics I-sk I-sk
365 I-sk I-sk
support B-ro B-ro
engineer I-ro I-ro
support B-ro B-ro
engineer I-ro I-ro
support B-ro B-ro
engineer I-ro I-ro
support B-ro B-ro
engineer I-ro I-ro
tesla B-cm B-cm


In [31]:
def predict_labels(model,sentence):
    
    model.eval()
    with torch.no_grad():
        input_words = []
        
        for word in sentence.split():
            
            try:
                input_words.append(word_to_idx[word])
            except:
                # print(word)
                pass
            
        # print(input_words)
        remaining_len = 473 - len(sentence)
        for i in range(remaining_len):
            input_words.append(word_to_idx["<E>"])
        org_sentence = []
        for idx in input_words:
            try:
                org_sentence.append(idx_to_word[idx])
            except:
                pass
        input_words = torch.tensor(input_words)
        # print(input_words.unsqueeze(0).shape)
        res = model(input_words.unsqueeze(0).to('cuda'))
        decoded_labels = []
        for idx in res[0]:
            decoded_labels.append(idx_to_tag[idx])
        
        for word,label in zip(org_sentence,decoded_labels):
            if label!='O':
                print(word,label)
                
        

In [35]:
sentence = '''
About the job
Are you looking to elevate your cyber career? Your technical skills? Your opportunity for growth? Deloitte's Government and Public Services Cyber Practice (GPS Cyber Practice) is the place for you! Our GPS Cyber Practice helps organizations create a cyber minded culture and become stronger, faster, and more innovative. You will become part of a team that advises, implements, and manages solutions across five verticals: Strategy, Defense and Response; Identity; Infrastructure; Data; and Application Security. Our dynamic team offers opportunities to work with cutting-edge cyber security tools and grow both vertically and horizontally at an accelerated rate. Join our cyber team and elevate your career.

Work you'll do

Develop AI-based tools that drive efficiencies in service delivery.
Collaborate with a cloud development team for model integration.

The team

Deloitte's Government and Public Services (GPS) practice - our people, ideas, technology, and outcomes-is designed for impact. Serving federal, state, & local government clients as well as public higher education institutions, our team of more than 15,000 professionals brings fresh perspective to help clients anticipate disruption, reimagine the possible, and fulfill their mission promise

At Deloitte, we believe cyber is about starting things-not stopping them-and enabling the freedom to create a more secure future. Cyber Infrastructure is focused on rethinking how security is integrated across modernized infrastructure as cyber threats become more complex. If you're seeking a career implementing, architecting, and-in select cases-handling next generation controls to manage security risks and exposure, then the Cyber Infrastructure team at Deloitte is for you.

Qualifications

Required:

Bachelor's degree required.
1 year plus experience in python development.
1 year plus experience in Machine Learning experience, irrespective of the programming language
1 year plus experience or college course level learning with Python Machine Learning libraries, especially pytorch. TensorFlow and/or Keras experience are also relevant.
Must be legally authorized to work in the United States without the need for employer sponsorship, now or at any time in the future.
Must be able to obtain and maintain the required clearance for this role.
Ability to travel 0-15%, on average, based on the work you do and the clients and industries/sectors you serve.

Preferred Requirements

6 months plus of mathematics that power machine learning models, including Statistics, Probability, Linear Algebra, Calculus
6 months plus experience developing in a cloud native machine learning service such as Google Cloud Vertex AI or AWS SageMaker
6 months plus experience deploying AI/ML models in docker, cloud environments, or API.
6 months plus experience using GitHub for version control and source code management.
Prior professional services or federal consulting experience

Information for applicants with a need for accommodation: https://www2.deloitte.com/us/en/pages/careers/articles/join-deloitte-assistance-for-disabled-applicants.html
'''

predict_labels(model,sentence)

cyber B-ro
security I-ro
service B-cm
cloud B-ro
model B-sk
public B-sk
secure B-sk
across B-sk
infrastructure I-sk
python B-sk
the B-sk
programming I-sk
language I-sk
TensorFlow B-sk
Keras I-sk
machine B-sk
learning I-sk
cloud B-ro
machine B-sk
learning I-sk
service I-sk
Google B-sk
Cloud I-sk
cloud B-ro
control B-sk


In [22]:
print(len(unique_words))

1647
