In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
from sklearn.preprocessing import LabelEncoder

In [29]:
df = pd.read_csv('QAs_Category_SubCategory_Servicenow.csv')
df.head()

Unnamed: 0,Category,Subcategory,Question,Answer
0,Inquiry/Help,Antivirus,How to update the antivirus definition files m...,"To manually update antivirus definition files,..."
1,Inquiry/Help,Antivirus,What to do if the antivirus scan is taking too...,"If the antivirus scan is prolonged, check syst..."
2,Inquiry/Help,Antivirus,How to troubleshoot if the antivirus is blocki...,If the antivirus is blocking a legitimate appl...
3,Inquiry/Help,Antivirus,What steps to take if the antivirus is not upd...,If the antivirus fails to update automatically...
4,Inquiry/Help,Antivirus,How to perform a manual antivirus scan on a sp...,"To manually scan a specific folder or file, op..."


In [30]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.head()

Unnamed: 0,Category,Subcategory,Question,Answer
51,Hardware,CPU,What to do if the CPU fan is making unusual no...,"If the fan is noisy, check for obstructions, c..."
35,Software,Email,"What steps to take when encountering ""Email No...","If the software is not responding, close unnec..."
118,Network,DHCP,How to troubleshoot issues with devices not re...,"If lease releases fail, check device configura..."
60,Hardware,Disk,How to troubleshoot slow disk performance on t...,"If disk performance is slow, check for disk fr..."
161,Network,IP Address,What steps to take if devices are unable to ob...,"If address acquisition fails, verify DHCP serv..."


In [31]:
test_df.head()

Unnamed: 0,Category,Subcategory,Question,Answer
175,Network,VPN,What steps to take if VPN connections are bloc...,"If blocked, adjust firewall rules, review secu..."
180,Network,Oracle,How to troubleshoot connection issues to the O...,"If connection problems arise, check connection..."
111,Network,DHCP,What steps to take if devices are unable to ob...,"If address acquisition fails, verify DHCP serv..."
65,Hardware,Disk,How to troubleshoot issues with slow read/writ...,"If read/write speeds are slow, check for disk ..."
101,Hardware,Mouse,What steps to take if the mouse buttons are no...,"If mouse buttons are malfunctioning, clean the..."


In [32]:
def resample_data(df):
    categories = df['Category'].unique()
    resampled_df = pd.DataFrame()

    for category in categories:
        category_df = df[df['Category'] == category]
        category_resampled = resample(category_df, replace=True, n_samples=len(df['Category']), random_state=42)
        resampled_df = pd.concat([resampled_df, category_resampled])

    return resampled_df

train_df_resampled = resample_data(train_df)
train_df_resampled.head()

Unnamed: 0,Category,Subcategory,Question,Answer
52,Hardware,CPU,How to troubleshoot CPU performance issues?,"If facing performance problems, check for back..."
89,Hardware,Memory,What steps to take if the system experiences r...,"If reboots occur, run memory tests, check for ..."
81,Hardware,Memory,What steps to take if the system is not recogn...,"If memory recognition issues arise, reseat RAM..."
74,Hardware,Keyboard,How to troubleshoot issues with multimedia key...,"If multimedia keys do not work, check for driv..."
96,Hardware,Monitor,How to resolve issues with the monitor showing...,"If ""Out of Range"" messages appear, adjust reso..."


In [33]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_data(df, max_length=128):
    input_ids = []
    attention_masks = []

    for index, row in df.iterrows():
        question = row['Question']
        encoded_dict = tokenizer.encode_plus(
            question,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

train_input_ids, train_attention_masks = tokenize_data(train_df_resampled)
test_input_ids, test_attention_masks = tokenize_data(test_df)
train_input_ids, train_attention_masks, test_input_ids, test_attention_masks

(tensor([[ 101, 2129, 2000,  ...,    0,    0,    0],
         [ 101, 2054, 4084,  ...,    0,    0,    0],
         [ 101, 2054, 4084,  ...,    0,    0,    0],
         ...,
         [ 101, 2054, 2000,  ...,    0,    0,    0],
         [ 101, 2054, 4084,  ...,    0,    0,    0],
         [ 101, 2129, 2000,  ...,    0,    0,    0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[ 101, 2054, 4084,  ...,    0,    0,    0],
         [ 101, 2129, 2000,  ...,    0,    0,    0],
         [ 101, 2054, 4084,  ...,    0,    0,    0],
         ...,
         [ 101, 2054, 2000,  ...,    0,    0,    0],
         [ 101, 2129, 2000,  ...,    0,    0,    0],
         [ 101, 2129, 2000,  ...,    0,    0,    0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
 

In [34]:
label_encoder = LabelEncoder()

train_df_resampled['encoded_category'] = label_encoder.fit_transform(train_df_resampled['Category'])
test_df['encoded_category'] = label_encoder.transform(test_df['Category'])

train_dataset = TensorDataset(train_input_ids, train_attention_masks, torch.tensor(train_df_resampled['encoded_category'].values))
test_dataset = TensorDataset(test_input_ids, test_attention_masks, torch.tensor(test_df['encoded_category'].values))

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
epochs = 3

for epochs in range(epochs):
    model.train()

    for batch in train_dataloader:
        optimizer.zero_grad()

        inputs = batch[0].to(torch.int64)
        masks = batch[1].to(torch.int64)
        labels = batch[2].to(torch.int64) 

        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

model.eval()

with torch.no_grad():
    all_labels = []
    all_predictions = []

    for batch in test_dataloader:
        inputs = batch[0].to(torch.int64)
        masks = batch[1].to(torch.int64)
        labels = batch[2].to(torch.int64)

        outputs = model(inputs, attention_mask=masks)
        predictions = torch.argmax(outputs.logits, dim=1)

        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predictions.cpu().numpy())

print(classification_report(all_labels, all_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00         7
           3       1.00      1.00      1.00        13
           4       1.00      1.00      1.00         3

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38

