In [None]:
#importing required libraries

import numpy as np
import pandas as pd
import tarfile
import os
import gc
import random

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score


import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D,Flatten
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping, Callback, ReduceLROnPlateau

# pd.set_option('display.max_colwidth', -1)
# pd.set_option('display.max_rows',5000)
# from tqdm.autonotebook import tqdm

import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install transformers



In [None]:
#creating bert from transformers and loading pretrained weigts

import transformers as trf
#loading the model
bert = trf.TFBertModel.from_pretrained('bert-base-uncased')

#creating tokenizer
tokenizer = trf.BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
                                            #  ,padding=True, truncation=True, return_tensors="pt", max_length = 100)

In [None]:
#Function to create sentence embedding

def bert_embedding(txt):
  idx = tokenizer.encode(txt) #creating tokens
  idx = np.array(idx)[None,:] #converting 2d array

  emb = bert(idx) #bert layer
  hidden = np.array(emb[0][0]) #batch output of last_hidden_state

  sent_emb = hidden.mean(0) # creating mean vector
  return sent_emb

In [131]:
num_sentences = 1000

training_data = pd.read_excel('train.xlsx', nrows=num_sentences)

In [132]:
print(training_data.shape)
print(training_data['class_name'].value_counts())

(1000, 3)
class_name
Backend          535
Frontend         426
Security          21
Documentation     13
Performance        5
Name: count, dtype: int64


In [133]:
sent_matrix = np.array([bert_embedding(text) for text in training_data['report']])
sent_matrix.shape

(1000, 768)

In [134]:
Frontend = ['Dropdown menu fails to appear when clicking on user profile.',
             'Button alignment issue on mobile devices']

Backend = ['Server returns 500 error when processing large data sets.',
          'Database query returns inconsistent results for user authentication.']

Security = ['SQL injection vulnerability found in login form.',
            'Unencrypted user passwords stored in database.']

Documentation = [ 'Incorrect syntax example in API documentation for file uploads.',
                  'Missing explanation of required parameters in endpoint documentation.']

Performance = ["Application freezes for several seconds when loading large datasets.",
                "High memory usage during concurrent user sessions"]

In [135]:
#Creating BERT embedding for all these custom categories

Frontend_emb = np.array([bert_embedding(t) for t in Frontend])
print(Frontend_emb)
print(Frontend_emb.shape)

[[-0.42485604 -0.47651026  0.0335856  ... -0.15561378 -0.13877963
  -0.10416824]
 [-0.0801431   0.12522407  0.12831889 ... -0.56134814 -0.43804258
  -0.06160465]]
(2, 768)


In [136]:
### importing cosine similarity

from sklearn.metrics.pairwise import cosine_similarity

In [137]:
### checking the cosine similarity between the given sentences for each category

for d in [Frontend,Backend,Security,Documentation,Performance]:
  d_emb = np.array([bert_embedding(t) for t in d])
  print('Cosine similarity of given samples: {}'.format(cosine_similarity(d_emb[0][None,:],d_emb[1][None,:])))
  # cosine scores are > 0.5, hence we have good examples per category. We can use them to create our custom labeled data set.

Cosine similarity of given samples: [[0.6053216]]
Cosine similarity of given samples: [[0.87106407]]
Cosine similarity of given samples: [[0.7931422]]
Cosine similarity of given samples: [[0.89904976]]
Cosine similarity of given samples: [[0.7773626]]


In [138]:
# creatin BERT embedding for all categories

Frontend_emb = np.array([bert_embedding(t) for t in Frontend])
Backend_emb = np.array([bert_embedding(t) for t in Backend])
Security_emb = np.array([bert_embedding(t) for t in Security])
Documentation_emb = np.array([bert_embedding(t) for t in Documentation])
Performance_emb = np.array([bert_embedding(t) for t in Performance])

In [139]:
#creating a df to store cosine scores

cosine_score = pd.DataFrame(columns=['id','Frontend','Backend','Security','Documentation','Performance'])
cosine_score['id'] = range(len(sent_matrix))
cosine_score['Frontend'] = cosine_similarity(sent_matrix,Frontend_emb.mean(0)[None,:])
cosine_score['Backend'] = cosine_similarity(sent_matrix,Backend_emb.mean(0)[None,:])
cosine_score['Security'] = cosine_similarity(sent_matrix,Security_emb.mean(0)[None,:])
cosine_score['Documentation'] = cosine_similarity(sent_matrix,Documentation_emb.mean(0)[None,:])
cosine_score['Performance'] = cosine_similarity(sent_matrix,Performance_emb.mean(0)[None,:])

cosine_score.head()

Unnamed: 0,id,Frontend,Backend,Security,Documentation,Performance
0,0,0.700643,0.627749,0.707164,0.688959,0.681867
1,1,0.722607,0.749043,0.747451,0.747848,0.738706
2,2,0.593143,0.706673,0.641489,0.71549,0.660776
3,3,0.72416,0.762576,0.770574,0.70413,0.714056
4,4,0.690119,0.83107,0.800282,0.816814,0.785711


In [140]:
#creating labels according to the highest cosine_score

cosine_score['label'] = cosine_score[['Frontend','Backend','Security','Documentation','Performance']].idxmax(axis=1) #finding the column which has maximum value and retunrning the column name (this becomes the label for the text)
label_df = cosine_score

del cosine_score #deleting as we dont need this df anymore
gc.collect()

label_df.head()

Unnamed: 0,id,Frontend,Backend,Security,Documentation,Performance,label
0,0,0.700643,0.627749,0.707164,0.688959,0.681867,Security
1,1,0.722607,0.749043,0.747451,0.747848,0.738706,Backend
2,2,0.593143,0.706673,0.641489,0.71549,0.660776,Documentation
3,3,0.72416,0.762576,0.770574,0.70413,0.714056,Security
4,4,0.690119,0.83107,0.800282,0.816814,0.785711,Backend


In [141]:
label_df['report'] = training_data['report'][0:num_sentences] #Earlier we have defined num_sentences
# label_df.drop(['realistic','investigative','artistic','social','enterprising','conventional'],axis=1,inplace =True)
label_df = label_df[['id','report','label']]
label_df.head(20)

Unnamed: 0,id,report,label
0,0,"""For any event on my bookmarked projects"" opti...",Security
1,1,Switch to using full l10n id's in urlbar,Backend
2,2,Consider removing hasicon property to simplify...,Documentation
3,3,Method to obtain current URL from WebBrowserEd...,Security
4,4,Fix: migration fails in MS SQL-Server,Backend
5,5,Searching for issue number with REST API redir...,Frontend
6,6,[Viewers] [JFace] Use exceptions instead of as...,Documentation
7,7,[10] Provide isDenotable API on ITypeBinding t...,Documentation
8,8,JFace Action should support drop-down AND chec...,Documentation
9,9,Can not preview video,Frontend


In [142]:
label_df.to_csv('output.csv', index=False)

In [143]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Read true output and predicted output CSV files into pandas DataFrames
true_output_df = pd.read_excel('train.xlsx', nrows=num_sentences)
predicted_output_df = pd.read_csv('output.csv')

# Merge the DataFrames based on the 'report' column
merged_df = pd.merge(true_output_df, predicted_output_df, on='report', suffixes=('_true', '_predicted'))

# Extract true labels and predicted labels
true_labels = merged_df['label_true']
predicted_labels = merged_df['label_predicted']

# Calculate accuracy metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

# Display the calculated metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)



Accuracy: 0.1873822975517891
Precision: 0.586211144594041
Recall: 0.1873822975517891
F1-score: 0.25193555954594704
