In [14]:
"""Transformers Test (working, with improvements).ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1vwQneJX92WYgVynrmhDm9RINVzKDtSwa
"""

'Transformers Test (working, with improvements).ipynb\n\nAutomatically generated by Colab.\n\nOriginal file is located at\n    https://colab.research.google.com/drive/1vwQneJX92WYgVynrmhDm9RINVzKDtSwa\n'

In [15]:
# Install required packages
!pip install transformers scikit-learn nltk sentence-transformers

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import pandas as pd
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer, util
import torch

In [32]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Google Drive folder that holds the project's working data
folder_path = '/content/drive/My Drive/Additional Education/NJIT/Classes/DS677-850 Deep Learning/Project/Work Data'

# Excel workbook with the labeled synthetic records (questions and answers)
synthetic_data_file_path = f'{folder_path}/labeled_synthetic_data (specific questions and answers).xlsx'

# Read the workbook into a DataFrame
data = pd.read_excel(synthetic_data_file_path)

# Optional (currently disabled): parse the 'CreatedOn' column as dates
# data['CreatedOn'] = pd.to_datetime(data['CreatedOn'], format='%m/%d/%Y')


In [33]:
# Keep only the rows that have a LongText value. The result is rebound to `data`
# rather than using inplace=True, so the step is an explicit assignment.
data = data.dropna(subset=['LongText'], how='all')

In [34]:
# Display the DataFrame after rows without LongText have been dropped
data

Unnamed: 0.1,Unnamed: 0,CreatedOn,Notification,OrderNum,ShortText,FLOC,LongText,Question,Answer
0,0,2019-03-15,30761040,90231415,Abrupt decrease in 18B Pump pressure,S4CW -4CWE7,* **************Incident Description**********...,What immediate actions were recommended follow...,Isolate 18B Pump and investigate for internal ...
1,1,2020-11-18,30958076,80230196,Reduced output from 5B Generator,S5GN -5GNE2-GENR,* **************Incident Description**********...,What was found as the cause of reduced perform...,The advanced diagnostics found loose connectio...
2,2,2018-09-15,30751256,80212345,Rapid frequency change in Pump 18A,S4CW -4CWE5,* **************Incident Description**********...,What was the immediate action recommended afte...,Immediate action is to isolate and shut down p...
3,3,2019-11-17,30958230,80523058,Unusual noise from 3B Pump motor,S5CW -5CWE2,* **************Incident Description**********...,What was the immediate action recommended afte...,Inspect filtration systems and remove blockage...
4,4,2016-09-15,30751904,80225619,Fuse M87 tripped,S4EL -4EL1746,* **************Incident Description**********...,What was the result of the electrical analysis...,The electrical analysis of M87 circuit was con...
...,...,...,...,...,...,...,...,...,...
477,477,2018-12-27,40732176,90234712,Overheating in Converter G4,S4AC -4ACE7-TG4,* **************Incident Description**********...,What was the initial response to the overheati...,Emergency protocol S4.OP-AB.AC-0002 was initia...
478,478,2020-12-12,30991234,80298250,18A Motor Overheating observed,S4CW -4CWE2,* **************Incident Description**********...,What were the immediate actions and recommende...,Immediate inspection and necessary maintenance...
479,479,2019-04-30,30987450,80239876,Unusual vibration in 18B Blower,S5BL -3BLE6,* **************Incident Description**********...,What measures were taken to correct the issue ...,Blower has been inspected and the misalignment...
480,480,2021-09-15,30985240,80589752,Abnormal vibration in Fan 5A,S5AI -5AIE3,* **************Incident Description**********...,What was the cause of the abnormal vibrations ...,The abnormal vibrations were due to a misalign...


In [35]:
# Display the LongText column, which is the text later passed to the QA model as context
data["LongText"]

0      * **************Incident Description**********...
1      * **************Incident Description**********...
2      * **************Incident Description**********...
3      * **************Incident Description**********...
4      * **************Incident Description**********...
                             ...                        
477    * **************Incident Description**********...
478    * **************Incident Description**********...
479    * **************Incident Description**********...
480    * **************Incident Description**********...
481    * **************Incident Description**********...
Name: LongText, Length: 482, dtype: object

In [36]:
# Pick the GPU when one is available, otherwise run on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Drive directory where the fine-tuned model and its tokenizer were saved
model_path = "/content/drive/My Drive/Additional Education/NJIT/Classes/DS677-850 Deep Learning/Project/Work Data/fine_tuned_bert_model"

# Restore the tokenizer and the QA model from the locally saved fine-tuned checkpoint;
# the model is moved onto the selected device as it is loaded
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForQuestionAnswering.from_pretrained(model_path).to(device)

# Alternative (disabled): start from the fresh pretrained SQuAD checkpoint instead
#tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
#model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

# Sentence-embedding model used to rank contexts against a question.
# The trailing .to(device) moves it to the device and is the value the cell displays.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
embed_model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [37]:
# Embed the question in the sentence-embedding space,
# then find the entries whose long text has the highest cosine similarity to it in that space.
# Return the top k entries ranked by that similarity.

def find_relevant_context(question, df, top_k=3):
    """
    Uses semantic search to find the most relevant contexts.

    The question and every 'LongText' entry are embedded with ``embed_model``;
    rows are ranked by cosine similarity between the two embeddings.

    Args:
        question: The question text to search for.
        df: DataFrame with a 'LongText' column holding the candidate contexts.
            It is not modified (no embedding column is written back to it).
        top_k: Number of best-matching rows to return.

    Returns:
        The ``top_k`` rows of ``df`` in descending order of similarity.
        An empty selection of ``df`` is returned when ``df`` has no rows.
    """
    # Nothing to rank; avoid asking the encoder to embed an empty batch.
    if df.empty:
        return df.iloc[[]]

    question_embedding = embed_model.encode(question, convert_to_tensor=True).to(device)

    # Encode all contexts in one batched call instead of one encode() per row,
    # and keep the embeddings local so the caller's DataFrame is left untouched.
    contexts = df['LongText'].astype(str).tolist()
    context_embeddings = embed_model.encode(contexts, convert_to_tensor=True).to(device)

    # A single (1, n_rows) similarity matrix; row 0 holds the score of every context.
    similarities = util.pytorch_cos_sim(question_embedding, context_embeddings)[0].tolist()

    # Stable sort: rows with equal scores keep their original relative order.
    top_indices = sorted(range(len(similarities)), key=similarities.__getitem__, reverse=True)[:top_k]
    return df.iloc[top_indices]

In [38]:
def answer_question(question, context):
    """
    Extract an answer span for ``question`` from ``context`` with the QA model.

    The question/context pair is tokenized (truncated to 512 tokens), run
    through ``model`` without gradient tracking, and the span between the
    highest-scoring start position and the highest-scoring end position is
    decoded back to text.

    Args:
        question: The question text.
        context: The passage to search for the answer.

    Returns:
        A tuple ``(answer, answer_start, answer_end, input_ids)``:
        the decoded answer with "[CLS]"/"[SEP]" removed and whitespace stripped,
        the start token index (tensor), the exclusive end token index (tensor),
        and the token ids of the encoded pair.
    """
    encoded = tokenizer.encode_plus(
        question,
        context,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    # Move every input tensor to the device the model lives on
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        prediction = model(**encoded)

    token_ids = encoded['input_ids'][0]
    answer_start = torch.argmax(prediction.start_logits)
    # +1 turns the inclusive end position into an exclusive slice bound
    answer_end = torch.argmax(prediction.end_logits) + 1

    span_tokens = tokenizer.convert_ids_to_tokens(token_ids[answer_start:answer_end])
    answer = tokenizer.convert_tokens_to_string(span_tokens)
    for special_token in ("[CLS]", "[SEP]"):
        answer = answer.replace(special_token, "")
    return answer.strip(), answer_start, answer_end, token_ids

# Clean way to show a processed record
def process_questions(data, question):
    """
    Answer a question against its most relevant records, one compact line per record.

    Args:
        data: DataFrame with 'LongText', 'Notification', 'ShortText' and
            'CreatedOn' columns.
        question: The question text.

    Returns:
        A list of strings formatted as
        "Not. <notification>, <short text> (<created on>) - <answer>".

    Note:
        A later cell in this notebook defines process_questions again; once
        that cell has run, its definition replaces this one.
    """
    results = []
    relevant_contexts = find_relevant_context(question, data)
    for _, row in relevant_contexts.iterrows():
        context = row['LongText']
        notification_id = row['Notification']
        short_text = row['ShortText']
        created_on = row['CreatedOn']
        # answer_question returns (answer, start, end, input_ids); only the text
        # is wanted here. Using the whole tuple would put its repr in the output
        # and make the emptiness check below always false.
        answer = answer_question(question, context)[0]
        if not answer:
            answer = "No specific answer found."
        formatted_answer = f"Not. {notification_id}, {short_text} ({created_on}) - {answer}"
        results.append(formatted_answer)
    return results


In [39]:
def process_questions(data, question):
    """
    Answer a question against its most relevant records, with the answer highlighted.

    For each record returned by ``find_relevant_context`` the answer is
    extracted with ``answer_question``. When the answer text occurs in the
    record's context (case-insensitive match), that occurrence is wrapped in
    ANSI bold-red escape codes; otherwise the context is shown unchanged.

    Args:
        data: DataFrame with 'LongText', 'Notification', 'ShortText' and
            'CreatedOn' columns.
        question: The question text.

    Returns:
        A list of multi-line strings, one per record, containing the
        notification, short text, creation date, question, context and answer.
    """
    results = []
    relevant_contexts = find_relevant_context(question, data)
    for _, row in relevant_contexts.iterrows():
        context = row['LongText']
        notification_id = row['Notification']
        short_text = row['ShortText']
        created_on = row['CreatedOn']
        # Only the answer text is used; the token positions and ids are not needed here.
        answer = answer_question(question, context)[0]

        # Locate the answer in the original context. An empty answer is treated
        # as "not found" rather than matching at position 0.
        start_index = context.lower().find(answer.lower()) if answer else -1
        if not answer:
            answer = "No specific answer found."

        if start_index >= 0:
            # Highlight the answer within the original context
            end_index = start_index + len(answer)
            highlighted_context = context[:start_index] + f"\033[1;31m{context[start_index:end_index]}\033[0m" + context[end_index:]
        else:
            # find() returned -1: slicing with it would cut and reorder the
            # context, so leave the context as it is.
            highlighted_context = context

        formatted_result = f"Notification: {notification_id}\nShort Text: {short_text}\nCreated On: {created_on}\nQuestion: {question}\nContext: {highlighted_context}\nAnswer: {answer}\n"
        results.append(formatted_result)
    return results

In [40]:
# Example question
question = "what is the cause of the high vibrations on the condenser pump?"
all_results = process_questions(data, question)
all_results

["Notification: 30978512\nShort Text: Unexpected vibration in 34A pump\nCreated On: 2021-09-10\nQuestion: what is the cause of the high vibrations on the condenser pump?\nContext: * **************Incident Description************** * Notification 000030978512 Details * *********************************************************** * 09/10/2021 07:45:17 INCIDENT REPORT (INCIDENT REPORT) * Report submitted by: Emily Smith ENTITY\\INCIDENT: * * CONDITION DESCRIPTION/LOCATION (THE INAPPROPRIATE ACTION AND ITS NEGATIVE EFFECT/INCLUDE A DETAILED LOCATION DESCRIPTION): * While monitoring operations, abnormal vibrations were noted on  the 34A pump. Upon visual inspection, no visible factors like leaks or visible damage  were found that could cause such abnormality. Measurements taken showed an increase in vibration levels by 20% compared to normal operation. * * ACTIVITIES, PROCESSES, PROCEDURES INVOLVED: * Vibration monitoring of operating pumps, S4.OP.AB-CW-0012 *  * WHY DID CONDITION HAPPEN?: *

In [41]:
# Print each formatted result so its newlines and ANSI highlight codes are rendered
for result in all_results:
    print(result)

Notification: 30978512
Short Text: Unexpected vibration in 34A pump
Created On: 2021-09-10
Question: what is the cause of the high vibrations on the condenser pump?
Context: * **************Incident Description************** * Notification 000030978512 Details * *********************************************************** * 09/10/2021 07:45:17 INCIDENT REPORT (INCIDENT REPORT) * Report submitted by: Emily Smith ENTITY\INCIDENT: * * CONDITION DESCRIPTION/LOCATION (THE INAPPROPRIATE ACTION AND ITS NEGATIVE EFFECT/INCLUDE A DETAILED LOCATION DESCRIPTION): * While monitoring operations, abnormal vibrations were noted on  the 34A pump. Upon visual inspection, no visible factors like leaks or visible damage  were found that could cause such abnormality. Measurements taken showed an increase in vibration levels by 20% compared to normal operation. * * ACTIVITIES, PROCESSES, PROCEDURES INVOLVED: * Vibration monitoring of operating pumps, S4.OP.AB-CW-0012 *  * WHY DID CONDITION HAPPEN?: * The ex