## Importing all important libraries

In [2]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
import re
import numpy as np
from transformers import pipeline, T5ForConditionalGeneration, T5Tokenizer
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor


## Step 1: Cleaning the dataset

In [None]:
## Extracting content from def_text
# Field labels looked for inside the free-text ``def_text`` value, in output order.
_DEF_TEXT_FIELDS = (
    'PscInspectionId', 'Deficiency/Finding', 'Description Overview', 'Immediate Causes',
    'Root Cause Analysis', 'Corrective Action', 'Preventive Action', 'Deficiency Code',
    'Detainable Deficiency',
)

# Compiled once instead of being rebuilt for every row.
# A value starts after "<label>: " and ends at the first newline that is
# followed by an uppercase ASCII letter, or at the end of the text.
_DEF_TEXT_PATTERN = re.compile(
    r"(" + "|".join(re.escape(col) for col in _DEF_TEXT_FIELDS) + r"): (.*?)(?=\n[A-Z]|$)",
    re.DOTALL,
)


def split_text(row):
    """Split the ``def_text`` value of one record into its labelled fields.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a ``'def_text'``
            entry holding "<label>: <value>" segments.

    Returns:
        A list with one stripped string per label in ``_DEF_TEXT_FIELDS``
        (same order).  Labels that do not occur yield ``""``; if a label
        occurs more than once, the last occurrence wins.  When ``def_text``
        is not a string (for example NaN from an empty CSV cell) every field
        is ``""`` instead of raising ``TypeError``.
    """
    string_data = row['def_text']

    result = dict.fromkeys(_DEF_TEXT_FIELDS, "")
    if isinstance(string_data, str):
        for key, value in _DEF_TEXT_PATTERN.findall(string_data):
            result[key] = value.strip()

    return [result[col] for col in _DEF_TEXT_FIELDS]

## Load the raw split and expand def_text into one column per labelled field
data_type = "train"
data = pd.read_csv(f"../../data/dataset/psc_severity_{data_type}.csv")
split_columns = [
    'PscInspectionId', 'Deficiency/Finding', 'Description Overview',
    'Immediate Causes', 'Root Cause Analysis', 'Corrective Action',
    'Preventive Action', 'Deficiency Code', 'Detainable Deficiency',
]
# split_text returns one list per row, ordered like split_columns; the lists
# are turned into a frame aligned on the original index and assigned to the
# columns named in split_columns (creating or overwriting them).
split_results = data.apply(split_text, axis=1)
data[split_columns] = pd.DataFrame(split_results.tolist(), index=data.index)

## Removing unnecessary columns
# Keep the record metadata plus every parsed field; 'PscInspectionId' is
# already the first metadata column, hence the [1:] slice.
clean_data = data[
    ['PscInspectionId', 'InspectionDate', 'VesselId', 'PscAuthorityId',
     'PortId', 'VesselGroup', 'age']
    + split_columns[1:]
]

## Saving the cleaned data
clean_data.to_csv(f"../../data/dataset/cleaned_{data_type}_data.csv", index=False)

## Step 2: Using Generative AI to get 'Second unbiased opinion'

In [None]:
data_type = "train"
# NOTE(review): Step 1 saves the cleaned file under "../../data/dataset/",
# while this cell reads from "../data/dataset/" - confirm which location is intended.
data = pd.read_csv(f"../data/dataset/cleaned_{data_type}_data.csv")
# Left-pad the deficiency code with zeros to a width of 5 characters.
data['Deficiency Code'] = data['Deficiency Code'].astype(str).str.zfill(5)

## Get unique inspection records
# Rows that share every key column collapse into one record; the remaining
# numeric columns are averaged and rounded to integers.
# numeric_only=True keeps non-numeric leftovers (such as InspectionDate, if it
# is stored as text) out of the mean: pandas >= 2.0 raises on them instead of
# silently dropping them.
# NOTE(review): groupby drops rows whose key columns contain NaN by default
# (dropna=True), so records with an empty text field may be excluded - verify.
grouped_data = (
    data.groupby([
        'Deficiency Code',
        'PscAuthorityId', 'VesselGroup', 'Deficiency/Finding',
        'Description Overview', 'Immediate Causes', 'Root Cause Analysis',
        'Corrective Action', 'Preventive Action',
        'Detainable Deficiency', "age", 'PortId',
    ])
    .mean(numeric_only=True)
    .round(0)
    .astype(int)
    .reset_index()
)

## Open-source LLM
model_name = "google/flan-t5-large"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# device=-1 runs the pipeline on the CPU.
generator = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1)


## Prompt Engineering to get second opinion
severity_labels = ['Low', 'Medium', 'High']
# Progress / tally counters updated by generate_severity_label via `global`.
processed = 1
low_count = 0
medium_count = 0
high_count = 0
total = grouped_data.shape[0]

def generate_severity_label(row, severity_labels, generator, max_new_tokens=10, temperature=0.8, top_k=50, top_p=0.95):
    """Ask the text-generation pipeline for a severity rating of one record.

    Builds a prompt from the record's fields, samples a completion from
    ``generator`` and extracts the first ``Low``/``Medium``/``High`` word
    (case-insensitive) found in it.  Progress and running label tallies are
    printed and kept in the module-level counters ``processed``,
    ``low_count``, ``medium_count`` and ``high_count``.

    Args:
        row: Mapping-like record providing the fields interpolated into the
            prompt (``'Deficiency Code'``, ``'PscAuthorityId'``, ...).
        severity_labels: Labels accepted as a valid answer.
        generator: Callable invoked as ``generator(prompt, **sampling_kwargs)``
            whose result is read as ``result[0]['generated_text']``.
        max_new_tokens: Forwarded to ``generator``.
        temperature: Forwarded to ``generator``.
        top_k: Forwarded to ``generator``.
        top_p: Forwarded to ``generator``.

    Returns:
        The extracted label, or ``'Medium'`` when no accepted label is found
        or when generation raises.  Every returned label, including the
        fallback on error, is reflected in the tally counters.
    """
    global processed
    global total
    global low_count
    global medium_count
    global high_count
    prompt = (
        "You are a Port State Control (PSC) inspector evaluating the severity of deficiencies. "
        "Based on the input parameters below, respond only with the severity rating: Low, Medium, or High. "
        "Do not include any additional text.\n\nThink step-by-step before providing the severity rating.\n\n"
        "Input Parameters:\n"
        f"- Deficiency Code: {row['Deficiency Code']}\n"
        f"- PSC Authority ID: {row['PscAuthorityId']}\n"
        f"- Port ID: {row['PortId']}\n"
        f"- Vessel Group: {row['VesselGroup']}\n"
        f"- Age: {row['age']}\n"
        f"- Deficiency/Finding: {row['Deficiency/Finding']}\n"
        f"- Description Overview: {row['Description Overview']}\n"
        f"- Immediate Causes: {row['Immediate Causes']}\n"
        f"- Root Cause Analysis: {row['Root Cause Analysis']}\n"
        f"- Corrective Action: {row['Corrective Action']}\n"
        f"- Preventive Action: {row['Preventive Action']}\n"
        f"- Detainable Deficiency: {row['Detainable Deficiency']}\n\n"
        "Severity Rating:"
    )
    print(f"Processing rows {processed}/{total}")
    processed += 1
    try:
        generated = generator(
            prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            do_sample=True
        )

        generated_text = generated[0]['generated_text'].strip()
        print(f"Generated Text: {generated_text}")

        # Extract the severity label using regex for reliability
        match = re.search(r'\b(Low|Medium|High)\b', generated_text, re.IGNORECASE)
        if match:
            # Normalise case so "HIGH"/"high" compare equal to "High".
            label = match.group(1).capitalize()
            if label in severity_labels:
                if label == "Low":
                    low_count += 1
                elif label == "Medium":
                    medium_count += 1
                elif label == "High":
                    high_count += 1
                print(f"Extracted Label: {label}")
                print(low_count, medium_count, high_count)
                return label

        print("Extraction Failed: Defaulting to 'Medium'")
        medium_count += 1
        return 'Medium'  # Default label

    except Exception as e:
        # Deliberate best-effort: one failed generation must not abort the
        # whole run.  getattr keeps the handler itself from raising when the
        # row has no ``name`` attribute (e.g. a plain dict).
        print(f"Error generating severity label for row {getattr(row, 'name', None)}: {e}")
        # Count the fallback so the tallies match the labels actually returned.
        medium_count += 1
        return 'Medium'
    
## Ask the LLM for a severity label on every grouped record
# apply() passes each row as the first argument and forwards `args` after it.
grouped_data['synthetic_severity'] = grouped_data.apply(
    generate_severity_label,
    axis=1,
    args=(severity_labels, generator),
)

## Encode and save the second opinion data
# The category order given here fixes the codes: Low -> 0, Medium -> 1, High -> 2.
encoder = OrdinalEncoder(categories=[['Low', 'Medium', 'High']])
grouped_data['synthetic_severity_encoded'] = encoder.fit_transform(
    grouped_data[['synthetic_severity']]
).astype(int)
grouped_data.to_csv('./augmented_data2.csv', index=False)

## Step 3: Train AutoML model: AutoGluon

In [None]:
data_type = "train"
data = pd.read_csv(f"../../data/dataset/cleaned_{data_type}_data.csv")
# NOTE(review): Step 2 writes './augmented_data2.csv' without a
# "severity_encoded" column, while this cell reads '../augmented_data.csv'
# and expects that column to exist - confirm which file is intended.
augmented_data = pd.read_csv("../augmented_data.csv")
# Replace the original label columns with the encoded synthetic label so that
# it lines up with the target column name used below.
augmented_data = augmented_data.drop(columns=["severity_encoded", "synthetic_severity"])
augmented_data = augmented_data.rename(columns={"synthetic_severity_encoded": "severity_encoded"})
data = pd.concat([data, augmented_data])

# Explicit copy: the next statement assigns into this column subset, which
# would otherwise trigger pandas' SettingWithCopyWarning.
data = data[['Deficiency Code', 'age', 'PscAuthorityId', 'VesselGroup', 'Deficiency/Finding', 'Description Overview',
             'Immediate Causes', 'Root Cause Analysis', 'Corrective Action',
             'Preventive Action', 'Detainable Deficiency', 'severity_encoded']].copy()
# Left-pad the deficiency code with zeros to a width of 5 characters.
data['Deficiency Code'] = data['Deficiency Code'].astype(str).str.zfill(5)


# Collapse rows that share every feature value; their severity codes are
# averaged and rounded to the nearest integer.
grouped_data = data.groupby(['Deficiency Code',
       'PscAuthorityId', 'VesselGroup', 'Deficiency/Finding',
       'Description Overview', 'Immediate Causes', 'Root Cause Analysis',
       'Corrective Action', 'Preventive Action',
       'Detainable Deficiency', "age"]).mean().round(0).astype(int).reset_index()


metric = "roc_auc_ovo"
target = 'severity_encoded'

# 70/30 split with a fixed seed so the validation set is reproducible.
train_data, val_data = train_test_split(grouped_data, test_size=0.3, random_state=99)

train_data = TabularDataset(train_data)
val_data = TabularDataset(val_data)

quality = 'medium_quality'  # Lowest size model used due to computer power restriction
predictor = TabularPredictor(label=target, eval_metric=metric).fit(train_data, presets=quality)
# Predictions on the held-out features (label column removed).
y_pred = predictor.predict(val_data.drop(columns=[target]))


print(predictor.evaluate(val_data))
print(predictor.leaderboard(val_data))
print(predictor.feature_importance(val_data))