# SHRROM EXPLORATION

## Setup

### Install

In [1]:
%pip install captum

Collecting captum
  Downloading captum-0.7.0-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: captum
Successfully installed captum-0.7.0


In [2]:
import json
import os
import random
import sys

In [3]:
import pandas as pd
import numpy as np

import torch
from captum.attr import IntegratedGradients
from torch.nn.functional import softmax

import huggingface_hub
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration

### Data Loading

In [4]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
data_dir = "/content/drive/MyDrive/SHROOM/data"

In [6]:
with open(f"{data_dir}/SHROOM_unlabeled-training-data-v2/train.model-aware.v2.json") as f:
  train_data = json.load(f)

with open(f"{data_dir}/SHROOM_dev-v2/val.model-aware.v2.json") as f:
  dev_data = json.load(f)

In [7]:
def prep_df(json_data):
  print(json_data[0])
  print(json_data[0].keys())
  _df = pd.DataFrame(json_data)
  print(_df.task.unique())
  _df = _df.query("task == 'DM'")
  _df = _df.reset_index()
  return _df

In [8]:
train_df = prep_df(train_data)
val_df = prep_df(dev_data)

{'hyp': 'Of or pertaining to the language of a particular area , or to a particular', 'tgt': 'Of or pertaining to everyday language , as opposed to standard , literary , liturgical , or scientific idiom .', 'src': 'There are blacktips , silvertips , bronze whalers , black whalers , spinner sharks , and bignose sharks . these of course are vernacular names , but this is one case where the scientific nomenclature does not clarify the species , since it is now being revised . What is the meaning of vernacular ?', 'ref': 'tgt', 'task': 'DM', 'model': 'ltg/flan-t5-definition-en-base'}
dict_keys(['hyp', 'tgt', 'src', 'ref', 'task', 'model'])
['DM' 'PG' 'MT']
{'hyp': 'A sloping top .', 'ref': 'tgt', 'src': 'The sides of the casket were covered with heavy black broadcloth , with velvet caps , presenting a deep contrast to the rich surmountings . What is the meaning of surmounting ?', 'tgt': 'A decorative feature that sits on top of something .', 'model': 'ltg/flan-t5-definition-en-base', 'task

### Model loading

In [9]:
# tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
# model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

tokenizer = AutoTokenizer.from_pretrained("ltg/flan-t5-definition-en-base")
model = AutoModelForSeq2SeqLM.from_pretrained("ltg/flan-t5-definition-en-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

## Probability Asseesement

In [10]:
val_df.head()

Unnamed: 0,index,hyp,ref,src,tgt,model,task,labels,label,p(Hallucination)
0,0,A sloping top .,tgt,The sides of the casket were covered with heav...,A decorative feature that sits on top of somet...,ltg/flan-t5-definition-en-base,DM,"[Not Hallucination, Hallucination, Not Halluci...",Hallucination,0.6
1,1,To react too much .,tgt,Please try not to overreact if she drives badl...,To react too much or too intensely .,ltg/flan-t5-definition-en-base,DM,"[Not Hallucination, Not Hallucination, Not Hal...",Not Hallucination,0.0
2,2,The process of spoiling ; the state of being s...,tgt,"To prevent spoilage , store in a cool , dry pl...",The process of spoiling .,ltg/flan-t5-definition-en-base,DM,"[Hallucination, Not Hallucination, Hallucinati...",Hallucination,0.6
3,3,To arrange in a particular way .,tgt,The way the opposition has framed the argument...,To construct in words so as to establish a con...,ltg/flan-t5-definition-en-base,DM,"[Hallucination, Not Hallucination, Not Halluci...",Hallucination,0.6
4,4,A feeling of concern ; a feeling of anxiety .,tgt,To mix with thy concernments i desist . What i...,That in which one is concerned or interested ;...,ltg/flan-t5-definition-en-base,DM,"[Not Hallucination, Hallucination, Hallucinati...",Hallucination,0.6


In [11]:
# string_input = "What is the definition of computer?"
string_input = val_df.iloc[1]["src"]

print("label: ", val_df.iloc[1]["label"])
print("tgt: ", val_df.iloc[1]["tgt"])
print("hyp: ", val_df.iloc[1]["hyp"])

inputs = tokenizer(string_input, return_tensors="pt")
response_tokens = model.generate(
    **inputs,
    max_length=50,
    temperature=1,
)

response_text = tokenizer.decode(
    response_tokens[0],
    skip_special_tokens=True
  )
print(response_text)

repsonse_words = response_text.split(" ")

# re-tokenize the response text to get the input IDs for prop calc
# we need logits --> pass through model
output_ids = tokenizer.encode(response_text, return_tensors="pt")
decoder_input_ids = output_ids[:, :-1]

model.eval()
with torch.no_grad():
    outputs = model(
        inputs['input_ids'],
        decoder_input_ids=decoder_input_ids,
        use_cache=False
    )
    last_layer_logits = outputs.logits
    probabilities = softmax(last_layer_logits, dim=-1)

# Print token probabilities and ranks
for i, idx in enumerate(output_ids[0, 1:]):
    token = tokenizer.decode(idx)
    token_probability = probabilities[0, i, idx].item()

    # perc-based rank
    rank = (probabilities[0, i] > token_probability).sum().item() / probabilities.size(-1) * 100
    print(f"Token: {token.ljust(25)} Probability: {str(token_probability).ljust(30)}| Rank: {rank:.2f}%")


label:  Not Hallucination
tgt:  To react too much or too intensely .
hyp:  To react too much .
To react too much.
Token: react                     Probability: 1.1615760797667463e-07        | Rank: 20.75%
Token: too                       Probability: 1.845110091380775e-05         | Rank: 1.13%
Token: much                      Probability: 0.08375450223684311           | Rank: 0.01%
Token: .                         Probability: 0.020109592005610466          | Rank: 0.01%
Token: </s>                      Probability: 0.9796644449234009            | Rank: 0.00%


## Dataset Building

In [13]:
data = []

for index, row in val_df.iterrows():
    string_input = row["src"]
    label = row["label"]

    # gen resp
    inputs = tokenizer(string_input, return_tensors="pt")
    response_tokens = model.generate(**inputs, max_length=50, temperature=1)
    response_text = tokenizer.decode(response_tokens[0], skip_special_tokens=True)
    response_words = response_text.split(" ")

    # re-tokenize and calc prop.
    output_ids = tokenizer.encode(response_text, return_tensors="pt")
    decoder_input_ids = output_ids[:, :-1]

    with torch.no_grad():
        outputs = model(
            inputs['input_ids'],
            decoder_input_ids=decoder_input_ids,
            use_cache=False
        )
        last_layer_logits = outputs.logits
        probabilities = softmax(last_layer_logits, dim=-1)

        # token-spec data
        for i, idx in enumerate(output_ids[0, 1:]):
            token = tokenizer.decode(idx)
            token_probability = probabilities[0, i, idx].item()
            rank = (probabilities[0, i] > token_probability).sum().item() / probabilities.size(-1) * 100

            data.append({
                "Position": i+1,
                "Token": token,
                "Probability": token_probability,
                "Rank": rank,
                "Label": label
            })

result_df = pd.DataFrame(data)
print(result_df.head())

   Position Token   Probability      Rank          Label
0         1        7.460105e-03  0.003113  Hallucination
1         2     s  2.092055e-05  0.096489  Hallucination
2         3   lop  5.906224e-10  7.575946  Hallucination
3         4   ing  6.899348e-03  0.006225  Hallucination
4         5   top  6.939777e-08  3.190364  Hallucination


In [42]:
result_df.shape

(1803, 5)

In [60]:
import numpy as np
import pandas as pd

matrix_length = 20
num_sentences = result_df[result_df['Position'] == 1].shape[0]
matrix = np.zeros((num_sentences, matrix_length, 2))  # 2 for position and probability
y_labels = []

matrix_row = -1
matrix_position = 0

for index, row in result_df.iterrows():
    if row['Position'] == 1:
        matrix_row += 1
        matrix_position = 0
        label_i = 1 if row['Label'] == "Hallucination" else 0
        y_labels.append(label_i)
    else:
        matrix_position += 1

    if matrix_position < matrix_length:
        matrix[matrix_row, matrix_position, 0] = row['Position']
        matrix[matrix_row, matrix_position, 1] = row['Probability']

matrix = matrix[:matrix_row+1, :, :]

print(matrix[0])

[[1.00000000e+00 7.46010523e-03]
 [2.00000000e+00 2.09205464e-05]
 [3.00000000e+00 5.90622440e-10]
 [4.00000000e+00 6.89934753e-03]
 [5.00000000e+00 6.93977711e-08]
 [6.00000000e+00 1.33023039e-03]
 [7.00000000e+00 4.89225686e-01]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00]]


## Exploration of Classification

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

num_sentences = matrix.shape[0]
num_features = matrix.shape[1] * matrix.shape[2]  # matrix_length * 2

# flatten the 3D matrix into a 2D array
X_flat = matrix.reshape(num_sentences, num_features)
y = np.array(y_labels)

X_train, X_test, y_train, y_test = train_test_split(X_flat, y, test_size=0.2, random_state=42)


In [None]:
svm_model = SVC(
    kernel='poly',
    degree = 5
)
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)
svm_accuracy = classification_report(y_test, y_pred)
print(f'SVM Accuracy: \n{svm_accuracy}')


In [73]:
from sklearn.ensemble import GradientBoostingClassifier

In [77]:
clf = GradientBoostingClassifier(
    n_estimators=400,
    learning_rate=1.2,
    max_depth=10,
    random_state=42
).fit(X_train, y_train)

clf.score(X_test, y_test)

0.5

### Optim

This takes quite some time to run so be carefull lol

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

param_grid = {
    'n_estimators': [100, 200, 250, 300, 400],
    'learning_rate': [0.01, 0.1, 0.5, 0.7, 1.0, 1.1, 1.2],
    'max_depth': [2, 3, 5, 7, 8, 10, 12]
}

clf = GradientBoostingClassifier(random_state=42)
grid_search = GridSearchCV(clf, param_grid, cv=57, scoring='f1', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

best_clf = grid_search.best_estimator_
test_score = best_clf.score(X_test, y_test)
print("Test Score:", test_score)