In [1]:
from datasets import load_dataset
import openai
import os
import itertools
import logging
import sys

import dill as pickle
import dspy
from dsp.utils import deduplicate
from dspy.teleprompt import BootstrapFewShot, LabeledFewShot, BayesianSignatureOptimizer, BootstrapFewShotWithRandomSearch
from dspy.evaluate import answer_exact_match, answer_passage_match
from dspy.evaluate.evaluate import Evaluate
from dsp.utils import deduplicate
from dspy.primitives import module

from copy import copy
import random
import json
import tqdm
import pickle
import pandas as pd
from pandas import json_normalize
import numpy as np

import models

random.seed(1)

In [2]:
from dotenv import load_dotenv

load_dotenv() 

True

## LM and RM

In [3]:
# setting lm and rm in dspy
openai_key = os.environ["OPENAI_API_KEY"]
colbert_server = 'http://index.contextual.ai:8893/api/search'

lm = dspy.OpenAI(model='gpt-3.5-turbo', api_key=openai_key)
rm = dspy.ColBERTv2(url=colbert_server)
dspy.settings.configure(lm=lm, rm=rm)

## Dataset

In [4]:
# dataset download and split
def get_squad_split(squad, split="validation"):
    data = zip(*[squad[split][field] for field in squad[split].features])
    exs = [dspy.Example(question=q, answer=a['text'][0]).with_inputs("question")
           for eid, title, context, q, a in data]
    return exs

squad = load_dataset("squad")
squad_train = get_squad_split(squad, split="train")
squad_dev = get_squad_split(squad)
dev_exs = random.sample(squad_dev, k=20)

In [5]:
def get_squad_df(squad, split="validation", sample = False, sample_size = 50):
    data = squad[split]
    df = pd.json_normalize(data)
    if sample == True:
        df = df.sample(n=sample_size, random_state=1).reset_index(drop=True)
    df['answer'] = df['answers.text'].apply(lambda x: x[0])
    # df.rename({'answers.text': 'answer'}, axis=1, inplace=True)
    df = df[['context', 'question', 'answer']]
    return df

In [6]:
squad_open = get_squad_df(squad, 'train', True)
squad_close = get_squad_df(squad, 'train', True)
squad_open.head()

Unnamed: 0,context,question,answer
0,"On the next day, December 18, protests turned ...",How many people were estimated by authorities ...,3000
1,Roman Catholicism was the sole established rel...,"Until the Reformation, what was the establishe...",Roman Catholicism
2,Israel retaliated against Egyptian shelling wi...,When did the war start up again?,March 1969
3,Zen Buddhist teaching is often full of paradox...,What Buddhist teachings are often full of para...,Zen
4,Sahih al-Bukhari narrates Muhammad describing ...,In which work did Welch express his belief tha...,Encyclopaedia of Islam


In [7]:
def get_halueval_split(dataset):
    data = zip(*[dataset['data'][field] for field in dataset['data'].features])
    exs = [dspy.Example(question=question, context=knowledge, right_answer=right_ans, hallucinated_answer=halu_ans).with_inputs("question")
           for knowledge, question, right_ans, halu_ans in data]
    return exs

halueval_qa = load_dataset('pminervini/HaluEval', 'qa')
halu_ds = get_halueval_split(halueval_qa)
halu_dev = random.sample(halu_ds, k=20)

In [8]:
len(halu_ds)

10000

In [9]:
def get_he_df(squad, split="data", sample = False, sample_size = 50):
    data = squad[split]
    df = pd.json_normalize(data)
    if sample == True:
        df = df.sample(n=sample_size, random_state=1).reset_index(drop=True)
    df = df[['knowledge', 'question', 'right_answer', 'hallucinated_answer']]
    return df

In [10]:
# he_close = get_he_df(halueval_qa, 'data', True)
he_open = get_he_df(halueval_qa, 'data', True)
he_open

Unnamed: 0,knowledge,question,right_answer,hallucinated_answer
0,Jack Elam is cast in occasional episodes as s...,Where did the actor who played sidekick Toothy...,the United States Navy,Jack Elam served in the Air Force.
1,"Loewald's father, who died shortly after his b...",What was the nationality of the man with whom ...,German,Hans Loewald studied philosophy with a French ...
2,The Knicks–Nuggets brawl was an on-court alter...,The Knicks-Nuggests brawl was the most penaliz...,"Auburn Hills, Michigan",The Knicks-Nuggets brawl was more infamous tha...
3,It was written by Paul McCartney (credited to...,"The song ""Your Mother Should Know"" is based on...",1961,"The song ""Your Mother Should Know"" is based on..."
4,"""The Adventures of Supergirl"" is the first epi...",ON what date did the man who played Michael Su...,"October 10, 2016",Tyler Hoechlin first appeared in The Adventure...
5,"Jules Sitruk (born April 16, 1990 in Lilas, ne...","What comedy film, written and directed by Gart...",Son of Rambow,Jules Sitruk did not act in any of Garth Jenni...
6,Non-Stop is a 2014 American mystery action thr...,Non-Stop starred the English actress best know...,Lady Mary Crawley,Michelle Dockery played Lady Mary.
7,Baby Blue is the fourth studio album by Mexica...,What is the is the fourth studio album by Mexi...,Baby Blue,"Anahí is married to Governor of Chiapas, Manue..."
8,"Homage to the Queen, Op. 42, by Malcolm Arnold...",What was the nationality of the costume design...,English,The costume designer for Homage to the Queen w...
9,Shortia is a small genus of subshrubs or peren...,"Which genus, Shortia or Schizophragma, has mor...",Shortia,Schizophragma has fewer species.


## Segmentation

In [11]:
from nltk.tokenize import sent_tokenize
import nltk

try:
    nltk.download("punkt", quiet=True)
except FileExistsError:  # multiprocessing race condition
    pass

In [12]:
class SentenceSplitter:
    def __init__(self):
        pass

    # use NLTK to split a text string into sentences for now
    def split_into_sentences(self, text):
        return sent_tokenize(text)

In [13]:
splitter = SentenceSplitter()

## Hallucination check

In [14]:
halu_check_model = models.HaluCheckRAG()
splitter = SentenceSplitter()

In [15]:
ex = halu_ds[0]
ex

Example({'question': "Which magazine was started first Arthur's Magazine or First for Women?", 'context': "Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.First for Women is a woman's magazine published by Bauer Media Group in the USA.", 'right_answer': "Arthur's Magazine", 'hallucinated_answer': 'First for Women was started first.'}) (input_keys={'question'})

In [16]:
halu_check_model(question=ex['question'], response=ex['hallucinated_answer'], context=ex['context'])

True

In [17]:
import importlib
importlib.reload(models)

<module 'models' from '/home/michael/git/cs224u-final-project/models.py'>

In [18]:
modify_model = models.ModifyOpenQARAG()

In [19]:
modify_model(question=ex['question'],response=ex['hallucinated_answer'], context=ex['context'])

"Arthur's Magazine was started first."

In [20]:
lm.inspect_history(1)





The response to the question is wrong. Generate a correct answer to the question that is different from the given response and aligns with the provided context.

---

Follow the following format.

Context: may contain relevant facts

Question: ${question}

Response: ${response}

Answer: Based on the context and taking the feedback on the response to the question into consideration, answer the question with short answers (limited to less than 6 words)

---

Context: Arthur's Magazine (1844–1846) was an American literary periodical published in Philadelphia in the 19th century.First for Women is a woman's magazine published by Bauer Media Group in the USA.

Question: Which magazine was started first Arthur's Magazine or First for Women?

Response: First for Women was started first.

Answer:[32m Arthur's Magazine was started first.[0m





In [31]:
records = []
index = -1
for ind in tqdm.tqdm(range(4000, len(halu_ds))):
    ex = halu_ds[ind]
    check_right_answer = halu_check_model(question=ex['question'], response=ex['right_answer'], context=ex['context'])
    check_hallucinated_answer = halu_check_model(question=ex['question'], response=ex['hallucinated_answer'], context=ex['context'])
    if check_right_answer is None:
        print(f"Wrong value for check_right_answer, index {ind}: answer {check_right_answer}")
    if check_hallucinated_answer  is None:
        print(f"Wrong value for check_hallucinated_answer, index {ind}: answer {check_hallucinated_answer}")
    if check_right_answer is not None:
        check_right_answer = int(check_right_answer)
    if check_hallucinated_answer is not None:
        check_hallucinated_answer = int(check_hallucinated_answer)
    index += 1
    records.append({
        'index': index,
        'original_index': ind,
        'label': 0,
        'prediction': check_right_answer
    })
    index += 1
    records.append({
        'index': index,
        'original_index': ind,
        'label': 1,
        'prediction': check_hallucinated_answer
    })
    if ind % 50 == 0:
        print(f'Checkpointing records with index {ind}')
        with open('records.pkl', 'wb') as file:
            pickle.dump(records, file)
df_results_halu_check = pd.DataFrame.from_records(records, index='index')
df_results_halu_check

  0%|          | 0/6000 [00:00<?, ?it/s]

Checkpointing records with index 4000
Wrong value for check_right_answer, index 4019: answer None


  1%|          | 50/6000 [00:31<1:30:41,  1.09it/s]

Checkpointing records with index 4050


  2%|▏         | 101/6000 [02:21<3:26:47,  2.10s/it]

Checkpointing records with index 4100


  3%|▎         | 151/6000 [04:02<3:27:37,  2.13s/it]

Checkpointing records with index 4150


  3%|▎         | 201/6000 [05:41<3:20:44,  2.08s/it]

Checkpointing records with index 4200


  4%|▍         | 251/6000 [07:28<2:56:40,  1.84s/it]

Checkpointing records with index 4250


  5%|▌         | 301/6000 [09:10<3:11:30,  2.02s/it]

Checkpointing records with index 4300


  6%|▌         | 351/6000 [10:53<3:37:12,  2.31s/it]

Checkpointing records with index 4350


  7%|▋         | 401/6000 [12:40<3:26:24,  2.21s/it]

Checkpointing records with index 4400


  8%|▊         | 451/6000 [14:21<3:08:26,  2.04s/it]

Checkpointing records with index 4450


  8%|▊         | 501/6000 [16:09<3:35:18,  2.35s/it]

Checkpointing records with index 4500


  9%|▉         | 551/6000 [17:51<2:44:44,  1.81s/it]

Checkpointing records with index 4550


 10%|█         | 601/6000 [19:35<2:46:16,  1.85s/it]

Checkpointing records with index 4600


 11%|█         | 651/6000 [21:24<3:43:48,  2.51s/it]

Checkpointing records with index 4650


 12%|█▏        | 701/6000 [23:05<2:52:49,  1.96s/it]

Checkpointing records with index 4700


 13%|█▎        | 751/6000 [25:00<2:51:26,  1.96s/it]

Checkpointing records with index 4750


 13%|█▎        | 801/6000 [26:50<2:56:59,  2.04s/it]

Checkpointing records with index 4800


 14%|█▍        | 851/6000 [28:34<3:26:55,  2.41s/it]

Checkpointing records with index 4850


 15%|█▌        | 901/6000 [30:28<3:18:30,  2.34s/it]

Checkpointing records with index 4900


 16%|█▌        | 951/6000 [32:15<2:55:41,  2.09s/it]

Checkpointing records with index 4950


 17%|█▋        | 1001/6000 [34:05<2:52:06,  2.07s/it]

Checkpointing records with index 5000


 18%|█▊        | 1051/6000 [35:51<2:35:19,  1.88s/it]

Checkpointing records with index 5050


 18%|█▊        | 1101/6000 [37:42<2:29:16,  1.83s/it]

Checkpointing records with index 5100


 19%|█▉        | 1151/6000 [39:30<2:46:47,  2.06s/it]

Checkpointing records with index 5150


 20%|██        | 1201/6000 [41:21<2:35:21,  1.94s/it]

Checkpointing records with index 5200


 21%|██        | 1251/6000 [43:07<2:29:30,  1.89s/it]

Checkpointing records with index 5250


 22%|██▏       | 1301/6000 [44:56<2:37:23,  2.01s/it]

Checkpointing records with index 5300


 23%|██▎       | 1351/6000 [46:45<3:11:37,  2.47s/it]

Checkpointing records with index 5350


 23%|██▎       | 1401/6000 [48:32<2:38:26,  2.07s/it]

Checkpointing records with index 5400


 24%|██▍       | 1451/6000 [50:19<2:44:08,  2.16s/it]

Checkpointing records with index 5450


 25%|██▌       | 1501/6000 [52:11<2:44:27,  2.19s/it]

Checkpointing records with index 5500


 26%|██▌       | 1551/6000 [54:01<2:37:24,  2.12s/it]

Checkpointing records with index 5550


 27%|██▋       | 1601/6000 [55:48<2:47:34,  2.29s/it]

Checkpointing records with index 5600


 28%|██▊       | 1651/6000 [57:34<2:32:44,  2.11s/it]

Checkpointing records with index 5650


 28%|██▊       | 1701/6000 [59:24<2:22:16,  1.99s/it]

Checkpointing records with index 5700


 29%|██▉       | 1751/6000 [1:01:22<2:38:10,  2.23s/it]

Checkpointing records with index 5750


 30%|███       | 1801/6000 [1:03:11<2:29:01,  2.13s/it]

Checkpointing records with index 5800


 31%|███       | 1851/6000 [1:05:07<2:48:08,  2.43s/it]

Checkpointing records with index 5850


 32%|███▏      | 1901/6000 [1:06:55<2:37:49,  2.31s/it]

Checkpointing records with index 5900


 33%|███▎      | 1951/6000 [1:08:48<2:34:38,  2.29s/it]

Checkpointing records with index 5950


 33%|███▎      | 2001/6000 [1:10:36<2:18:20,  2.08s/it]

Checkpointing records with index 6000


 34%|███▍      | 2051/6000 [1:12:20<2:23:44,  2.18s/it]

Checkpointing records with index 6050


 35%|███▌      | 2101/6000 [1:14:06<2:38:48,  2.44s/it]

Checkpointing records with index 6100


 36%|███▌      | 2151/6000 [1:16:00<2:33:20,  2.39s/it]

Checkpointing records with index 6150


 37%|███▋      | 2201/6000 [1:17:45<1:58:47,  1.88s/it]

Checkpointing records with index 6200


 38%|███▊      | 2251/6000 [1:19:33<2:10:24,  2.09s/it]

Checkpointing records with index 6250


 38%|███▊      | 2301/6000 [1:21:18<2:06:22,  2.05s/it]

Checkpointing records with index 6300


 39%|███▉      | 2351/6000 [1:23:04<2:24:25,  2.37s/it]

Checkpointing records with index 6350


 40%|████      | 2401/6000 [1:24:59<2:29:45,  2.50s/it]

Checkpointing records with index 6400


 41%|████      | 2451/6000 [1:26:52<2:05:44,  2.13s/it]

Checkpointing records with index 6450


 42%|████▏     | 2501/6000 [1:28:43<2:14:04,  2.30s/it]

Checkpointing records with index 6500


 43%|████▎     | 2551/6000 [1:30:30<1:59:05,  2.07s/it]

Checkpointing records with index 6550


 43%|████▎     | 2601/6000 [1:32:20<2:21:37,  2.50s/it]

Checkpointing records with index 6600


 44%|████▍     | 2651/6000 [1:34:09<1:57:29,  2.10s/it]

Checkpointing records with index 6650


 45%|████▌     | 2701/6000 [1:36:04<2:04:53,  2.27s/it]

Checkpointing records with index 6700


 46%|████▌     | 2751/6000 [1:38:01<1:49:20,  2.02s/it]

Checkpointing records with index 6750


 47%|████▋     | 2801/6000 [1:39:53<2:03:51,  2.32s/it]

Checkpointing records with index 6800


 48%|████▊     | 2851/6000 [1:41:51<1:57:27,  2.24s/it]

Checkpointing records with index 6850


 48%|████▊     | 2901/6000 [1:43:42<1:56:59,  2.27s/it]

Checkpointing records with index 6900


 49%|████▉     | 2951/6000 [1:45:26<1:40:43,  1.98s/it]

Checkpointing records with index 6950


 50%|█████     | 3001/6000 [1:47:22<1:50:07,  2.20s/it]

Checkpointing records with index 7000


 51%|█████     | 3051/6000 [1:49:34<1:43:17,  2.10s/it]

Checkpointing records with index 7050


 52%|█████▏    | 3101/6000 [1:51:27<1:53:36,  2.35s/it]

Checkpointing records with index 7100


 53%|█████▎    | 3151/6000 [1:53:20<1:31:58,  1.94s/it]

Checkpointing records with index 7150


 53%|█████▎    | 3157/6000 [1:53:31<1:28:29,  1.87s/it]

Wrong value for check_hallucinated_answer, index 7156: answer None


 53%|█████▎    | 3201/6000 [1:55:11<1:35:06,  2.04s/it]

Checkpointing records with index 7200


 54%|█████▍    | 3251/6000 [1:57:05<1:36:36,  2.11s/it]

Checkpointing records with index 7250


 55%|█████▌    | 3301/6000 [1:58:55<1:47:36,  2.39s/it]

Checkpointing records with index 7300


 56%|█████▌    | 3351/6000 [2:00:42<1:46:03,  2.40s/it]

Checkpointing records with index 7350


 57%|█████▋    | 3401/6000 [2:02:38<1:27:35,  2.02s/it]

Checkpointing records with index 7400


 58%|█████▊    | 3451/6000 [2:04:30<1:32:59,  2.19s/it]

Checkpointing records with index 7450


 58%|█████▊    | 3501/6000 [2:06:24<1:38:55,  2.38s/it]

Checkpointing records with index 7500


 59%|█████▉    | 3551/6000 [2:08:14<1:28:53,  2.18s/it]

Checkpointing records with index 7550


 60%|██████    | 3601/6000 [2:10:10<1:34:07,  2.35s/it]

Checkpointing records with index 7600


 61%|██████    | 3651/6000 [2:12:02<1:29:57,  2.30s/it]

Checkpointing records with index 7650


 62%|██████▏   | 3701/6000 [2:13:49<1:15:58,  1.98s/it]

Checkpointing records with index 7700


 63%|██████▎   | 3751/6000 [2:15:45<1:34:47,  2.53s/it]

Checkpointing records with index 7750


 63%|██████▎   | 3801/6000 [2:17:34<1:19:00,  2.16s/it]

Checkpointing records with index 7800


 64%|██████▍   | 3851/6000 [2:19:29<1:09:12,  1.93s/it]

Checkpointing records with index 7850


 65%|██████▌   | 3901/6000 [2:21:21<1:20:43,  2.31s/it]

Checkpointing records with index 7900


 66%|██████▌   | 3951/6000 [2:23:17<1:15:56,  2.22s/it]

Checkpointing records with index 7950


 67%|██████▋   | 4001/6000 [2:25:10<1:18:16,  2.35s/it]

Checkpointing records with index 8000


 68%|██████▊   | 4051/6000 [2:27:16<1:14:38,  2.30s/it]

Checkpointing records with index 8050


 68%|██████▊   | 4101/6000 [2:29:20<1:19:56,  2.53s/it]

Checkpointing records with index 8100


 69%|██████▉   | 4151/6000 [2:31:13<1:02:11,  2.02s/it]

Checkpointing records with index 8150


 70%|███████   | 4201/6000 [2:33:02<57:15,  1.91s/it]  

Checkpointing records with index 8200


 71%|███████   | 4251/6000 [2:34:57<1:09:51,  2.40s/it]

Checkpointing records with index 8250


 72%|███████▏  | 4301/6000 [2:36:49<1:07:36,  2.39s/it]

Checkpointing records with index 8300


 73%|███████▎  | 4351/6000 [2:38:40<58:11,  2.12s/it]  

Checkpointing records with index 8350


 73%|███████▎  | 4401/6000 [2:40:35<52:29,  1.97s/it]  

Checkpointing records with index 8400


 74%|███████▍  | 4451/6000 [2:42:36<1:08:56,  2.67s/it]

Checkpointing records with index 8450


 75%|███████▌  | 4501/6000 [2:44:34<1:05:42,  2.63s/it]

Checkpointing records with index 8500


 76%|███████▌  | 4551/6000 [2:46:33<1:01:16,  2.54s/it]

Checkpointing records with index 8550


 77%|███████▋  | 4601/6000 [2:48:23<51:09,  2.19s/it]  

Checkpointing records with index 8600


 78%|███████▊  | 4651/6000 [2:50:18<1:06:31,  2.96s/it]

Checkpointing records with index 8650


 78%|███████▊  | 4701/6000 [2:52:20<51:21,  2.37s/it]  

Checkpointing records with index 8700


 79%|███████▉  | 4751/6000 [2:54:13<42:59,  2.07s/it]

Checkpointing records with index 8750


 80%|████████  | 4801/6000 [2:56:02<44:21,  2.22s/it]

Checkpointing records with index 8800


 81%|████████  | 4851/6000 [2:57:57<33:47,  1.76s/it]  

Checkpointing records with index 8850


 82%|████████▏ | 4901/6000 [2:59:58<44:35,  2.43s/it]  

Checkpointing records with index 8900


 83%|████████▎ | 4951/6000 [3:01:44<35:35,  2.04s/it]

Checkpointing records with index 8950


 83%|████████▎ | 5001/6000 [3:03:40<39:02,  2.34s/it]

Checkpointing records with index 9000


 84%|████████▍ | 5051/6000 [3:05:40<41:41,  2.64s/it]

Checkpointing records with index 9050


 85%|████████▌ | 5101/6000 [3:07:43<30:23,  2.03s/it]

Checkpointing records with index 9100


 86%|████████▌ | 5151/6000 [3:09:38<31:39,  2.24s/it]

Checkpointing records with index 9150


 87%|████████▋ | 5201/6000 [3:11:35<29:22,  2.21s/it]

Checkpointing records with index 9200


 88%|████████▊ | 5251/6000 [3:13:28<24:56,  2.00s/it]

Checkpointing records with index 9250


 88%|████████▊ | 5301/6000 [3:15:27<26:02,  2.24s/it]

Checkpointing records with index 9300


 89%|████████▉ | 5351/6000 [3:17:18<25:34,  2.36s/it]

Checkpointing records with index 9350


 90%|█████████ | 5401/6000 [3:19:16<24:03,  2.41s/it]

Checkpointing records with index 9400


 91%|█████████ | 5451/6000 [3:21:08<23:29,  2.57s/it]

Checkpointing records with index 9450


 92%|█████████▏| 5501/6000 [3:22:58<17:18,  2.08s/it]

Checkpointing records with index 9500


 93%|█████████▎| 5551/6000 [3:24:56<17:38,  2.36s/it]

Checkpointing records with index 9550


 93%|█████████▎| 5601/6000 [3:26:57<17:24,  2.62s/it]

Checkpointing records with index 9600


 94%|█████████▍| 5651/6000 [3:28:52<14:03,  2.42s/it]

Checkpointing records with index 9650


 95%|█████████▌| 5701/6000 [3:30:46<12:12,  2.45s/it]

Checkpointing records with index 9700


 96%|█████████▌| 5751/6000 [3:32:43<08:55,  2.15s/it]

Checkpointing records with index 9750


 97%|█████████▋| 5801/6000 [3:34:34<07:31,  2.27s/it]

Checkpointing records with index 9800


 98%|█████████▊| 5851/6000 [3:36:23<06:05,  2.45s/it]

Checkpointing records with index 9850


 98%|█████████▊| 5901/6000 [3:38:15<03:18,  2.00s/it]

Checkpointing records with index 9900


 99%|█████████▉| 5951/6000 [3:40:03<01:48,  2.21s/it]

Checkpointing records with index 9950


100%|██████████| 6000/6000 [3:41:50<00:00,  2.22s/it]


Unnamed: 0_level_0,original_index,label,prediction
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,0.0
1,0,1,1.0
2,1,0,0.0
3,1,1,1.0
4,2,0,1.0
...,...,...,...
19996,9997,1,0.0
19997,9998,0,0.0
19998,9998,1,1.0
19999,9999,0,0.0


In [31]:
with open('records.pkl', 'rb') as file:
    records = pickle.load(file)
df_results_halu_check = pd.DataFrame.from_records(records, index='index')

In [36]:
# with open('df_results_halu_check.pkl', 'wb') as file:
#     pickle.dump(df_results_halu_check, file)

In [30]:
with open('df_results_halu_check.pkl', 'rb') as file:
    df_results_halu_check = pickle.load(file)

EOFError: Ran out of input

In [33]:
df_results_halu_check.dropna(inplace=True)

In [34]:
from sklearn.metrics import classification_report
classification_report(y_true=df_results_halu_check.loc[:,'label'], y_pred=df_results_halu_check.loc[:,'prediction'], output_dict=True)

{'0': {'precision': 0.7223054482377986,
  'recall': 0.9207035175879397,
  'f1-score': 0.809525913489153,
  'support': 9950.0},
 '1': {'precision': 0.8906747956214494,
  'recall': 0.6460301507537688,
  'f1-score': 0.7488786625502417,
  'support': 9950.0},
 'accuracy': 0.7833668341708543,
 'macro avg': {'precision': 0.806490121929624,
  'recall': 0.7833668341708542,
  'f1-score': 0.7792022880196974,
  'support': 19900.0},
 'weighted avg': {'precision': 0.806490121929624,
  'recall': 0.7833668341708543,
  'f1-score': 0.7792022880196974,
  'support': 19900.0}}

In [35]:
from sklearn.metrics import confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_true=df_results_halu_check.loc[:,'label'], y_pred=df_results_halu_check.loc[:,'prediction']).ravel()
(tn, fp, fn, tp)

(9161, 789, 3522, 6428)