In [6]:
import pandas as pd
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import unicodedata
from sentence_transformers import models, SentenceTransformer

## 1. Data Acquisition

**Data Citation:**
```
@article{asai2023selfrag,
  author    = {Asai, Akari and Wu, Zeqiu and Wang, Yizhong and Sil, Avirup and Hajishirzi, Hannaneh},
  title     = {{Self-RAG}: Learning to Retrieve, Generate, and Critique through Self-Reflection},
  year      = {2023},
  journal   = { arXiv preprint arXiv:2310.11511 },
  URL       = {https://arxiv.org/abs/2310.11511}
}
```


In [None]:
# Load data
raw_df = pd.read_json("hf://datasets/selfrag/selfrag_train_data/train.jsonl", lines=True)
raw_df.head()

Unnamed: 0,instruction,output,input,id,dataset_name
0,"In this task, you are given a context paragrap...",[Retrieval]<paragraph>2017 Portland train atta...,,flan_v2_88425,flan_v2
1,"Question: Write a text based on ""rangers show ...",[No Retrieval]six opposition candidates in the...,,flan_v2_18667,flan_v2
2,You will be given a sentence. Check whether th...,[No Retrieval]1\n****\n[Utility:5],,flan_v2_87754,flan_v2
3,Q:Is there a negative or positive tone to this...,[No Retrieval]Negative[Utility:5],,flan_v2_47789,flan_v2
4,Question: Fertility Clinics Vary on Embryo Dis...,[No Retrieval]Answer: World[Utility:5],,flan_v2_10803,flan_v2


In [None]:
def check_retrieval(output) -> str:
  """Function to denote if retrieval took place when generating an output"""
  if re.search(r"\[Retrieval\]", output):
    return "true"
  elif re.search(r"\[No Retrieval\]", output):
    return "false"
  else:
    # Some outputs are tagged with `[Continue to Use Evidence]` and `Dialog`.
    # We could flag these as false but we will ignore them instead.
    return "null"

In [None]:
# Apply check retrieval function
raw_df["retrieval"] = raw_df["output"].apply(check_retrieval)

In [None]:
# Check for balance
raw_df["retrieval"].value_counts()

retrieval
true     74219
false    70362
null      1038
Name: count, dtype: int64

In [None]:
# Remove unwanted rows
raw_df.drop(raw_df.loc[raw_df["retrieval"] == "null"].index, inplace=True)
raw_df["retrieval"].value_counts()

retrieval
true     74219
false    70362
Name: count, dtype: int64

## 2. Text Cleaning and Preprocessing

In [None]:
nlp = spacy.load("en_core_web_sm", disable=["ents", "ner", "parser", "pos_"])

In [None]:
def clean_preprocess(texts):
    with nlp.select_pipes(enable=["tokenizer", "lemmatizer", "attribute_ruler", "tagger"]):
        for doc in nlp.pipe(texts, batch_size=1000, n_process=2):
            tokens = [
                token.lemma_.lower() for token in doc if
                token.is_ascii and
                not token.is_stop and
                not token.is_punct and
                not token.like_email and
                not token.is_space
                ]
            yield " ".join(tokens)

In [None]:
raw_df["cleaned_instruction"] = list(clean_preprocess(raw_df["instruction"].values))
raw_df.head()

Unnamed: 0,instruction,output,input,id,dataset_name,retrieval,cleaned_instruction
0,"In this task, you are given a context paragrap...",[Retrieval]<paragraph>2017 Portland train atta...,,flan_v2_88425,flan_v2,True,task given context paragraph tweet question ta...
1,"Question: Write a text based on ""rangers show ...",[No Retrieval]six opposition candidates in the...,,flan_v2_18667,flan_v2,False,question write text based rangers iron tie ser...
2,You will be given a sentence. Check whether th...,[No Retrieval]1\n****\n[Utility:5],,flan_v2_87754,flan_v2,False,given sentence check sentence grammatically co...
3,Q:Is there a negative or positive tone to this...,[No Retrieval]Negative[Utility:5],,flan_v2_47789,flan_v2,False,q negative positive tone product review = = = ...
4,Question: Fertility Clinics Vary on Embryo Dis...,[No Retrieval]Answer: World[Utility:5],,flan_v2_10803,flan_v2,False,question fertility clinics vary embryo disposa...


In [None]:
raw_df.to_csv("cleaned_df.csv", columns=["cleaned_instruction", "retrieval"], index=False)

## 3. Feature Engineering

In [7]:
clean_df = pd.read_csv("https://github.com/Nate-Cheney/CIS-405-RAG_Classifier/raw/refs/heads/main/data/cleaned_df.csv")
clean_df.head()

Unnamed: 0,cleaned_instruction,retrieval
0,task given context paragraph tweet question ta...,True
1,question write text based rangers iron tie ser...,False
2,given sentence check sentence grammatically co...,False
3,q negative positive tone product review = = = ...,False
4,question fertility clinics vary embryo disposa...,False


In [None]:
clean_df = clean_df.dropna(subset=["cleaned_instruction"])

In [8]:
transformer = models.Transformer("sentence-transformers/all-MiniLM-L12-v2")
pooling = models.Pooling(transformer.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[transformer, pooling])

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L12-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [10]:
sentences = (clean_df["cleaned_instruction"].tolist())
embeddings = model.encode(sentences, show_progress_bar=True)

Batches:   0%|          | 0/4515 [00:00<?, ?it/s]

In [11]:
clean_df["embeddings"]  = embeddings.tolist()
clean_df.head()

Unnamed: 0,cleaned_instruction,retrieval,embeddings
0,task given context paragraph tweet question ta...,True,"[0.08893557637929916, 0.14358608424663544, 0.0..."
1,question write text based rangers iron tie ser...,False,"[0.09296396374702454, 0.04364759847521782, -0...."
2,given sentence check sentence grammatically co...,False,"[0.10677985846996307, 0.33720141649246216, 0.0..."
3,q negative positive tone product review = = = ...,False,"[-0.09037362039089203, 0.01577790640294552, -0..."
4,question fertility clinics vary embryo disposa...,False,"[0.07055126130580902, 0.28588759899139404, 0.0..."
