In [1]:
import pandas as pd
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import unicodedata

## 1. Data Acquisition

**Data Citation:**
```
@article{asai2023selfrag,
  author    = {Asai, Akari and Wu, Zeqiu and Wang, Yizhong and Sil, Avirup and Hajishirzi, Hannaneh},
  title     = {{Self-RAG}: Learning to Retrieve, Generate, and Critique through Self-Reflection},
  year      = {2023},
  journal   = { arXiv preprint arXiv:2310.11511 },
  URL       = {https://arxiv.org/abs/2310.11511}
}
```


In [2]:
# Read the Self-RAG training split (JSON Lines) directly from the Hugging Face Hub
train_jsonl = "hf://datasets/selfrag/selfrag_train_data/train.jsonl"
raw_df = pd.read_json(train_jsonl, lines=True)
raw_df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,instruction,output,input,id,dataset_name
0,"In this task, you are given a context paragrap...",[Retrieval]<paragraph>2017 Portland train atta...,,flan_v2_88425,flan_v2
1,"Question: Write a text based on ""rangers show ...",[No Retrieval]six opposition candidates in the...,,flan_v2_18667,flan_v2
2,You will be given a sentence. Check whether th...,[No Retrieval]1\n****\n[Utility:5],,flan_v2_87754,flan_v2
3,Q:Is there a negative or positive tone to this...,[No Retrieval]Negative[Utility:5],,flan_v2_47789,flan_v2
4,Question: Fertility Clinics Vary on Embryo Dis...,[No Retrieval]Answer: World[Utility:5],,flan_v2_10803,flan_v2


In [3]:
def check_retrieval(output) -> str:
  """Label whether retrieval took place when generating an output.

  Args:
    output: The generated text, searched for a `[Retrieval]` or
      `[No Retrieval]` marker.

  Returns:
    "true" if the text contains `[Retrieval]`; "false" if it contains
    `[No Retrieval]` but not `[Retrieval]`; "null" otherwise. Non-string
    values (e.g. None or NaN from a missing cell) also yield "null".
  """
  # A missing cell arrives as None/NaN when this is applied over a DataFrame
  # column; label it "null" rather than raising a TypeError.
  if not isinstance(output, str):
    return "null"
  # Both markers are fixed literals, so plain substring checks are enough.
  if "[Retrieval]" in output:
    return "true"
  if "[No Retrieval]" in output:
    return "false"
  # Some outputs are tagged with `[Continue to Use Evidence]` and `Dialog`.
  # We could flag these as false but we will ignore them instead.
  return "null"

In [4]:
# Label every row according to the retrieval marker found in its generated output
raw_df["retrieval"] = raw_df["output"].map(check_retrieval)

In [5]:
# Check for balance: count how many rows received each retrieval label
raw_df["retrieval"].value_counts()

Unnamed: 0_level_0,count
retrieval,Unnamed: 1_level_1
True,74219
False,70362
,1038


In [6]:
# Drop the rows labelled "null" (no retrieval marker found), then re-check the counts
is_unlabelled = raw_df["retrieval"] == "null"
unlabelled_idx = raw_df.index[is_unlabelled]
raw_df.drop(index=unlabelled_idx, inplace=True)
raw_df["retrieval"].value_counts()

Unnamed: 0_level_0,count
retrieval,Unnamed: 1_level_1
True,74219
False,70362


## 2. Text Cleaning and Preprocessing

In [13]:
# Load the small English pipeline without the components this notebook does not
# use. Only "ner" and "parser" are listed: "ents" and "pos_" are Doc/Token
# attribute names, not pipeline component names, so they do not belong in
# `disable`. The tagger and attribute_ruler are deliberately left enabled —
# NOTE(review): the lemmatizer behind `token.lemma_` presumably relies on their
# POS tags; confirm against the spaCy model documentation before disabling more.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

In [14]:
def clean_preprocess(texts):
  """Lemmatize and filter a collection of texts.

  Each text is run through the module-level `nlp` pipeline (in batches of
  5000). Tokens flagged as stop words, punctuation, or email-like are
  dropped; the lower-cased lemmas of the remaining tokens are joined with
  single spaces.

  Args:
    texts: Iterable of texts handed to `nlp.pipe`.

  Returns:
    A list with one cleaned string per document yielded by `nlp.pipe`.
  """
  def _is_kept(tok):
    # Keep a token only if it is none of: stop word, punctuation, email-like.
    return not (tok.is_stop or tok.is_punct or tok.like_email)

  return [
    " ".join(tok.lemma_.lower() for tok in parsed if _is_kept(tok))
    for parsed in nlp.pipe(texts, batch_size=5000)
  ]

In [None]:
# Clean every instruction with clean_preprocess and store the result as a new column
raw_df["cleaned_instruction"] = clean_preprocess(raw_df["instruction"].values)

In [None]:
# Preview the first rows of the frame, including the columns added above
raw_df.head()