In [1]:
# default_exp squad_utils

### Create dataset in SQUAD format
> Create json files for question answering 

In [2]:
#export
from fastai.core import *

Here is a sentiment extraction dataset with `text`, `selected_text` and `sentiment` columns. This problem can also be formulated as a question answering problem: we will consider `text` as the `context`, `sentiment` as the `question`, and `selected_text` as the answer.

In [6]:
# Directory that holds the raw `train.csv` and `test.csv` files (relative path)
data_path = Path("../data/")

In [82]:
# Training rows containing any missing value are dropped and the index is renumbered;
# the test set is loaded as-is.
train_df = (
    pd.read_csv(data_path/'train.csv')
    .dropna()
    .reset_index(drop=True)
)
test_df = pd.read_csv(data_path/'test.csv')

In [83]:
# Peek at the first two training rows
train_df.head(2)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative


In [84]:
# Peek at the first two test rows
test_df.head(2)

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive


The following is what a [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) data sample looks like.

In [48]:
#hide
# Single-article example of the target JSON layout:
#   data -> [article] -> paragraphs -> [{context, qas}]
# Each entry of `qas` carries a question, an id, an `is_impossible` flag and a list of
# answers, where every answer is the answer text plus its `answer_start` offset.
squad_sample = {
    "version": "v2.0",
    "data": [
        {
            "title": "Beyonc\u00e9",
            "paragraphs": [
                {
                    "qas": [
                        {
                            "question": "When did Beyonce start becoming popular?",
                            "id": "56be85543aeaaa14008c9063",
                            "answers": [
                                {
                                    "text": "in the late 1990s",
                                    "answer_start": 269
                                }
                            ],
                            "is_impossible": False
                        }
                    ],
                    "context": "Beyonc\u00e9 Giselle Knowles-Carter (/bi\u02d0\u02c8j\u0252nse\u026a/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyonc\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\"."
                }
            ]
        }
    ]
}

In [49]:
# Display the sample structure defined above in this notebook
squad_sample

{'version': 'v2.0',
 'data': [{'title': 'Beyoncé',
   'paragraphs': [{'qas': [{'question': 'When did Beyonce start becoming popular?',
       'id': '56be85543aeaaa14008c9063',
       'answers': [{'text': 'in the late 1990s', 'answer_start': 269}],
       'is_impossible': False}],
     'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'}]}]

In [50]:
# Pick one training row at random (unseeded, so it differs from run to run) and
# unpack it into the SQuAD roles: id, context, answer and question.
row = train_df.iloc[np.random.choice(range(len(train_df)))]
text_id, context, answer, question = (
    row[column] for column in ('textID', 'text', 'selected_text', 'sentiment')
)
text_id, context, answer, question

('7cad826aa8',
 ' Ive been wanting 2 but will be gone 4 2 mo. & have to close my shops!',
 'Ive been wanting 2 but will be gone 4 2 mo. & have to close my shops!',
 'neutral')

In [51]:
#export
def get_answer_start(context, answer):
    """Return the character offset of the first occurrence of `answer` in `context`.

    Args:
        context: the full text to search in.
        answer: the exact substring to locate.

    Returns:
        The 0-based index `i` such that `context[i:i+len(answer)] == answer`.
        An empty `answer` is considered to match at offset 0.

    Raises:
        ValueError: if `answer` does not occur verbatim in `context`.
    """
    # str.find does the same left-to-right scan as a manual slice-and-compare loop,
    # without building a new slice at every position.
    answer_start = context.find(answer)
    if answer_start == -1:
        raise ValueError("No overlapping segment found")
    return answer_start

In [52]:
# test
# The returned offset must point at an exact copy of `answer` inside `context`.
start_idx = get_answer_start(context, answer)
end_idx = start_idx + len(answer)
assert context[start_idx:end_idx] == answer

In [53]:
#export
def generate_qas_dict(text_id, context, answer, question, impossible=False):
    """Build one SQuAD-style question/answer record.

    Args:
        text_id: identifier stored under `id`.
        context: text in which `answer` is located.
        answer: answer text, or `None` when no answer is available.
        question: text stored under `question`.
        impossible: value stored under `is_impossible`.

    Returns:
        A dict with the keys `question`, `id`, `is_impossible` and `answers`.
        `answers` is empty when `answer` is `None`; otherwise it holds a single
        entry with the answer text and its start offset within `context`.
    """
    if answer is None:
        answers = []
    else:
        answers = [{"text": answer, "answer_start": get_answer_start(context, answer)}]

    return {
        'question': question,
        'id': text_id,
        'is_impossible': impossible,
        'answers': answers,
    }

In [54]:
# Try the helper on the sampled row
generate_qas_dict(text_id, context, answer, question)

{'question': 'neutral',
 'id': '7cad826aa8',
 'is_impossible': False,
 'answers': [{'text': 'Ive been wanting 2 but will be gone 4 2 mo. & have to close my shops!',
   'answer_start': 1}]}

In [58]:
#export
def create_squad_from_df(df):
    """Convert a dataframe into a SQuAD-style dict with one article per row.

    Each row supplies `textID` (used as both the question id and the article title),
    `text` (the context) and `sentiment` (the question). When the frame has a
    `selected_text` column it is used as the answer; otherwise the answer is `None`.

    Returns:
        `{"version": "v2.0", "data": [...]}` where every entry of `data` holds a single
        paragraph with a single question.
    """
    has_answer = 'selected_text' in df.columns

    def _row_to_entry(row):
        # One article -> one paragraph -> one question.
        text_id, context = row['textID'], row['text']
        answer = row['selected_text'] if has_answer else None
        qas = generate_qas_dict(text_id, context, answer, row['sentiment'])
        return {"paragraphs": [{"qas": [qas], "context": context}], "title": text_id}

    return {"version": "v2.0", "data": [_row_to_entry(row) for _, row in df.iterrows()]}

In [64]:
# Convert the full training frame to the SQuAD layout
train_squad_dict = create_squad_from_df(train_df)

In [65]:
# Make sure the output directory exists (no error if it is already there)
os.makedirs("../squad_data", exist_ok=True)

In [66]:
#export
def save_dict_as_json(d, fname):
    """Serialize `d` to JSON and write it to `fname`, replacing any existing file."""
    with open(fname, "w") as out_file:
        serialized = json.dumps(d)
        out_file.write(serialized)

In [67]:
# Persist the full training set in SQuAD format
save_dict_as_json(train_squad_dict, "../squad_data/train_squad_data.json")

In [69]:
#export
def read_json_as_dict(fname):
    """Load the JSON document stored at `fname` and return the parsed object.

    The file is opened in a `with` block so the handle is closed as soon as parsing
    finishes (or fails), instead of being left for the garbage collector.
    """
    with open(fname) as f:
        return json.load(f)

In [70]:
# Read the training JSON back from disk
train_squad_dict = read_json_as_dict("../squad_data/train_squad_data.json")

In [73]:
# Spot-check one randomly chosen entry of the loaded data
np.random.choice(train_squad_dict['data'])

{'paragraphs': [{'qas': [{'question': 'neutral',
     'id': '1c779efb57',
     'is_impossible': False,
     'answers': [{'text': '_of_the_dead I met a stranger just 10 min ago. He was stalking me at the store',
       'answer_start': 0}]}],
   'context': '_of_the_dead I met a stranger just 10 min ago. He was stalking me at the store'}],
 'title': '1c779efb57'}

### Create KFold Stratified Data

In [75]:
from sklearn.model_selection import StratifiedKFold

In [76]:
# Five shuffled folds; the fixed `random_state` makes the fold assignment repeatable
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

In [79]:
# Output directory for the SQuAD-format JSON files; the last line lists its current contents
SQUAD_DATA_PATH = Path("../squad_data/")
SQUAD_DATA_PATH.ls()

[PosixPath('../squad_data/train_squad_data.json')]

In [80]:
# Write one train/valid pair of SQuAD JSON files per fold, stratified on sentiment.
# NOTE: `train_squad_dict` / `valid_squad_dict` are rebound on every iteration, so after
# the loop they hold the splits of the last fold.
fold_indices = skf.split(train_df, train_df.sentiment.values)
for foldnum, (trn_idx, val_idx) in enumerate(fold_indices):
    trn_fold_df, val_fold_df = train_df.iloc[trn_idx], train_df.iloc[val_idx]
    train_squad_dict = create_squad_from_df(trn_fold_df)
    valid_squad_dict = create_squad_from_df(val_fold_df)
    train_fold_path = SQUAD_DATA_PATH/f"train_squad_data_{foldnum}.json"
    valid_fold_path = SQUAD_DATA_PATH/f"valid_squad_data_{foldnum}.json"
    save_dict_as_json(train_squad_dict, train_fold_path)
    save_dict_as_json(valid_squad_dict, valid_fold_path)

In [108]:
# Number of entries in the train / valid dicts currently bound to these names
len(train_squad_dict['data']), len(valid_squad_dict['data'])

(21984, 5496)

In [85]:
# `test_df` has no `selected_text` column, so every generated question gets an
# empty `answers` list.
test_squad_dict = create_squad_from_df(test_df)
test_squad_path = SQUAD_DATA_PATH/"test_squad_data.json"
save_dict_as_json(test_squad_dict, test_squad_path)

### Check json files

In [109]:
# Collect, for every fold, the set of question ids stored in its validation file.
# The fold count comes from the splitter rather than a second hard-coded 5.
val_ids = []
for i in range(skf.get_n_splits()):
    val_dict = read_json_as_dict(SQUAD_DATA_PATH/f"valid_squad_data_{i}.json")
    # Each entry holds a single paragraph with a single question (see create_squad_from_df).
    val_ids.append({entry['paragraphs'][0]['qas'][0]['id'] for entry in val_dict['data']})

In [111]:
# Number of distinct ids in each validation file
for ids in val_ids:
    print(len(ids))

5496
5496
5496
5496
5496


In [121]:
# Every id must appear in exactly one validation file, i.e. the folds are pairwise
# disjoint. Intersecting all sets at once would only catch ids present in *every*
# fold, so an overlap between just two folds would go unnoticed; comparing the summed
# sizes with the size of the union catches any duplicate.
all_val_ids = set.union(*val_ids)
assert sum(len(ids) for ids in val_ids) == len(all_val_ids)

### export

In [122]:
from nbdev.export import notebook2script
# Run nbdev's notebook-to-script export
notebook2script()

Converted 00_core.ipynb.
Converted 01-squad-utils.ipynb.
