In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

# Standard library
import gc
import os

# Third-party
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import spacy

# Input data files are available in the "../input/" directory.
# Running this cell (click run or press Shift+Enter) lists the entries found there.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['gap-coreference', 'gendered-pronoun-resolution']


In [2]:
# Root of the mounted input data; each dataset lives in its own sub-folder.
DATA_ROOT = '../input/'
GAP_DATA_FOLDER, SUB_DATA_FOLDER = (
    os.path.join(DATA_ROOT, folder_name)
    for folder_name in ('gap-coreference', 'gendered-pronoun-resolution')
)

# Import Data

In [3]:
# NOTE(review): the variable names do not match the file names
# (train <- gap-test.tsv, test <- gap-development.tsv). This is presumably
# deliberate -- confirm before renaming anything.
test_df_path = os.path.join(GAP_DATA_FOLDER, 'gap-development.tsv')
train_df_path = os.path.join(GAP_DATA_FOLDER, 'gap-test.tsv')
dev_df_path = os.path.join(GAP_DATA_FOLDER, 'gap-validation.tsv')

train_df = pd.read_csv(train_df_path, sep='\t')
test_df = pd.read_csv(test_df_path, sep='\t')
dev_df = pd.read_csv(dev_df_path, sep='\t')

# Stack the three splits into one frame. ignore_index=True already yields a
# fresh 0..n-1 index, so no additional reset_index is needed here.
df = pd.concat([train_df, dev_df, test_df], axis=0, ignore_index=True)

# The individual splits are no longer needed; release their memory.
del train_df, test_df, dev_df
gc.collect()

# Shuffle all rows. The seed is fixed so that a "Restart & Run All" produces
# the same row order (and therefore the same output file) every time.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

## TODO: Load Test Data in The Second Phase

In [4]:
# Placeholder for the second-phase test set: for now it re-reads the
# tab-separated development file from the GAP folder.
test_df_path2 = os.path.join(GAP_DATA_FOLDER, "gap-development.tsv")
test_df = pd.read_csv(test_df_path2, delimiter='\t')

# Preprocessing

In [5]:
# Load spaCy's small English pipeline ('en_core_web_sm') with the 'parser',
# 'tagger', 'ner' and 'textcat' components disabled.
# NOTE(review): `nlp` is not used by the cells visible here -- presumably it is
# needed further down (e.g. for tokenisation); confirm or remove.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner', 'textcat'])

## Clean Text

### Clean up Entity Names
Replace the entity names A and B with the fixed stand-in names John and Bob (`A_NAME` and `B_NAME` below).

In [6]:
# Temporary placeholder tokens substituted for entity A / entity B while the
# text is being rewritten.
AT_NAME, BT_NAME = "AAAAAAXXXXXXXXX", "BBBBBBXXXXXXXXX"
# Final stand-in names that replace the placeholders.
A_NAME, B_NAME = 'John', 'Bob'

def find_all_substring(a_str, sub):
    """Return the start index of every non-overlapping occurrence of ``sub``.

    The string is scanned left to right and the search resumes after the end
    of each match, i.e. the same matching rule ``str.count`` and
    ``str.replace`` use.

    An empty ``sub`` matches at every position ``0..len(a_str)`` (again the
    same convention as ``str.count``); previously this case never terminated.

    Args:
        a_str: The string to search in.
        sub: The substring to look for.

    Returns:
        A list of start indices in ascending order (empty if there is no match).
    """
    # An empty needle has length 0, so always step forward by at least one
    # character; otherwise the scan would find the same position forever.
    step = len(sub) or 1  # use step = 1 to find overlapping matches

    result = []
    start = a_str.find(sub)
    while start != -1:
        result.append(start)
        start = a_str.find(sub, start + step)
    return result

def _update_offset(text, old_, new_, offset):
    len_in = len(new_) - len(old_)
    text_ = text[0:offset]
    return offset + len_in * len(find_all_substring(text_, old_))
    
def replace_entity_name(text, a_name, b_name, a_offset, b_offset, p_offset):
    """Rewrite ``text`` so entity A reads as ``A_NAME`` and entity B as ``B_NAME``.

    The rewrite happens in three passes; after every single replacement the
    three character offsets are shifted with ``_update_offset`` so that they
    keep pointing at the same place in the text:

    1. Each full name is replaced by its placeholder (``AT_NAME`` for A,
       ``BT_NAME`` for B). The longer name is processed first, so when one
       name contains the other the longer occurrences are replaced before the
       shorter name can match inside them.
    2. Each whitespace-separated part of a name is replaced by the same
       placeholder.
    3. The placeholders are replaced by ``A_NAME`` / ``B_NAME``.

    Name fragments that are empty, or that occur inside one of the
    placeholders (e.g. the "B" of "Mel B"), are skipped: replacing them would
    rewrite placeholders that were already inserted and corrupt the text.

    Args:
        text: The passage containing both entities and the pronoun.
        a_name: Surface form of entity A (surrounding whitespace is ignored).
        b_name: Surface form of entity B (surrounding whitespace is ignored).
        a_offset: Character offset of the A mention in ``text``.
        b_offset: Character offset of the B mention in ``text``.
        p_offset: Character offset of the pronoun in ``text``.

    Returns:
        A tuple ``(text, a_offset, b_offset, p_offset)`` with the rewritten
        text and the shifted offsets. ``a_offset`` always refers to entity A
        and ``b_offset`` to entity B, regardless of the processing order.
    """
    a_name = a_name.strip()
    b_name = b_name.strip()

    a_spec = (a_name, AT_NAME, A_NAME)
    b_spec = (b_name, BT_NAME, B_NAME)
    # Only the processing order depends on the name lengths; each entity keeps
    # its own placeholder, final name and offset.
    ordered = (b_spec, a_spec) if len(a_name) < len(b_name) else (a_spec, b_spec)

    offsets = [a_offset, b_offset, p_offset]

    def substitute(old, new):
        """Replace ``old`` by ``new`` in the text and shift all offsets."""
        nonlocal text, offsets
        # Offsets must be computed against the text *before* it is replaced.
        offsets = [_update_offset(text, old, new, offset) for offset in offsets]
        text = text.replace(old, new)

    def is_safe(fragment):
        """True if replacing ``fragment`` cannot damage an inserted placeholder."""
        return bool(fragment) and fragment not in AT_NAME and fragment not in BT_NAME

    # Pass 1: whole names -> placeholders.
    for name, placeholder, _ in ordered:
        if is_safe(name):
            substitute(name, placeholder)

    # Pass 2: individual name parts -> placeholders. split() without an
    # argument drops the empty parts that repeated spaces would produce.
    for name, placeholder, _ in ordered:
        for part in name.split():
            if is_safe(part):
                substitute(part, placeholder)

    # Pass 3: placeholders -> final stand-in names.
    for _, placeholder, final_name in ordered:
        substitute(placeholder, final_name)

    a_offset, b_offset, p_offset = offsets
    return text, a_offset, b_offset, p_offset

In [7]:
def entity_replace_func(row):
    """Return a copy of ``row`` with both entity names replaced.

    The row's text and offsets are passed through ``replace_entity_name``;
    the copy then carries the rewritten text, the fixed names ``A_NAME`` and
    ``B_NAME`` in the 'A' / 'B' columns, and the shifted offsets. The input
    row itself is left untouched.
    """
    new_text, new_a_offset, new_b_offset, new_p_offset = replace_entity_name(
        row['Text'],
        row['A'],
        row['B'],
        row['A-offset'],
        row['B-offset'],
        row['Pronoun-offset'],
    )

    replacements = (
        ('Text', new_text),
        ('A', A_NAME),
        ('B', B_NAME),
        ('A-offset', new_a_offset),
        ('B-offset', new_b_offset),
        ('Pronoun-offset', new_p_offset),
    )

    updated_row = row.copy()
    for column, value in replacements:
        updated_row[column] = value
    return updated_row

In [8]:
# Rewrite every row (row-wise apply) of both frames so the entity names are
# replaced and the offsets shifted accordingly.
df = df.apply(entity_replace_func, axis='columns')
test_df = test_df.apply(entity_replace_func, axis='columns')

In [9]:
# Persist both processed frames to the working directory, without the index column.
df.to_csv(path_or_buf="gap-phase1.csv", index=False)
test_df.to_csv(path_or_buf="gap-phase2.csv", index=False)