In [1]:
import numpy as np
import pandas as pd
import os
import copy
from tqdm import tqdm

### Load data

In [2]:
dataPath = './origin_data/test_untagged.txt'
full_data = pd.read_csv(dataPath, header=None, sep='\t').iloc[:, 2:]
full_data.columns = ['e1', 'e2', 'rel', 'sen']

### Add entity tag

In [4]:
def _add_entity_tag(row):
    token_sen = row['sen'].split()
    out_token_sen = copy.deepcopy(token_sen)
    update_list_e1 = []
    update_list_e2 = []
    for i, j in enumerate(token_sen):
        if j == row['e1']:
            tmp = i+len(update_list_e1)+len(update_list_e2)
            out_token_sen.insert(tmp, '[E1]')
            out_token_sen.insert(tmp+2, '[/E1]')
            
            update_list_e1.append(tmp)
            update_list_e1.append(tmp+2)
        if j == row['e2']:
            tmp = i+len(update_list_e1)+len(update_list_e2)
            update_list_e2.append(tmp)
            update_list_e2.append(tmp+2)
            out_token_sen.insert(tmp, '[E2]')
            out_token_sen.insert(tmp+2, '[/E2]')
    temp_row = copy.deepcopy(row)
    temp_row['sen'] = ' '.join(out_token_sen)
    return ' '.join(out_token_sen), temp_row

In [5]:
# Function verification
print(_add_entity_tag(full_data.iloc[0]))

('the occasion was suitably exceptional : a reunion of the 1970s-era sam rivers trio , with [E1] dave_holland [/E1] on bass and [E2] barry_altschul [/E2] on drums .', e1                                          dave_holland
e2                                        barry_altschul
rel                                                  NaN
sen    the occasion was suitably exceptional : a reun...
Name: 0, dtype: object)


In [37]:
def prepare_bert_data(dataPath):
    full_data = pd.read_csv(dataPath, header=None, sep='\t').iloc[:, 2:]
    full_data.columns = ['e1', 'e2', 'rel', 'sen']
    tagged_sen = []
    row_list = []
    with tqdm(total=len(full_data)) as pbar:
        for _, row in full_data.iterrows():
            temp_sen, temp_row = _add_entity_tag(row)
            tagged_sen.append(temp_sen)
            if len(temp_row['sen'].split())<512:
                row_list.append(temp_row)
            pbar.update(1)
    full_data.drop(columns='sen')
    full_data['seq'] = tagged_sen
    full_data = full_data.fillna(value='UNK')
    
    cleaned_df = pd.DataFrame(row_list)
    cleaned_df = cleaned_df.fillna(value='UNK')
    cleaned_df = cleaned_df.iloc[:, 2:]
    cleaned_df.to_csv(dataPath[:-4]+'_filtered.txt', index=False, sep='\t')
    full_data.to_csv(dataPath[:-4]+'_bert.txt', index=False, sep='\t')
    

In [38]:
def _clean_text(dataPath):
    output = []
    with open(dataPath, 'r') as origin_file:
        baselen = 0
        n_line = 1

        for line in origin_file.readlines():
            line = line.strip()
            token = line.split('\t')
            if baselen == 0:
                baselen = len(token)
            else:
                if len(token) != baselen:
                    print(token)
                    print(n_line)
            n_line += 1
            temp = '\t'.join(token[:6])+'\n'
            output.append(temp)
    os.rename(dataPath, dataPath[:-4]+'_original.txt')
    with open(dataPath, 'w') as outfile:
        outfile.writelines(output)

In [42]:
prepare_bert_data('./origin_data/test.txt')

100%|██████████| 172448/172448 [02:30<00:00, 1144.63it/s]


In [21]:

tagged_sen = []
with tqdm(total=len(full_data)) as pbar:
    for _, row in full_data.iterrows():
        tagged_sen.append(_add_entity_tag(row))
        pbar.update(1)

100%|██████████| 70730/70730 [00:48<00:00, 1449.71it/s]


In [24]:
print(full_data.iloc[0]['tagged'])

sen. charles e. schumer called on federal safety officials yesterday to reopen their investigation into the fatal crash of a passenger jet in [E2] belle_harbor [/E2] , [E1] queens [/E1] , because equipment failure , not pilot error , might have been the cause .


In [49]:
def convert_filter(dataPath):
    df = pd.read_csv(dataPath, sep='\t', header=None)
    df.columns=['labels', 'text']
#    df.to_json(dataPath[:-3]+'json', orient='records')
    df.to_json(dataPath[:-3]+'json')

In [50]:
convert_filter('./origin_data/train_filtered.txt')

In [24]:
def filter_long(path):
    df = pd.read_csv(path, header=None, sep='\t')
    temp = []
    for _, row in df.iterrows():
        token = row.iloc[-1].split()
        if len(token)<480:
            temp.append(row)
        else:
            print(len(token))
    print(len(temp))
    print(len(df))

In [25]:
filter_long('./origin_data/test.txt')

172448
172448


In [55]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
def convert_label(path):
    
    df = pd.read_csv(path, header=None, sep='\t')
    if not hasattr(le, 'classes_'):
        le.fit(df.iloc[:, 0])
    df.iloc[:, 0] = le.transform(df.iloc[:, 0])
    
    df.to_csv(path, header=False, index=False, sep='\t')

In [56]:
convert_label('./origin_data/train_filtered.txt')

In [51]:
print(hasattr(le, 'classes_'))

False


In [54]:
convert_label('./origin_data/test_filtered.txt')