In [None]:
import sys
import os
import pickle
import itertools
import csv
import pandas as pd
import numpy as np
from shutil import copyfile
from typing import List
from pathlib import Path
sys.path.insert(0, 'nlp_architect/models/aspect_extraction_with_kg')
import absa_utils
from absa_utils import InputExample

In [None]:
# Root folder of the input CSV datasets; the per-experiment sub-folders
# ('<source>_to_<target>_<split>') and 'labels.txt' are read from here.
data_path = 'nlp_architect/models/aspect_extraction_with_kg/data/csv/' # includes the re-labeled devices dataset
# data_path = 'nlp_architect/models/aspect_extraction_with_kg/data/csv_original/' # use this path for the original devices dataset
# Root folder that receives the knowledge-injected train/dev/test CSV files.
out_path = 'nlp_architect/models/aspect_extraction_with_kg/data/knowledge_injected/'
# Domain names; every domain is used as a source and paired with each other domain as target.
domains = ['laptops','restaurants','device']

### Load domain-specific KGs

In [None]:
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load these pickles if they come from a trusted source.
# Context managers make sure the file handles are closed right after loading.
with open('seed_terms.pkl', 'rb') as seed_terms_file:
    seed_terms = pickle.load(seed_terms_file)
# seed_dist is indexed by domain name further down (seed_dist[target]).
with open('seed_dist.pkl', 'rb') as seed_dist_file:
    seed_dist = pickle.load(seed_dist_file)

### Insert pivot tokens into datasets

In [None]:
def example_to_df(example: InputExample) -> pd.DataFrame:
    """Convert one example into a token-per-row DataFrame of strings.

    The frame has the columns TOKEN, LABEL, HEAD, HEAD_WORD, DEP_REL, POS and
    SUB_TOKENS, filled from the example's ``words``, ``labels``, ``heads``,
    ``head_words``, ``syn_rels``, ``pos_tags`` and ``sub_toks`` attributes.
    Every value is cast to ``str`` before the frame is returned.
    """
    column_values = {
        'TOKEN': example.words,
        'LABEL': example.labels,
        'HEAD': example.heads,
        'HEAD_WORD': example.head_words,
        'DEP_REL': example.syn_rels,
        'POS': example.pos_tags,
        # Sub-tokens are space-joined; entries with at most one sub-token become ''.
        'SUB_TOKENS': [' '.join(pieces) if len(pieces) > 1 else '' for pieces in example.sub_toks],
    }

    frame = pd.DataFrame(columns=list(column_values))
    for column_name, values in column_values.items():
        frame[column_name] = values

    return frame.astype(str)

def concate_mw_pivot_phrase(df: pd.DataFrame, indices: List[list]) -> pd.DataFrame:
    """Return a copy of ``df`` with pivot-token rows inserted.

    Args:
        df: token-per-row frame with the columns TOKEN, LABEL, HEAD,
            HEAD_WORD, DEP_REL, POS and SUB_TOKENS.
        indices: ``[position, pivot_token]`` pairs. ``position`` is the row
            position in the *original* frame before which the pivot row is
            inserted. Each position is shifted by the number of rows already
            inserted, so the pairs are processed in the given order.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``df`` itself is not
        modified. Each inserted row has the pivot token as TOKEN, the label
        'O', and copies HEAD, HEAD_WORD, DEP_REL, POS and SUB_TOKENS from the
        row directly before the insertion point.
    """
    augmented = df.copy()
    for already_inserted, item in enumerate(indices):
        insert_at = item[0] + already_inserted
        # The pivot inherits the syntactic attributes of the preceding row.
        donor = augmented.iloc[insert_at - 1]
        pivot_row = pd.DataFrame({'TOKEN': [item[1]],
                                  'LABEL': ['O'],
                                  'HEAD': [donor['HEAD']],
                                  'HEAD_WORD': [donor['HEAD_WORD']],
                                  'DEP_REL': [donor['DEP_REL']],
                                  'POS': [donor['POS']],
                                  'SUB_TOKENS': [donor['SUB_TOKENS']]})
        # The index is rebuilt after every insertion, so the label of the
        # new row is irrelevant.
        augmented = pd.concat(
            [augmented.iloc[:insert_at], pivot_row, augmented.iloc[insert_at:]]
        ).reset_index(drop=True)

    return augmented

def get_cn_mw_pivot_phrase_indices(df: pd.DataFrame, candidate_pos: List[str], seed_dist, pvb, pvi, kg_pct) -> List[list]:
    """Find where pivot tokens go for aspect candidates found in the seed terms.

    Each row is examined once the following row is reached, so the final row
    of ``df`` is never evaluated as a candidate. Candidates are built from
    'compound' chains, 'amod' chains followed by a candidate noun, or single
    tokens whose POS is in ``candidate_pos``. A candidate is accepted when the
    full phrase, or any suffix of it, is one of the seed terms.

    Args:
        df: token-per-row frame with at least TOKEN, DEP_REL and POS columns.
        candidate_pos: POS tags that make a token an aspect candidate.
        seed_dist: sequence of dict-like objects whose values are iterables of
            seed terms; it is flattened into one collection and not modified.
        pvb: pivot token emitted after the first token of an accepted phrase.
        pvi: pivot token emitted after each further token of the phrase.
        kg_pct: if below 1, only a random ``kg_pct`` fraction of the seed
            terms is kept (drawn with the global numpy RNG on every call).

    Returns:
        A list of ``[insert_position, pivot_token]`` pairs; empty when ``df``
        has no rows.
    """
    # Flatten every term list of every dict into one list, preserving order so
    # that the optional shuffle below depends only on the RNG state.
    flat_terms = [term
                  for level in seed_dist
                  for key in level.keys()
                  for term in level[key]]
    if kg_pct < 1:
        np.random.shuffle(flat_terms)
        flat_terms = flat_terms[:int(len(flat_terms) * kg_pct)]
    # A set gives constant-time membership tests in the loop below.
    known_terms = set(flat_terms)

    row_iterator = df.iterrows()
    first = next(row_iterator, None)
    if first is None:
        # No rows at all: nothing to inspect.
        return []
    index, last = first  # `last` is the row being judged, `index` its position

    indices_for_pivot_phrase = []
    compound_start = None
    amod_start = None
    for i, row in row_iterator:
        asp_candidate = None
        if last['DEP_REL'] == 'compound':
            if compound_start is None:
                compound_start = index
        elif compound_start is not None:
            # First non-compound token after a compound chain closes the phrase.
            asp_candidate = ' '.join(df[compound_start:i]['TOKEN'].values)
        elif last['DEP_REL'] == 'amod':
            if amod_start is None:
                amod_start = index
        elif last['POS'] in candidate_pos:
            if amod_start is None:
                asp_candidate = last['TOKEN']
            else:
                asp_candidate = ' '.join(df[amod_start:i]['TOKEN'].values)
        else:
            amod_start = None

        if asp_candidate is not None:
            words = asp_candidate.split()
            # Smallest offset whose suffix phrase is a known seed term.
            offset = next((k for k in range(len(words))
                           if ' '.join(words[k:]) in known_terms), None)
            if offset is not None:
                if compound_start is not None or amod_start is not None:
                    if amod_start is not None:
                        amod_start += offset
                    # NOTE(review): the suffix offset is applied to amod_start
                    # only; a compound phrase is always marked from its first
                    # token even when only a suffix matched - confirm intent.
                    mw_start = compound_start if compound_start is not None else amod_start
                    indices_for_pivot_phrase.append([mw_start + 1, pvb])
                    indices_for_pivot_phrase += [[pos, pvi] for pos in range(mw_start + 2, i + 1)]
                else:
                    indices_for_pivot_phrase.append([i, pvb])
            compound_start = None
            amod_start = None
        last = row
        index = i

    return indices_for_pivot_phrase

def get_stochastic_mw_gold_label_pivot_phrase_indices(df, r, fpr, pvb, pvi, candidate_pos):
    """Sample pivot insertion points from the gold labels of one sentence.

    A single uniform draw is made per call: when it is ``<= r`` a pivot is
    placed after every 'B-ASP' row (``pvb``) and every 'I-ASP' row (``pvi``).
    Every row that does not receive a pivot this way gets a spurious pivot
    with probability ``fpr``, chosen as ``pvb`` or ``pvi`` with equal
    probability. A row is only examined once its successor is reached, so the
    final row never receives a pivot.

    Args:
        df: token-per-row frame with a LABEL column.
        r: probability that the gold aspects of this sentence are marked.
        fpr: per-row probability of inserting a false-positive pivot.
        pvb: pivot token for aspect beginnings.
        pvi: pivot token for aspect continuations.
        candidate_pos: unused; kept for signature compatibility.

    Returns:
        A list of ``[insert_position, pivot_token]`` pairs; empty when ``df``
        has no rows. Uses the global numpy RNG.
    """
    row_iterator = df.iterrows()
    first = next(row_iterator, None)
    if first is None:
        # No rows: nothing to mark, and no random numbers are consumed.
        return []
    _, previous = first

    indices_for_pivot_phrase = []
    sentence_draw = np.random.uniform()  # one draw shared by the whole sentence
    for i, row in row_iterator:
        label = previous['LABEL']
        if label == 'B-ASP' and sentence_draw <= r:
            indices_for_pivot_phrase.append([i, pvb])
        elif label == 'I-ASP' and sentence_draw <= r:
            indices_for_pivot_phrase.append([i, pvi])
        elif np.random.uniform() <= fpr:
            # False positive: pick the pivot type with a fair coin.
            noise_pivot = pvb if np.random.uniform() <= 0.5 else pvi
            indices_for_pivot_phrase.append([i, noise_pivot])
        previous = row

    return indices_for_pivot_phrase

def write_example_cn_pivotPhrase_to_file(examples: List[InputExample], file: str, candidate_pos: List[str], seed_dist, domain, mode, p, r, pvb, pvi, pred_dict, prob_thresh, kg_pct) -> None:
    """Write ``examples`` to a CSV file with pivot tokens inserted.

    In 'train' mode pivots are sampled from the gold labels so that they
    reproduce a target precision ``p`` and recall ``r``; the false-positive
    rate is derived from those two values and the label counts of
    ``examples``. In every other mode pivots come from the seed terms in
    ``seed_dist``.

    Args:
        examples: examples to convert and write.
        file: path of the CSV file to create (overwritten if it exists).
        candidate_pos: POS tags that qualify a token as an aspect candidate.
        seed_dist: seed-term structure passed to the knowledge-based lookup.
        domain: unused; kept for signature compatibility.
        mode: 'train' selects gold-label sampling, anything else the lookup.
        p: target precision of the inserted pivots (train mode only).
        r: target recall of the inserted pivots (train mode only).
        pvb: pivot token for aspect beginnings.
        pvi: pivot token for aspect continuations.
        pred_dict: optional mapping with a 'prob' sequence; when given, only
            examples whose probability exceeds ``prob_thresh`` get pivots.
        prob_thresh: threshold applied to ``pred_dict['prob']``.
        kg_pct: fraction of seed terms to use in the lookup.
    """
    header = ['TOKEN', 'LABEL', 'HEAD', 'HEAD_WORD', 'DEP_REL', 'POS', 'SUB_TOKENS']

    fpr = None
    if mode == 'train':
        df_all = pd.concat([example_to_df(e) for e in examples])
        # isin() also works when one of the aspect labels never occurs, where
        # indexing value_counts() with both labels would fail.
        num_pos = df_all['LABEL'].isin(['B-ASP', 'I-ASP']).sum()
        num_neg = len(df_all) - num_pos
        tp = r * num_pos
        fp = (tp / p) - tp
        fpr = fp / num_neg

    # newline='' is what the csv module requires so that the writer controls
    # the line endings itself (avoids doubled line breaks on Windows).
    with open(file, 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)

        for i, example in enumerate(examples):
            df = example_to_df(example)
            df['TOKEN'] = df['TOKEN'].str.lower()
            if mode == 'train':
                indices = get_stochastic_mw_gold_label_pivot_phrase_indices(df, r, fpr, pvb, pvi, candidate_pos)
            elif pred_dict is None or pred_dict['prob'][i] > prob_thresh:
                indices = get_cn_mw_pivot_phrase_indices(df, candidate_pos, seed_dist, pvb, pvi, kg_pct)
            else:
                indices = []
            df = concate_mw_pivot_phrase(df, indices)
            writer.writerows(df.values.tolist())
            # Two placeholder rows are written after every example.
            writer.writerow(['_'] * 7)
            writer.writerow(['_'] * 7)
            
def score_mw_pivot_insertion_accuracy(examples, pvb, pvi):
    """Score inserted pivot tokens against the gold aspect labels.

    For every row the "prediction" is the TOKEN of the following row (the
    last row is compared with itself). A 'B-ASP' row is a true positive when
    the prediction equals ``pvb``, an 'I-ASP' row when it equals ``pvi``;
    otherwise they are false negatives. Any other row is a false positive
    when the prediction is one of the two pivots and a true negative
    otherwise.

    Args:
        examples: examples that already contain the inserted pivot tokens.
        pvb: pivot token for aspect beginnings.
        pvi: pivot token for aspect continuations.

    Returns:
        A pair ``(metrics, records)``. ``metrics`` is
        ``[precision, recall, f1, n_tp, n_fp, n_fn, n_tn]``; a ratio whose
        denominator is zero is reported as 0.0 instead of raising.
        ``records`` maps 'tp', 'fp', 'fn' and 'tn' to lists of
        ``[token, example_df, pos, dep_rel, pos_dep_rel, label, prediction]``.
    """
    tp = []; fn = []; fp = []; tn = []

    for example in examples:
        example_df = example_to_df(example)
        tokens = example_df['TOKEN'].tolist()
        pos_tags = example_df['POS'].tolist()
        dep_rels = example_df['DEP_REL'].tolist()
        labels = example_df['LABEL'].tolist()
        last_position = len(tokens) - 1

        for i, (token, pos, dep_rel, label) in enumerate(zip(tokens, pos_tags, dep_rels, labels)):
            # The pivot, if any, sits directly after the token it marks.
            prediction = tokens[min(i + 1, last_position)]
            record = [token, example_df, pos, dep_rel, pos + '_' + dep_rel, label, prediction]
            if label == 'B-ASP':
                bucket = tp if prediction == pvb else fn
            elif label == 'I-ASP':
                bucket = tp if prediction == pvi else fn
            else:
                bucket = fp if prediction in [pvb, pvi] else tn
            bucket.append(record)

    n_tp, n_fp, n_fn = len(tp), len(fp), len(fn)
    # Guard the ratios: with no predicted pivots or no gold aspects the
    # denominators are zero.
    precision = n_tp / (n_tp + n_fp) if (n_tp + n_fp) else 0.0
    recall = n_tp / (n_tp + n_fn) if (n_tp + n_fn) else 0.0
    f1 = n_tp / (n_tp + 0.5 * (n_fp + n_fn)) if (n_tp + n_fp + n_fn) else 0.0

    return [precision, recall, f1, n_tp, n_fp, n_fn, len(tn)], {'tp' : tp, 'fp' : fp, 'fn' : fn, 'tn' : tn}

In [None]:
# Seed once so that the stochastic pivot insertion in the train sets is reproducible.
np.random.seed(7)
kg_pct_test = 1  # fraction of seed terms used when injecting into the test sets
candidate_pos = ['NN', 'NNS', 'NNP']

for model_name in ['bert', 'deberta']:
    # Each model family uses its own pair of pivot tokens.
    if model_name == 'bert':
        pvb = 'reltodomainb'
        pvi = 'reltodomaini'
    elif model_name == 'deberta':
        pvb = '--------------------------------'
        pvi = '----------------------------------------------------------------'
    else:
        raise NotImplementedError

    for source in domains:
        # Iterate the targets in list order. Iterating a set of strings has a
        # hash-dependent order that can change between kernel sessions, and
        # since a single global seed is used, that would change the random
        # draws each train set receives.
        for target in [d for d in domains if d != source]:
            for split in range(1, 4):
                in_folder = source + '_to_' + target + '_' + str(split)
                in_dir = os.path.join(data_path, in_folder)
                out_folder = os.path.join(model_name, in_folder)
                out_dir = os.path.join(out_path, out_folder)
                Path(out_dir).mkdir(parents=True, exist_ok=True)

                # Prepare the dev dataset
                examples = absa_utils.read_examples_from_file(data_dir=in_dir, mode='dev')
                write_example_cn_pivotPhrase_to_file(examples, os.path.join(out_dir, 'dev.csv'), candidate_pos,
                                                     seed_dist[target], domain = target, mode='dev', p = None,
                                                     r = None, pvb = pvb, pvi = pvi, pred_dict = None, prob_thresh = None, kg_pct = 1)

                # Prepare the test dataset
                examples = absa_utils.read_examples_from_file(data_dir=in_dir, mode='test')
                write_example_cn_pivotPhrase_to_file(examples, os.path.join(out_dir, 'test.csv'), candidate_pos,
                                                     seed_dist[target], domain = target, mode='test', p = None,
                                                     r = None, pvb = pvb, pvi = pvi, pred_dict = None, prob_thresh = None, kg_pct = kg_pct_test)

                # Estimate precision and recall of knowledge injection on the dev dataset
                # (read back from the file that was just written, pivots included).
                examples = absa_utils.read_examples_from_file(data_dir=out_dir, mode='dev')
                results, examples_dict = score_mw_pivot_insertion_accuracy(examples, pvb, pvi)
                results = pd.DataFrame(results).transpose()
                results.columns = ['precision', 'recall', 'f1', 'tp', 'fp', 'fn', 'tn']

                # Prepare the train dataset: pivots are sampled from the gold labels
                # at the precision/recall measured on the dev set above.
                examples = absa_utils.read_examples_from_file(data_dir=in_dir, mode='train')
                write_example_cn_pivotPhrase_to_file(examples, os.path.join(out_dir, 'train.csv'), candidate_pos,
                                                     seed_dist[source], domain = source, mode='train', p = results['precision'][0],
                                                     r = results['recall'][0], pvb = pvb, pvi = pvi, pred_dict = None, prob_thresh = None, kg_pct = 1)

    copyfile(os.path.join(data_path, 'labels.txt'), os.path.join(out_path, model_name, 'labels.txt'))