In [None]:
import os
import sys
import requests
import pandas as pd
import pickle
import nltk
import inflect
from tqdm import tqdm
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import Parallel, delayed
from requests.adapters import HTTPAdapter, Retry
sys.path.insert(0, 'nlp_architect/models/aspect_extraction_with_kg')
import absa_utils

In [None]:
# Root folder of the cross-domain CSV splits; this one includes the re-labeled devices dataset.
data_path = 'nlp_architect/models/aspect_extraction_with_kg/data/csv/'
# data_path = 'nlp_architect/models/aspect_extraction_with_kg/data/csv_original/' # use this path for the original devices dataset

# Domain names; they are combined into '<source>_to_<target>_<n>' folder names further down.
domains = [
    'laptops',
    'restaurants',
    'device',
]

### Generate seed terms from unlabeled text using TF-IDF

In [None]:
# Build one noun-only "document" per test example for every target domain, fit a
# single TF-IDF model over all of them, then average the scores per domain.
docs = {}
tfidf = {}
# NOTE(review): 'compound' looks like a dependency label rather than a POS tag —
# confirm against what absa_utils puts in `pos_tags`.
noun_tags = ['NN', 'NNS', 'NNP', 'compound']
for target in domains:
    docs[target] = []
    # The target's test split is read from the folders of the first *other* domain.
    source = [i for i in domains if i != target][0]
    folders = [source + '_to_' + target + '_' + str(split) for split in (1, 2, 3)]
    for folder in folders:
        examples = absa_utils.read_examples_from_file(data_dir=(data_path + folder), mode='test')
        nouns = [
            ' '.join(ex.words[w] for w in range(len(ex.words)) if ex.pos_tags[w] in noun_tags).lower()
            for ex in examples
        ]
        docs[target].extend(nouns)

all_docs = []
for target in docs:
    all_docs += docs[target]

vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 1), binary=False)
X = vectorizer.fit_transform(all_docs)
# get_feature_names() was removed in scikit-learn 1.2; only fall back to it on
# releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
df = pd.DataFrame(X.toarray(), columns=feature_names)

# Rows of `df` follow the order in which `all_docs` was assembled, so each domain
# owns one contiguous row range. A running offset works for any number of domains
# and, unlike a negative slice, stays correct when a domain has zero documents.
start = 0
for target in docs:
    stop = start + len(docs[target])
    tfidf[target] = df.iloc[start:stop].agg('mean').sort_values(ascending=False)
    start = stop

In [None]:
# Drop candidates that are not domain-specific: a token is removed when it ranks
# among the top 11 TF-IDF terms of *every* domain. (The previous expression
# intersected 'device' twice and never consulted 'laptops'.)
remove_tokens = list(set.intersection(*[set(tfidf[domain].iloc[:11].index) for domain in domains]))
for target in domains:
    # Keep the 10 highest-scoring remaining terms; their index is later used as the seed terms.
    tfidf[target] = tfidf[target].loc[~tfidf[target].index.isin(remove_tokens)].iloc[:10]

### Query domain-specific KGs from ConceptNet

In [None]:
def query_edges(query_id, depth, max_edges, rel_types, cn_url, s):
    """Fetch the edges that connect one ConceptNet node to English nodes.

    Args:
        query_id: Node URI placed in the ``node=`` query parameter
            (callers pass values such as ``'/c/en/' + seed``).
        depth: Value stored under the ``'depth'`` key of every returned edge.
        max_edges: Pagination bound. Another page is requested only while
            ``20 * pages_fetched < max_edges``; the result is not truncated to
            this number. (Assumes 20 edges per page — TODO confirm against the API.)
        rel_types: Iterable of relation identifiers appended as ``&rel=``;
            a ``None`` entry issues the query without a relation filter.
        cn_url: Base URL of the API, with or without a trailing slash.
        s: Session-like object whose ``get(url)`` returns a response with ``.json()``.

    Returns:
        A list of edge dicts taken from the ``'edges'`` field of each response,
        each annotated with ``'depth'``, excluding edges whose relation label
        is ``'ExternalURL'``.
    """
    # URLs always use '/', so build them by hand instead of os.path.join (which
    # is platform dependent); stripping avoids '//' when cn_url ends with '/'.
    base_url = cn_url.rstrip('/')
    edges = []
    for rel in rel_types:
        query_url = base_url + '/query?node=' + query_id + '&other=/c/en'
        if rel is not None:
            query_url += '&rel=' + rel
        response = s.get(query_url).json()
        edges.extend(response['edges'])
        page = 1
        # Follow the server-provided 'nextPage' links until the bound is reached.
        while 'nextPage' in response.get('view', {}) and 20 * page < max_edges:
            response = s.get(base_url + response['view']['nextPage']).json()
            edges.extend(response['edges'])
            page += 1
    for edge in edges:
        edge['depth'] = depth
    return [edge for edge in edges if edge['rel']['label'] != 'ExternalURL']

def query_relatedness(word1, word2, cn_url, s):
    """Ask the ConceptNet API how related two English terms are.

    Args:
        word1: First term, appended verbatim after ``/c/en/``.
        word2: Second term, appended verbatim after ``/c/en/``.
        cn_url: Base URL of the API, with or without a trailing slash.
        s: Session-like object whose ``get(url)`` returns a response with ``.json()``.

    Returns:
        The ``'value'`` field of the JSON response, or ``0`` when the response
        has no such field.
    """
    # NOTE(review): terms are inserted as-is; multi-word labels contain spaces —
    # confirm the API resolves those the way the caller expects.
    # Plain string concatenation keeps '/' separators on every platform.
    url = cn_url.rstrip('/') + '/relatedness?node1=/c/en/' + word1 + '&node2=/c/en/' + word2
    response = s.get(url).json()
    return response.get('value', 0)

def query_subgraph(seed_terms, max_edges, max_depth, min_relatedness, rel_types, cn_url):
    """Expand a ConceptNet neighbourhood breadth-first around each seed term.

    For every seed, edges are fetched with ``query_edges`` and processed in
    FIFO order. Both endpoints of an edge are considered; an endpoint is
    accepted when its label differs from the seed, it is English (or carries
    no ``'language'`` field), it was not already accepted at that depth, and
    either the edge is at depth 1 or ``query_relatedness`` to the seed is at
    least ``min_relatedness``. Accepted nodes below ``max_depth`` are expanded
    in turn.

    Args:
        seed_terms: Iterable of seed strings; each is queried as ``'/c/en/' + seed``.
        max_edges: Pagination bound forwarded to ``query_edges``.
        max_depth: Deepest level to collect (depths run from 1 to ``max_depth``).
        min_relatedness: Relatedness threshold applied to nodes deeper than depth 1.
        rel_types: Relation filters forwarded to ``query_edges``.
        cn_url: Base URL of the API.

    Returns:
        ``(seed_dist, seed_edges)``: two lists with one entry per seed. Each
        entry is a dict mapping depth to, respectively, the accepted node
        labels and the edges through which they were accepted.
    """
    from collections import deque  # local import keeps this cell self-contained

    seed_dist = []
    seed_edges = []

    retries = Retry(total=5,
                    backoff_factor=0.1,
                    status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries)

    # The context manager releases pooled connections once the crawl is done.
    with requests.Session() as s:
        s.mount('http://', adapter)
        s.mount('https://', adapter)  # same retry policy if cn_url is an https URL

        for seed in tqdm(seed_terms):
            # deque gives O(1) pops from the front and O(k) appends; the list
            # version copied the whole queue on every pop and every expansion.
            queue = deque(query_edges('/c/en/' + seed, 1, max_edges, rel_types, cn_url, s))
            n = 0
            node_dist = {k: [] for k in range(1, max_depth + 1)}
            node_edges = {k: [] for k in range(1, max_depth + 1)}
            # Mirrors node_dist as sets for O(1) "already accepted" checks.
            accepted = {k: set() for k in range(1, max_depth + 1)}
            while queue:
                edge = queue.popleft()
                depth = edge['depth']
                for node in ['start', 'end']:
                    endpoint = edge[node]
                    token = endpoint['label']
                    if token == seed or token in accepted[depth]:
                        continue
                    if 'language' in endpoint and endpoint['language'] != 'en':
                        continue
                    # Depth-1 neighbours are always kept, so the relatedness
                    # request is skipped for them (short-circuit `or`).
                    if depth == 1 or query_relatedness(token, seed, cn_url, s) >= min_relatedness:
                        accepted[depth].add(token)
                        node_dist[depth].append(token)
                        node_edges[depth].append(edge)
                        if depth + 1 <= max_depth:
                            queue.extend(query_edges(endpoint['@id'], depth + 1, max_edges, rel_types, cn_url, s))
                n += 1
            seed_dist.append(node_dist)
            seed_edges.append(node_edges)
            print(str(datetime.now()) + '\t' + seed + '\texplored ' + str(n) + ' edges at a max depth of ' + str(max_depth))

    return seed_dist, seed_edges

In [None]:
# Base URL of the ConceptNet web API used by query_edges / query_relatedness.
cn_url = 'http://api.conceptnet.io'
# Pagination bound per node query; infinity means every available page is followed.
max_edges = float('inf')
# Deepest level explored outward from each seed term.
max_depth = 2
# Minimum relatedness to the seed required for nodes deeper than depth 1.
min_relatedness = 0.2
# Relation filters; a single None issues one unfiltered query per node.
rel_types = [None]

In [None]:
# Three parallel jobs, one task per domain: each task crawls the subgraph
# around that domain's seed terms (the index of its TF-IDF series).
result_list = Parallel(n_jobs=3)(
    delayed(query_subgraph)(
        list(tfidf[domain].index),
        max_edges,
        max_depth,
        min_relatedness,
        rel_types,
        cn_url,
    )
    for domain in domains
)

In [None]:
# result_list holds one (node lists, edge lists) pair per domain, in `domains`
# order; split the pairs into two dicts keyed by domain name.
seed_dist = {domains[i]: result_list[i][0] for i in range(len(domains))}
seed_edges = {domains[i]: result_list[i][1] for i in range(len(domains))}

### Augment ConceptNet subgraph with generations from COMET

In [None]:
def load_comet_results(comet_path):
    """Load COMET generations from a pickle and return the non-stopword candidates.

    Args:
        comet_path: Path to a pickle file whose content ``pandas.DataFrame``
            can wrap and which yields a ``'beams'`` column; every cell of that
            column is concatenated onto one flat list.

    Returns:
        The flattened list of candidates, in file order, without entries that
        appear in NLTK's English stopword list.
    """
    # NOTE: pickle.load can execute arbitrary code — only use with trusted files.
    with open(comet_path, 'rb') as handle:
        comet_results = pd.DataFrame(pickle.load(handle))

    comet_candidates = []
    for beams in comet_results['beams']:
        comet_candidates += beams

    # Read the stopword list once (it was previously re-read for every
    # candidate) and use a set for constant-time membership tests.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    return [candidate for candidate in comet_candidates if candidate not in stopwords]

In [None]:
# Merge the COMET generations into the depth-1 node list of the first laptops seed.
# dict.fromkeys de-duplicates while keeping first-seen order, so the result is
# reproducible across runs (the order of list(set(...)) is not).
comet_candidates_laptops = load_comet_results('comet-generations/laptops.pickle')
seed_dist['laptops'][0][1] = list(dict.fromkeys(seed_dist['laptops'][0][1] + comet_candidates_laptops))

In [None]:
# Merge the COMET generations into the depth-1 node list of the first device seed.
# dict.fromkeys de-duplicates while keeping first-seen order, so the result is
# reproducible across runs (the order of list(set(...)) is not).
comet_candidates_device = load_comet_results('comet-generations/device.pickle')
seed_dist['device'][0][1] = list(dict.fromkeys(seed_dist['device'][0][1] + comet_candidates_device))

In [None]:
# Merge the COMET generations into the depth-1 node list of the first restaurants seed.
# dict.fromkeys de-duplicates while keeping first-seen order, so the result is
# reproducible across runs (the order of list(set(...)) is not).
comet_candidates_restaurants = load_comet_results('comet-generations/restaurants.pickle')
seed_dist['restaurants'][0][1] = list(dict.fromkeys(seed_dist['restaurants'][0][1] + comet_candidates_restaurants))

### Add plural forms to knowledge graphs

In [None]:
# Shared inflect engine; pluralize_kg calls its plural() method on each token.
engine = inflect.engine()

In [None]:
def pluralize_kg(seed_dist):
    """Append a plural form for every token of a per-seed knowledge graph.

    For each token list, the last whitespace-separated word of every token is
    pluralized with the module-level ``engine`` and the resulting token is
    appended to the same list. Only tokens present at call time are
    processed; the appended plurals are not pluralized again.

    Args:
        seed_dist: Sequence with one dict per seed term, each mapping a depth
            to a list of token strings. The lists are extended in place.

    Returns:
        The same ``seed_dist`` object that was passed in.
    """
    for node_dist in seed_dist:
        for tokens in node_dist.values():
            plurals = []
            for token in tokens:
                words = token.split()
                if not words:
                    # Empty or whitespace-only label: nothing to pluralize
                    # (indexing words[-1] here used to raise IndexError).
                    continue
                words[-1] = engine.plural(words[-1])
                plurals.append(' '.join(words))
            # Extend after the scan so the new plurals are not revisited.
            tokens.extend(plurals)
    return seed_dist

In [None]:
# Add plural forms to every domain's knowledge graph. pluralize_kg appends to
# the existing lists, so running this cell twice also pluralizes the plurals.
for domain, domain_kg in seed_dist.items():
    seed_dist[domain] = pluralize_kg(domain_kg)

### Save domain-specific KGs and seed terms

In [None]:
# Persist the seed terms (domain -> list of terms) and the knowledge graphs.
# `with` guarantees each file is flushed and closed, even if dumping fails.
with open('seed_terms.pkl', 'wb') as f:
    pickle.dump({k: list(tfidf[k].index) for k in tfidf}, f)
with open('seed_dist.pkl', 'wb') as f:
    pickle.dump(seed_dist, f)