In [1]:
import os
import sys
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format


from collections import defaultdict
import re
from IPython.display import display


# search queries
import json
import urllib
# Submodules of the urllib package are not loaded by `import urllib` alone.
import urllib.parse
import urllib.request

# store data
import pickle

# Longest Common Subsequence
from difflib import SequenceMatcher

# common tokens/words
from fuzzywuzzy import fuzz

# metrics
from sklearn.metrics import roc_auc_score

In [2]:
from pprint import pprint
# from pprint import PrettyPrinter
# pretty = PrettyPrinter(width=30)
# pprint = pretty.pprint

In [3]:
# metric functions

def R_precision_score(results, label):
    """R-precision: fraction of the first R results that belong to `label`.

    R is looked up as ``N_true[label]`` and each result's class is looked up
    in ``options`` (both module-level). `results` is a sequence of dicts with
    an 'option' key, in the order the caller ranked them.

    Raises ZeroDivisionError when R is 0.
    """
    R = N_true[label]
    hits = sum(1 for res in results[:R] if options[res['option']] == label)
    return hits / R

def diff(a, b):
    """Return ``a - b``; when `a` is a tuple, subtract pairwise over zip(a, b)."""
    if not isinstance(a, tuple):
        return a - b
    return tuple(left - right for left, right in zip(a, b))

def recall_score(results, label, threshold=0.8):
    """Recall of `label` inside the accepted head of a ranked result list.

    Parameters
    ----------
    results : list of dict
        Items with 'option' and 'score' keys, in ranked order.
    label :
        Class being evaluated; ``N_true[label]`` (module-level) is the
        denominator and ``options`` (module-level) gives each option's class.
    threshold : number or str
        * number: the head ends at the first result whose score is below it.
          For tuple scores the threshold becomes ``(threshold, -inf)``.
        * 'cluster' or 'cluster<N>': the head ends at the largest gap between
          consecutive scores, searched over the first N results (all results
          when N is omitted).

    Returns
    -------
    (recall, size) : size is the number of results in the accepted head.

    Raises ZeroDivisionError when ``N_true[label]`` is 0.
    """
    R = N_true[label]
    n_results = len(results)

    if isinstance(threshold, str) and threshold.startswith('cluster'):
        size_limit = n_results
        suffix = threshold[len('cluster'):]
        if suffix.isnumeric():
            size_limit = min(int(suffix), size_limit)
        max_dist, size = None, 0
        for i in reversed(range(1, size_limit)):
            curr_dist = diff(results[i - 1]['score'],
                             results[i]['score'])
            # Compare against None explicitly: a scalar gap of 0 is falsy and
            # would otherwise be treated as "no gap recorded yet".
            if max_dist is None or curr_dist > max_dist:
                max_dist = curr_dist
                size = i
    else:
        # If no score falls below the threshold, every result is accepted
        # (previously `size` was left unbound in that case).
        size = n_results
        if results and isinstance(results[0]['score'], tuple):
            threshold = (threshold, -float('inf'))
        for i, res in enumerate(results):
            if res['score'] < threshold:
                size = i
                break

    ctr = sum(1 for res in results[:size] if options[res['option']] == label)
    return ctr / R, size

def roc_score(results, label):
    """ROC AUC of the result scores against membership in `label`.

    Tuple scores are first replaced by their rank among the distinct score
    values (ascending), so that roc_auc_score receives plain numbers with the
    same ordering. Class membership comes from the module-level ``options``.
    """
    y_score = [res['score'] for res in results]
    if isinstance(y_score[0], tuple):
        rank_of = {score: rank
                   for rank, score in enumerate(sorted(set(y_score)))}
        y_score = [rank_of[score] for score in y_score]
    y_true = [int(options[res['option']] == label) for res in results]
    return roc_auc_score(y_true, y_score)

In [4]:
# search queries

class Google:
    """Small client for the entity search endpoint at `service_url`.

    The API key is read from the file '.api_key' in the working directory
    when the class is defined. `parameters` holds the default request
    parameters; each public method merges `extra_params` over a copy of them.
    """
    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    # Close the key file promptly and drop surrounding whitespace (e.g. a
    # trailing newline) so it does not end up URL-encoded into the request.
    with open('.api_key') as _key_file:
        api_key = _key_file.read().strip()
    del _key_file
    parameters = {
        'limit': 1,
        'indent': True,
        'key': api_key,
    }

    def _merged_params(self, extra_params):
        """Return a fresh dict of the default parameters updated with `extra_params`."""
        params = dict(self.parameters)
        params.update(extra_params or {})
        return params

    def _search(self, params):
        """Send one request and return the list stored under 'itemListElement'."""
        url = self.service_url + '?' + urllib.parse.urlencode(params)
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(url) as reply:
            payload = json.loads(reply.read())
        return payload['itemListElement']

    @staticmethod
    def _prefix_queries(q):
        """Yield the token prefixes of `q` joined by spaces, longest first.

        Tokens are produced by splitting on runs of non-alphanumeric
        characters. NOTE(review): a query that starts or ends with such a run
        yields an empty token; kept as-is so results line up with previously
        cached data.
        """
        tokens = re.split('[^a-zA-Z0-9]+', q)
        for prefix_size in range(len(tokens), 0, -1):
            yield ' '.join(tokens[:prefix_size])

    def get_identities(self, queries, extra_params=None):
        """Map each query to the name of its first search result ('' if none).

        Returns a plain dict keyed by query. The result score is not used.
        """
        params = self._merged_params(extra_params)

        identities_dict = {}
        for q in queries:
            params['query'] = q
            elements = self._search(params)
            identities_dict[q] = elements[0]['result']['name'] if elements else ''
        return identities_dict

    def get_prefix_identities(self, queries, extra_params=None):
        """Search every token prefix of each query, longest prefix first.

        For each prefix exactly ``params['limit']`` names are appended to the
        query's list, padded with '' when fewer results come back (extra
        results beyond the limit are ignored). Returns a defaultdict(list)
        keyed by query.
        """
        params = self._merged_params(extra_params)
        limit = params['limit']

        identities_dict = defaultdict(list)
        for q in queries:
            for prefix in self._prefix_queries(q):
                params['query'] = prefix
                identities = [''] * limit
                for i, element in enumerate(self._search(params)[:limit]):
                    identities[i] = element['result']['name']
                identities_dict[q].extend(identities)
        return identities_dict

    def get_prefix_identities_extended(self, queries, extra_params=None):
        """Like get_prefix_identities, but with two slots per result.

        Even slots hold the result name, odd slots hold
        ``result['detailedDescription']['articleBody']`` or '' when that
        field is absent. Each prefix contributes ``2 * params['limit']`` slots.
        """
        params = self._merged_params(extra_params)
        limit = params['limit']

        identities_dict = defaultdict(list)
        for q in queries:
            for prefix in self._prefix_queries(q):
                params['query'] = prefix
                identities = [''] * (2 * limit)
                for i, element in enumerate(self._search(params)[:limit]):
                    result = element['result']
                    identities[2 * i] = result['name']
                    try:
                        identities[2 * i + 1] = result['detailedDescription']['articleBody']
                    except KeyError:
                        # No article body for this entity: leave the slot as ''.
                        pass
                identities_dict[q].extend(identities)
        return identities_dict

In [5]:
# Distance metrics

class Distance:
    """Similarity scorer selected by name.

    `valid` lists the supported names; each name is also the name of the
    method that implements it. Every scoring method returns
    ``(score, matched_option)`` with the numeric part rounded to `acc`
    decimal places.
    """
    valid = {'lcs':'Longest common subsequence',
             'fuzzy': 'Common tokens/words',
             'prefix': 'fuzzy for all prefixes (~ wildcard)'}

    def __init__(self, name, acc=4):
        """Select metric `name`; raise ValueError if it is not in `valid`."""
        if name not in self.valid:
            raise ValueError(f'Supported distance metrics: {", ".join(self.valid)}')
        self.name = name
        self.acc = acc

    def score(self, query, option):
        """Dispatch to the method named by `self.name`."""
        # Plain attribute lookup; no need to evaluate a code string.
        dist_func = getattr(self, self.name)
        return dist_func(query, option)

    def fuzzy(self, query, option):
        """fuzz.token_set_ratio scaled to 0..1, paired with `option`."""
        score = fuzz.token_set_ratio(query, option) / 100
        return round(score, self.acc), option

    def lcs(self, query, option):
        """SequenceMatcher ratio of the two strings, paired with `option`."""
        score = SequenceMatcher(None, query, option).ratio()
        return round(score, self.acc), option

    def prefix(self, query, option):
        '''
        Maximum among prefixes.
        The less tokens dropped, the better.

        `option` is an iterable of candidate strings; the position of a
        candidate is used as the number of dropped tokens. Returns
        ((best_score, -position), best_candidate); ties on score keep the
        earliest candidate, and ((0, 0), '') is returned when no candidate
        scores above 0.
        '''
        best_score, best_rank, best_option = 0, 0, ''
        for n_dropped, opt in enumerate(option):
            score = fuzz.token_set_ratio(query, opt) / 100
            if best_score < score:
                best_score = score
                best_rank = -n_dropped
                best_option = opt
        return (round(best_score, self.acc), best_rank), best_option

In [6]:
def template(*x):
    """Format four values as one fixed-width row.

    Each value is converted to str and cut to at most 34 characters, then
    placed in columns of width 35, 15, 10 and 35.
    """
    cells = [str(value)[:34] for value in x]
    return '{:35}{:15}{:10}{:35}'.format(*cells)

In [7]:
def print_results(queries, distance_name='fuzzy', query_identities=None, option_identities=None,
                  threshold=0.8, show_extra=2, method_name=None):
    """Rank all options for each query, print the top of the ranking and metrics.

    Parameters
    ----------
    queries : sequence of str
        The query at position i is evaluated against class label i + 1.
    distance_name : str
        Name of the Distance metric used for scoring.
    query_identities, option_identities : mapping or None
        Optional replacements for the raw query / option text; anything
        missing from the mapping is scored as-is.
    threshold :
        Passed through to recall_score.
    show_extra : int
        How many rows to print beyond the first R = N_true[label] rows.
    method_name : str or None
        When given, each query's recall, R-precision and ROC AUC are stored
        in the shared Results table under this row name.

    Reads the module-level `options` (option -> class) and `N_true`.
    """
    if query_identities is None:
        query_identities = {}
    if option_identities is None:
        option_identities = {}

    distance = Distance(distance_name)
    key_order = ['option', 'score', 'isTrue', 'identity']
    heavy_rule = '=' * 70
    light_rule = '-' * 70

    print(template(*key_order))
    print(heavy_rule)

    for label, q in enumerate(queries, start=1):
        q_id = query_identities.get(q, q)
        results = []
        for opt, lab in options.items():
            opt_id = option_identities.get(opt, opt)
            score, best_opt_id = distance.score(q_id, opt_id)
            results.append({
                'option': opt,
                'score': score,
                'isTrue': int(lab == label),
                'identity': best_opt_id,
            })
        results.sort(key=lambda res: res['score'], reverse=True)

        print(template(f'{q} (query)', '-', '-', q_id))
        print(light_rule)

        R = N_true[label]
        for rank, res in enumerate(results[:R + show_extra]):
            # Separator between the first R rows and the extra rows.
            if rank == R:
                print(light_rule)
            print(template(*(res[key] for key in key_order)))

        recall, sample_size = recall_score(results, label=label, threshold=threshold)
        prec = R_precision_score(results, label=label)
        roc = roc_score(results, label=label)

        print(light_rule)
        print(f'Recall (t={threshold}, size={sample_size}): {recall:.4f}')
        print(f'R-precision: {prec:.4f}')
        print(f'ROC AUC: {roc:.4f}')
        print(heavy_rule)

        if method_name:
            # Record the metrics for THIS query only. Writing them for every
            # query here would overwrite earlier queries' values, leaving the
            # table filled with the last query's metrics.
            board = Results()
            board.update(method_name, q, 'recall', recall)
            board.update(method_name, q, 'Rprec', prec)
            board.update(method_name, q, 'ROC', roc)

In [8]:
# key words preprocessing

class JunkContainer:
    """A set of tokens to be stripped out of whitespace-separated strings."""

    def __init__(self, junk_list=[]):
        # Copied into a set, so the argument itself is never modified.
        self.junk_list = set(junk_list)

    def isin(self, token):
        """True when `token` is currently registered as junk."""
        return token in self.junk_list

    def clean(self, option):
        """Return `option` with junk tokens removed and the rest joined by single spaces."""
        kept = [tok for tok in option.split() if not self.isin(tok)]
        return ' '.join(kept)

    def add(self, token):
        """Register `token` as junk."""
        self.junk_list.add(token)

    def remove(self, token):
        """Unregister `token`; unknown tokens are ignored."""
        self.junk_list.discard(token)

In [9]:
# store queries results

def save_data(data, filename, check=True):
    """Pickle `data` to `filename`, appending '.pickle' when it is missing.

    With `check` set, the file is read back immediately and asserted equal
    to `data`.
    """
    path = filename if filename.endswith('.pickle') else filename + '.pickle'
    with open(path, 'wb') as sink:
        pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)
    if not check:
        return
    with open(path, 'rb') as source:
        restored = pickle.load(source)
    assert data == restored

def load_data(filename):
    """Unpickle and return the object stored in `filename` ('.pickle' is appended when missing).

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    path = filename if filename.endswith('.pickle') else filename + '.pickle'
    with open(path, 'rb') as source:
        payload = pickle.load(source)
    return payload

In [10]:
class Results:
    """Metrics table shared by all instances.

    `table` is a class attribute, so every ``Results()`` reads and writes the
    same DataFrame: one row per method, one column per '<query>_<metric>'.
    """
    columns = [
        f'{q}_{m}'
        for q in ['test_query_1', 'test_query_2', 'test_query_3']
        for m in ['recall', 'Rprec', 'ROC']
    ]
    table = pd.DataFrame(columns=columns)
    table.index.name = 'method'

    def update(self, method, query, metric, score):
        """Store `score` in row `method`, column '<query>_<metric>'."""
        column = f'{query}_{metric}'
        self.table.at[method, column] = score

    def show(self):
        """Render the shared table with IPython's display."""
        display(self.table)
    

# Input

In [11]:
# Load the labelled company list; the code below reads its 'company name' and 'class' columns.
df = pd.read_csv('input.csv')

# Non-null 'company name' entries per class, ordered by class value.
# The metric functions index this list as N_true[label].
# NOTE(review): presumably class values are 0, 1, 2, ... so that list position equals class - confirm.
class_counts = df.groupby('class').count().sort_index()
N_true = class_counts['company name'].tolist()

# Map lower-cased company name -> class.
# NOTE(review): names that collide after lower-casing collapse into one dict entry,
# while N_true is counted on the raw rows - confirm the input has no such duplicates.
keys = list(map(str.lower, df['company name']))
values = df['class']
options = dict(zip(keys, values))

n = len(options)

# print_results evaluates the query at position i against class label i + 1.
queries = ['test_query_1', 'test_query_2', 'test_query_3']

# Edit distance

In [12]:
# Baseline: score options against the raw queries with the 'lcs' metric.
# threshold='cluster44' makes recall_score cut the ranking at the largest
# score gap found among the first 44 results.
print_results(queries, distance_name='lcs', show_extra=2, threshold='cluster44', method_name='lcs')

In [13]:
# Display the shared metrics table as filled in so far.
Results().show()

### no junk

In [14]:
# Tokens dropped from option names before scoring. JunkContainer.clean only
# removes whole whitespace-separated tokens that match exactly.
junk_list = [
    'ltd', 'limited', 'ltda', 'llc',
    'gbr',
    'inc', 'corp', 'corporation',
    'co', 'company',
    '.',
]
junk_cont = JunkContainer(junk_list)
# original option -> option with junk tokens removed
clean_options = {name: junk_cont.clean(name) for name in options}

In [15]:
# Same 'lcs' scoring, but each option is replaced by its junk-free form.
# No threshold is passed, so recall uses the default numeric threshold (0.8).
print_results(queries, distance_name='lcs', option_identities=clean_options, show_extra=1, method_name='lcs (no junk)')

In [16]:
# Display the shared metrics table, now including the 'lcs (no junk)' row.
Results().show()

# Google

In [17]:
# Live API call: map each query to the name of its first search result ('' when nothing is found).
query_identities = Google().get_identities(queries)
print(query_identities)

In [18]:
# The two commented-out lines query the API for every option and write the
# 'option_identities' pickle; uncomment them to rebuild that cache.
# option_identities = Google().get_identities(options)
# save_data(option_identities, 'option_identities')

# Load the cached option -> identity mapping from 'option_identities.pickle'.
option_identities = load_data('option_identities')
# pprint(option_identities)

In [19]:
# Split options by whether the lookup produced a non-empty identity.
found, not_found = [], []
for opt in options:
    identity = option_identities[opt]
    if identity:
        found.append(opt)
    else:
        not_found.append(opt)
print(f'Found: {len(found)}')
print(f'Not_found: {len(not_found)}')

In [20]:
# Score with the 'fuzzy' metric, comparing the looked-up identity of each
# query against the looked-up identity of each option.
print_results(queries, 
              distance_name='fuzzy', 
              query_identities=query_identities, 
              option_identities=option_identities, 
              show_extra=5, 
              method_name='google')

In [21]:
# Display the shared metrics table, now including the 'google' row.
Results().show()

### with {'types': 'Organization'}

In [22]:
# Live API call again, this time adding 'types': 'Organization' to the request parameters.
extra_params = {'types': 'Organization'}
query_identities_TYPE = Google().get_identities(queries, extra_params)
print(query_identities_TYPE)

In [23]:
# The two commented-out lines query the API for every option (with
# extra_params) and write the 'option_identities_TYPE' pickle; uncomment
# them to rebuild that cache.
# option_identities_TYPE = Google().get_identities(options, extra_params)
# save_data(option_identities_TYPE, 'option_identities_TYPE')

# Load the cached mapping from 'option_identities_TYPE.pickle'.
option_identities_TYPE = load_data('option_identities_TYPE')
# pprint(option_identities_TYPE)

In [24]:
# Split options by whether the type-restricted lookup produced a non-empty identity.
found, not_found = [], []
for opt in options:
    identity = option_identities_TYPE[opt]
    if identity:
        found.append(opt)
    else:
        not_found.append(opt)
print(f'Found: {len(found)}')
print(f'Not_found: {len(not_found)}')

In [25]:
# 'fuzzy' scoring on the type-restricted identities of queries and options.
print_results(queries, 
              distance_name='fuzzy', 
              query_identities=query_identities_TYPE, 
              option_identities=option_identities_TYPE, method_name='google (type=org)')

In [26]:
# Display the shared metrics table, now including the 'google (type=org)' row.
Results().show()

In [27]:
# Options with a truthy (non-zero) class for which the type-restricted
# lookup returned no identity.
missing = [
    name
    for name, found_identity in option_identities_TYPE.items()
    if not found_identity and options[name]
]
pprint(missing)

### search by prefixes

In [28]:
# The two commented-out lines run the per-prefix API search for every option
# and write the 'option_prefix_identities' pickle; uncomment them to rebuild
# that cache.
# option_prefix_identities = Google().get_prefix_identities(options, extra_params={'types': 'Organization'})
# save_data(option_prefix_identities, 'option_prefix_identities')

# Load the cached option -> list of identities (one list entry per searched prefix slot).
option_prefix_identities = load_data('option_prefix_identities')

In [29]:
# An option counts as found when at least one of its prefix searches
# returned a non-empty identity.
found, not_found = [], []
for opt in options:
    identity = option_prefix_identities[opt]
    if any(identity):
        found.append(opt)
    else:
        not_found.append(opt)
print(f'Found: {len(found)}')
print(f'Not_found: {len(not_found)}')

In [30]:
# Options for which no prefix search returned an identity (computed in the previous cell).
print(not_found)

In [31]:
# 'prefix' scoring: each option is represented by its list of prefix-search
# identities and the best-matching one is used.
# No method_name is passed, so this run is not written to the Results table.
print_results(queries, 
              distance_name='prefix', 
              query_identities=query_identities_TYPE, 
              option_identities=option_prefix_identities,
              show_extra=100)

In [32]:
# Display the shared metrics table. The preceding 'prefix' run passed no
# method_name, so it added no row here.
Results().show()

### search by prefixes, increase limit to 3

In [33]:
# The two commented-out lines run the per-prefix API search with limit=3 and
# write the 'option_prefix_identities_x3' pickle; uncomment them to rebuild
# that cache. (The save line previously named the limit-1 variable and file,
# which would have overwritten the limit-1 cache.)
# option_prefix_identities_x3 = Google().get_prefix_identities(options, extra_params={'limit':3, 'types': 'Organization'})
# save_data(option_prefix_identities_x3, 'option_prefix_identities_x3')

# Load the cached mapping from 'option_prefix_identities_x3.pickle'.
option_prefix_identities_x3 = load_data('option_prefix_identities_x3')
# pprint(option_prefix_identities_x3)

In [34]:
# 'prefix' scoring on the limit=3 prefix-search identities.
# No method_name is passed, so this run is not written to the Results table.
print_results(queries, 
              distance_name='prefix', 
              query_identities=query_identities_TYPE, 
              option_identities=option_prefix_identities_x3,
              show_extra=100)

### search by prefixes, increase limit to 3, also check the "itemListElement.result.detailedDescription.articleBody" field

In [35]:
# The commented-out lines run the extended per-prefix search (name plus
# article body per result, limit=3) and write the 'option_pref_id_x3_extended'
# pickle; uncomment them to rebuild that cache.
# option_pref_id_x3_extended = Google().get_prefix_identities_extended(options, 
#                                                                      extra_params={'limit':3, 'types': 'Organization'})
# save_data(option_pref_id_x3_extended, 'option_pref_id_x3_extended')

# Load the cached mapping and print all of it.
option_pref_id_x3_extended = load_data('option_pref_id_x3_extended')
pprint(option_pref_id_x3_extended)

In [36]:
# 'prefix' scoring where each option's candidates include both result names
# and article bodies.
# No method_name is passed, so this run is not written to the Results table.
print_results(queries, 
              distance_name='prefix', 
              query_identities=query_identities_TYPE, 
              option_identities=option_pref_id_x3_extended,
              show_extra=20)

# Currently not used:

### Wikipedia

In [37]:
import wikipedia

In [38]:
# For every missing option with a positive class, print the option followed
# by what wikipedia.search returns for it.
for opt in missing:
    if not options[opt] > 0:
        continue
    print(opt)
    print(wikipedia.search(opt))
    print()

In [39]:
# Fetch one page's text, lower-cased so the substring search in the next cell
# matches lower-case tokens; print the first 500 characters as a preview.
content = wikipedia.page('Test_company').content.lower()
print(content[:500])

In [40]:
# Collect every occurrence (overlapping ones included) of each token in
# `content`, as (position, snippet) pairs; the snippet spans 10 characters
# before the match to 40 characters after its start.
mentions = []
tokens = ['token_1', 'token_2', 'token_3']
for q in tokens:
    i = content.find(q)
    while i >= 0:
        # Clamp the snippet start at 0: a negative start would be read as an
        # offset from the end of the string and give an empty or wrong slice
        # for matches within the first 10 characters.
        mentions.append((i, content[max(0, i - 10):i + 40]))
        i = content.find(q, i + 1)
pprint(mentions)

### DuckDuckGo

In [41]:
# Example request:
# https://api.duckduckgo.com/?q=simpsons+characters&format=json&pretty=1

SERVICE_URL_DUCK = 'https://api.duckduckgo.com'
# Parameters sent with every request; the query itself is added per call as 'q'.
PARAMS_DUCK = dict(format='json', pretty=1)

In [42]:
# Single lookup: print the request URL and the 'Heading' field of the reply.
query = 'query_1'
params = dict(PARAMS_DUCK)
params['q'] = query
url = SERVICE_URL_DUCK + '?' + urllib.parse.urlencode(params)
print(url)

# The context manager closes the HTTP response once it has been read.
with urllib.request.urlopen(url) as reply:
    response = json.loads(reply.read())
identity = response['Heading']
print(identity)

In [43]:
# Look up every missing option and print its 'Heading', or 'NOT FOUND' when
# the heading is empty.
params = dict(PARAMS_DUCK)
for opt in missing:
    query = opt
    params['q'] = query
    url = SERVICE_URL_DUCK + '?' + urllib.parse.urlencode(params)
    # The context manager closes each HTTP response once it has been read.
    with urllib.request.urlopen(url) as reply:
        response = json.loads(reply.read())
    identity = response['Heading']
    print(opt)
    print(identity or 'NOT FOUND')
    print()
    