# Merge Encoded & Found (NER) Entities

This notebook joins the entities found by NER with the entities encoded in the parsed XML to create a single list of entities.

In [1]:
# Import necessary libraries.
import re, warnings, csv, sys, os, spacy
import pandas as pd
import numpy as np
import seaborn as sns
from itertools import chain
from fuzzywuzzy import fuzz, process

# Base directory for every input/output path. Later cells append relative paths
# to it by string concatenation, so the trailing slash is required.
abs_dir = "/Users/quinn.wi/Documents/SemanticData/"

# Silence DeprecationWarning messages for the rest of the session.
warnings.simplefilter(action = "ignore", category = DeprecationWarning)

# Load the spaCy language model used for named-entity recognition below.
nlp = spacy.load("en_core_web_sm")

## Read-in Data and Find Entities

In [2]:
%%time

# Read in the tab-separated file, drop every row that has a missing value in any
# column (entries without a named person), and rename 'people' to 'refKey'.
df = (
    pd.read_csv(abs_dir + 'Output/ParsedXML/JQA_dataframe.txt', sep = '\t')
    .dropna()
    .rename(columns = {'people':'refKey'})
)

# Entity labels the NER step keeps; add or remove labels as needed.
label_list = ['PERSON', 'LOC']

# spaCy
def get_spacy_entities(text, label_list):
    """Run the spaCy pipeline on ``text`` and collect the entities of interest.

    Parameters:
        text: string passed to the module-level ``nlp`` pipeline.
        label_list: collection of entity labels (e.g. 'PERSON') to keep.

    Returns:
        List of ``(entity_text, entity_label)`` tuples, in the iteration order
        of ``doc.ents``, restricted to labels contained in ``label_list``.
    """
    return [(str(found), found.label_)
            for found in nlp(text).ents
            if found.label_ in label_list]

# Apply function to find spaCy-based entities. Only the 'text' column is read, so
# the function is applied to that column directly rather than row by row.
df['found_entities'] = df['text'].apply(lambda text: get_spacy_entities(text, label_list))

# Keep the full, unsplit string of encoded entities for matching further down.
df['encoded_entities'] = df['refKey']

# Split string of people (separated by ',' or ';') into individuals.
df['refKey'] = df['refKey'].str.split(r',|;')

# Explode list so that each list value becomes a row.
df = df.explode('refKey')

# Clean up entities (refKeys): replace hyphens with spaces (literal replacement).
df['refKey'] = df['refKey'].str.replace('-', ' ', regex = False)

# Lowercase the entity text (i) of each (text, label) tuple (x) in 'found_entities'.
df['found_entities'] = df['found_entities'].apply(lambda x: [(i.lower(), j) for i,j in x])

# Drop 'text' column to reduce dataframe size.
df = df.drop(columns = ['text'])

df.head()

CPU times: user 1min 8s, sys: 1.65 s, total: 1min 9s
Wall time: 1min 11s


Unnamed: 0,file,entry,date,refKey,found_entities,encoded_entities
9,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-10,1825-01-10,herkimer john,"[(john herkimer, PERSON), (parmenio adams memb...","herkimer-john,adams-parmenio,ketchum-unknown,m..."
9,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-10,1825-01-10,adams parmenio,"[(john herkimer, PERSON), (parmenio adams memb...","herkimer-john,adams-parmenio,ketchum-unknown,m..."
9,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-10,1825-01-10,ketchum unknown,"[(john herkimer, PERSON), (parmenio adams memb...","herkimer-john,adams-parmenio,ketchum-unknown,m..."
9,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-10,1825-01-10,meyer unknown3,"[(john herkimer, PERSON), (parmenio adams memb...","herkimer-john,adams-parmenio,ketchum-unknown,m..."
9,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-10,1825-01-10,adams daniel,"[(john herkimer, PERSON), (parmenio adams memb...","herkimer-john,adams-parmenio,ketchum-unknown,m..."


#### Determine & Conflate Encoded-Found Entity Matches

In [3]:
%%time

# Create function to find matches and their confidence scores.
def get_matches_and_quality(entity_string, entity_list):
    """Return the best fuzzy match for ``entity_string`` among ``entity_list``.

    Parameters:
        entity_string: the text to look up.
        entity_list: candidate entities; each candidate is indexed as
            ``(text, label)``.

    Returns:
        Tuple ``(refKey, label, confidence)``: the winning candidate's text
        with spaces replaced by hyphens, its label, and the score reported by
        ``process.extract`` with the ``fuzz.token_set_ratio`` scorer.

    NOTE(review): each whole (text, label) candidate is handed to fuzzywuzzy
    as the choice, so the label text presumably takes part in the score --
    confirm whether scoring on the text alone was intended.
    """
    # limit = 1 asks for the single top-scoring candidate only.
    top_hit = process.extract(entity_string,
                              entity_list,
                              limit = 1,
                              scorer = fuzz.token_set_ratio)[0]
    candidate = top_hit[0]
    confidence = top_hit[1]
    return (re.sub(' ', '-', candidate[0]), candidate[1], confidence)

# Determining whether an encoded entity matches a found one and vice-versa.
# 1. Function to return an encoded entity and its best match, then declare quality of match.
# 2. (Inverse) function to find if a found entity has no matches (and therefore is new data).

# 1.
def get_entity2found_matches(column_one, column_two, confidence_threshold):
    """Match one encoded entity against the entities found in the same entry.

    Parameters:
        column_one: encoded entity string (a 'refKey' value).
        column_two: list of (text, label) tuples found by NER.
        confidence_threshold: minimum score for the pair to count as a match.

    Returns:
        Tuple ``(refKey, label, confidence, match_quality)`` describing the
        best-scoring found entity. ``match_quality`` is 'match' when
        ``confidence >= confidence_threshold`` and 'only_encoded' otherwise.
        Returns None when no candidate could be scored (for example when
        ``column_two`` is empty).
    """
    try:
        match = get_matches_and_quality(column_one, column_two)
    except (RuntimeError, IndexError):
        # With nothing to match against, the lookup fails either with a
        # RuntimeError raised inside fuzzywuzzy or with an IndexError when the
        # first element of an empty result list is requested (presumably
        # depending on the installed fuzzywuzzy version). Both mean "no match".
        return None

    # Replacing spaces again is harmless if the helper already hyphenated the text.
    refKey = re.sub(' ', '-', match[0])
    label = match[1]
    confidence = match[2]

    if confidence >= confidence_threshold:
        match_quality = 'match'
    else:
        match_quality = 'only_encoded'

    return (refKey, label, confidence, match_quality)

# Score every (refKey, found_entities) row; 50 is the confidence threshold for a 'match'.
matches_df = df.assign(
    matches = df.apply(
        lambda row: get_entity2found_matches(row['refKey'], row['found_entities'], 50),
        axis = 1))

# Untuple results: spread each result tuple over four columns, reusing the frame's index.
# 'added_entity' holds the best-scoring found entity, whether or not it reached the threshold.
matches_df[['added_entity', 'ner_label', 'confidence', 'match_quality']] = pd.DataFrame(
    matches_df['matches'].values.tolist(),
    index = matches_df.index)

# 2. 
def get_found2entity_matches(column_one, column_two, confidence_threshold):
    """Return the first found entity scoring below the threshold against ``column_two``.

    Parameters:
        column_one: iterable of found entities, each indexed as (text, label).
        column_two: value every entity text is compared with through
            ``fuzz.token_sort_ratio``.
        confidence_threshold: scores at or above this value are skipped.

    Returns:
        ``(refKey, label, confidence, 'only_found')`` for the first entity
        whose score is below ``confidence_threshold`` (``refKey`` is the entity
        text with spaces replaced by hyphens), or None when no entity falls
        below the threshold, including when ``column_one`` is empty.

    NOTE(review): the function returns at the first below-threshold entity, so
    later entities are never examined -- confirm this is intended.
    """
    for found in column_one:
        hyphenated = re.sub(' ', '-', found[0])
        score = fuzz.token_sort_ratio(found[0], column_two)

        # Entities that reach the threshold are skipped.
        if score >= confidence_threshold:
            continue

        return (hyphenated, found[1], score, 'only_found')

    return None

# Unlike previous function, this inverse function needs to add rows (rather than columns).
# The new rows are collected first and concatenated once: growing the frame row by row
# copies the whole frame on every iteration, and DataFrame.append was removed in pandas 2.0.
_only_found_rows = []

for index, row in matches_df.iterrows():
    match = get_found2entity_matches(row['found_entities'], row['refKey'], 50)

    # None means no found entity fell below the threshold for this row.
    if match is None:
        continue

    # The result tuple is (hyphenated entity text, NER label, score, 'only_found').
    _only_found_rows.append({'file':row['file'], 'entry':row['entry'],
                             'date':row['date'], 'refKey':row['refKey'],
                             'found_entities':row['found_entities'],
                             'encoded_entities':row['encoded_entities'],
                             'matches':row['matches'],
                             'added_entity':match[0],
                             'ner_label':match[1],
                             'confidence':match[2],
                             'match_quality':match[3]})

# Concatenate only when there is something to add, so the frame (and its index)
# is left untouched otherwise.
if _only_found_rows:
    matches_df = pd.concat([matches_df, pd.DataFrame(_only_found_rows)],
                           ignore_index = True)

        
# Converge 'refKey' and 'added_entity' based on 'match_quality'.
# Rows marked 'match' or 'only_encoded' keep the refKey as referent (to disambiguate);
# every other value (e.g. 'only_found') takes the added entity as referent.
def converge_entities(match_quality, refKey_column, added_entity_column):
    """Pick the referent for one row.

    Parameters:
        match_quality: the row's match-quality value.
        refKey_column: the row's encoded entity.
        added_entity_column: the row's found (added) entity.

    Returns:
        ``refKey_column`` when ``match_quality`` equals 'match' or
        'only_encoded'; ``added_entity_column`` for any other value.
    """
    keeps_ref_key = match_quality in ('match', 'only_encoded')
    return refKey_column if keeps_ref_key else added_entity_column

# Create new column, 'label', holding each row's referent: the appropriate choice
# between refKey and added_entity as decided by converge_entities.
matches_df['label'] = [
    converge_entities(quality, ref_key, added)
    for quality, ref_key, added in zip(matches_df['match_quality'],
                                       matches_df['refKey'],
                                       matches_df['added_entity'])
]

matches_df.head()

CPU times: user 6min 16s, sys: 9.15 s, total: 6min 25s
Wall time: 6min 32s


Unnamed: 0,file,entry,date,refKey,found_entities,encoded_entities,matches,added_entity,ner_label,confidence,match_quality,label
0,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-10,1825-01-10,herkimer john,"[(john herkimer, PERSON), (parmenio adams memb...","herkimer-john,adams-parmenio,ketchum-unknown,m...","(john-herkimer, PERSON, 100, match)",john-herkimer,PERSON,100.0,match,herkimer john
1,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-10,1825-01-10,adams parmenio,"[(john herkimer, PERSON), (parmenio adams memb...","herkimer-john,adams-parmenio,ketchum-unknown,m...","(parmenio-adams-members-h.r., PERSON, 100, match)",parmenio-adams-members-h.r.,PERSON,100.0,match,adams parmenio
2,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-10,1825-01-10,ketchum unknown,"[(john herkimer, PERSON), (parmenio adams memb...","herkimer-john,adams-parmenio,ketchum-unknown,m...","(a-mr-ketchum, PERSON, 64, match)",a-mr-ketchum,PERSON,64.0,match,ketchum unknown
3,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-10,1825-01-10,meyer unknown3,"[(john herkimer, PERSON), (parmenio adams memb...","herkimer-john,adams-parmenio,ketchum-unknown,m...","(meyer, PERSON, 62, match)",meyer,PERSON,62.0,match,meyer unknown3
4,"('JQADiaries-v49-1825-01-p795.xml',)",jqadiaries-v49-1825-01-10,1825-01-10,adams daniel,"[(john herkimer, PERSON), (parmenio adams memb...","herkimer-john,adams-parmenio,ketchum-unknown,m...","(parmenio-adams-members-h.r., PERSON, 59, match)",parmenio-adams-members-h.r.,PERSON,59.0,match,adams daniel


## Find Correlations

In [4]:
# %%time

# # Subset data by columns.
# corr_df = matches_df[['entry', 'label']]

# # Create entry-person matrix.
# corr_df = pd.crosstab(corr_df['entry'], corr_df['label'])

# # Convert entry-person matrix into an adjacency matrix of persons.
# corr_df = corr_df.T.dot(corr_df)

# # Change diagonal values to zero. That is, a person cannot co-occur with themself.
# np.fill_diagonal(corr_df.values, 0)

# # Simple correlation matrix from dataframe.
# corr_df = corr_df.corr()

# # Create new 'source' column that corresponds to index (person).
# corr_df['label_src'] = corr_df.index

# # Reshape dataframe to focus on source, target, and weight.
# # Remove same-person pairs (weight = 1) and low correlations (weight < 0.4).
# # A correlation coefficient (weight) of 0.4 is considered 'moderate' in Dancey & Reidy (psychology)
# # and 'strong' in Quinnipiac University (politics).
# corr_df = pd.melt(corr_df, id_vars = ['label_src'], value_name = 'weight') \
#     .query('(weight < 1.00) & (weight >= 0.4)')  \
#     .rename(columns = {'label':'target', 'label_src':'label'})

# # Rejoin source with its ner label.
# corr_df = corr_df \
#     .merge(matches_df[['label', 'ner_label', 'match_quality']],
#            on = 'label', how = 'left') \
#     .drop_duplicates()


# print (corr_df.shape)
# corr_df.head()

## Convert Data to Network Structure

In [5]:
# %%time

# # Create list of unique entities from source and target columns.
# nodes = pd.DataFrame(corr_df['label'].values.tolist() + corr_df['target'].values.tolist()) \
#     .rename(columns = {0:'label'}) \
#     .drop_duplicates()

# # Create identifying codes for labels.
# nodes = nodes \
#     .assign(source = nodes['label'].astype('category').cat.codes) \
#     .dropna() \
#     .sort_values(['source'], ascending = True) # Sorting matches labels with source codes.

# # Create dictionary to map values to codes.
# nodes_dictionary = nodes.set_index('label')['source'].to_dict()

# # Create links dataframe and map links to nodes' codes.
# links = corr_df \
#     .assign(source = corr_df['label'].map(nodes_dictionary),
#             target = corr_df['target'].map(nodes_dictionary))

# # Add data to nodes dataframe.
# nodes = nodes.merge(links[['label', 'ner_label', 'match_quality']],
#            on = 'label', how = 'left') \
#     .drop_duplicates()

# print (links.shape)
# links.head()

## Write to File

In [6]:
%%time

# Save the merged entity table as a comma-separated file, without the index column.
matches_df.to_csv(
    path_or_buf = abs_dir + "Output/Graphs/JQA_Network_mergedEntities-correlation/network-mergedEntities.csv",
    sep = ',',
    index = False)

# nodes.to_csv(abs_dir + "Output/Graphs/JQA_Network_mergedEntities-correlation/nodes.csv",
#              sep = ',', index = False)

# links.to_csv(abs_dir + "Output/Graphs/JQA_Network_mergedEntities-correlation/links.csv",
#              sep = ',', index = False)

CPU times: user 1.54 s, sys: 72.5 ms, total: 1.62 s
Wall time: 1.65 s
