## Named Entity Extraction from a Corpus of Born Digital Archival Content

A factory that takes UTF-8 text extracted from word-processing documents found in a personal papers archival collection and extracts named entities from it using named entity recognition (NER).

In [None]:
import pandas as pd
import numpy as np

import warnings 
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')


import re #regex module

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

import spacy
from spacy import displacy

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')

from gensim import matutils, corpora
from gensim.models import Word2Vec
from gensim.corpora import MmCorpus, dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from gensim.models import KeyedVectors

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from collections import Counter

# Display floats with thousands separators and eight decimal places.
pd.options.display.float_format = '{:,.8f}'.format

# Load the small English spaCy pipeline by name
# (alternative: nlp = en_core_web_sm.load()).
nlp = spacy.load('en_core_web_sm')
# spaCy's default limit is 1M characters per text; raise it to 5M.
nlp.max_length = 5_000_000

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


## Set up workspaces and clean data

In [None]:
# Directory that is scanned for the text files making up the corpus;
# replace the placeholder with the real location before running.
corpus_root = "/path/to/utf-8/textfiles/"

In [None]:
from collections import defaultdict
from pathlib import Path

In [None]:
# Build the corpus: read every regular file directly under corpus_root and
# collect its file name and full text into a two-column dataframe.
corpus = defaultdict(list)
# sorted() makes the row order reproducible; iterdir() yields entries in
# arbitrary, filesystem-dependent order.
for path in sorted(Path(corpus_root).iterdir()):
    # Skip subdirectories and other non-file entries, which cannot be read as text.
    if not path.is_file():
        continue
    corpus["file_name"].append(path.name)
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which is not UTF-8 everywhere.
    corpus["text"].append(path.read_text(encoding="utf-8"))

# Naming the columns explicitly keeps both present even when no file was found,
# so later cells that access df["text"] still work on an empty corpus.
df = pd.DataFrame(corpus, columns=["file_name", "text"])


print(df)
#print(corpus['file_name'])

In [None]:
# Drop rows whose text is empty.
nan_value = float("NaN")

# Flatten newlines to spaces first, so that a file containing nothing but
# line breaks is recognised as empty by the next step.
df = df.replace('\n', ' ', regex=True)

# Mark empty and whitespace-only cells as missing; matching only the exact
# empty string would let texts such as " " slip through.
df = df.replace(r'^\s*$', nan_value, regex=True)

# Remove every row that has no usable text left.
df = df.dropna(subset=["text"])

print(df)

In [None]:
# Character count of each document's text, stored as a new column.
text_lengths = df['text'].str.len()
df['charcount'] = text_lengths

# Total number of characters across the whole corpus.
total = text_lengths.sum()

print(total)

## Tokenization factory and entity extraction

In [None]:
# Create 'tokens': a single spaCy Doc built from every text in the 'text'
# column of the df dataframe.
# Join the documents themselves, separated by a space so the last word of one
# document does not fuse with the first word of the next. (Calling str() on the
# list would feed spaCy the list's repr: brackets, quotes, commas and escape
# sequences included.)
# NOTE: the combined text must stay within nlp.max_length.
tokens = nlp(' '.join(df['text'].astype(str)))

In [None]:
# Collect the text of every recognised entity and show the 20 most frequent.
items = [entity.text for entity in tokens.ents]
Counter(items).most_common(n=20)


## Visualize entities with spaCy/displaCy (test)

In [None]:
# Render the document with all recognised entities highlighted.
displacy.render(docs=tokens, style="ent")

In [None]:
# Render again, restricting the highlighted entities to the PERSON label.
options = dict(ents=["PERSON"])
displacy.render(docs=tokens, options=options, style="ent")

## NER People plot

In [None]:
# Text of every entity labelled PERSON, in document order.
person_list = [ent.text for ent in tokens.ents if ent.label_ == 'PERSON']

# Keep the 20 most frequent names as (text, count) rows.
person_counts = Counter(person_list).most_common(20)
df_person = pd.DataFrame(data=person_counts, columns=['text', 'count'])
# df_person.head()  # uncomment to preview the table

In [None]:
# Horizontal bar chart; the y-axis is inverted so the first (most frequent) row is at the top.
df_person.plot(
    kind='barh',
    x='text',
    y='count',
    title="Top 20 Personal Names",
    figsize=(10, 8),
).invert_yaxis()

## NER Organization Names plot

In [None]:
# Text of every entity labelled ORG, in document order.
org_list = [ent.text for ent in tokens.ents if ent.label_ == 'ORG']

# Keep the 20 most frequent organization names as (text, count) rows.
org_counts = Counter(org_list).most_common(20)
df_org = pd.DataFrame(data=org_counts, columns=['text', 'count'])
# df_org.head()  # uncomment to preview the table

In [None]:
# Horizontal bar chart; the y-axis is inverted so the first (most frequent) row is at the top.
df_org.plot(
    kind='barh',
    x='text',
    y='count',
    title="Top 20 Organizational Names",
    figsize=(10, 8),
).invert_yaxis()

## NER Geographic Names plot

In [None]:
# Text of every entity labelled GPE, in document order.
geog_list = [ent.text for ent in tokens.ents if ent.label_ == 'GPE']

# Keep the 20 most frequent geographic terms as (text, count) rows.
geog_counts = Counter(geog_list).most_common(20)
df_geog = pd.DataFrame(data=geog_counts, columns=['text', 'count'])
# df_geog.head()  # uncomment to preview the table

In [None]:
# Horizontal bar chart; the y-axis is inverted so the first (most frequent) row is at the top.
df_geog.plot(
    kind='barh',
    x='text',
    y='count',
    title="Top 20 Geographic Terms",
    figsize=(10, 8),
).invert_yaxis()

## NER Products plot

In [None]:
# Text of every entity labelled PRODUCT, in document order.
product_list = [ent.text for ent in tokens.ents if ent.label_ == 'PRODUCT']

# Keep the 20 most frequent product names as (text, count) rows.
product_counts = Counter(product_list).most_common(20)
df_product = pd.DataFrame(data=product_counts, columns=['text', 'count'])
# df_product.head()  # uncomment to preview the table

In [None]:
# Horizontal bar chart; the y-axis is inverted so the first (most frequent) row is at the top.
df_product.plot(
    kind='barh',
    x='text',
    y='count',
    title="Top 20 Products",
    figsize=(10, 8),
).invert_yaxis()

## Noun/proper noun plot

In [None]:
# Text of every token whose part-of-speech tag is NOUN or PROPN, in document order.
noun_list = [tok.text for tok in tokens if tok.pos_ in ("NOUN", "PROPN")]

# Keep the 20 most frequent words as (text, count) rows.
noun_counts = Counter(noun_list).most_common(20)
df_noun = pd.DataFrame(data=noun_counts, columns=['text', 'count'])
# df_noun.head()  # uncomment to preview the table

In [None]:
# Horizontal bar chart; the y-axis is inverted so the first (most frequent) row is at the top.
df_noun.plot(
    kind='barh',
    x='text',
    y='count',
    title="Top 20 Nouns and Proper Nouns",
    figsize=(10, 8),
).invert_yaxis()

## NER Dates plot

In [None]:
# Text of every entity labelled DATE, in document order.
dates_list = [ent.text for ent in tokens.ents if ent.label_ == 'DATE']

# Keep the 20 most frequent date expressions as (text, count) rows.
dates_counts = Counter(dates_list).most_common(20)
df_dates = pd.DataFrame(data=dates_counts, columns=['text', 'count'])

In [None]:
# Horizontal bar chart; the y-axis is inverted so the first (most frequent) row is at the top.
df_dates.plot(
    kind='barh',
    x='text',
    y='count',
    title="Top 20 Dates",
    figsize=(10, 8),
).invert_yaxis()

## Counting and identifying entities
Goal: tally the entities by label so we can see which labels are worth plotting.

TODO: move this section up to the EDA part of the notebook eventually.

In [None]:
# Print the label of every entity, one per line, in document order.
for label in (ent.label_ for ent in tokens.ents):
    print(label)

In [None]:
# One line per entity: its text, its label, and spaCy's explanation of that label.
for entity in tokens.ents:
    label = entity.label_
    print(f"{entity.text} - {label} - {spacy.explain(label)}")

In [None]:
# Number of entities labelled PERSON.
sum(1 for ent in tokens.ents if ent.label_ == 'PERSON')

In [None]:
# Number of entities labelled GPE.
sum(1 for ent in tokens.ents if ent.label_ == 'GPE')

In [None]:
# Number of entities labelled ORG.
sum(1 for ent in tokens.ents if ent.label_ == 'ORG')

In [None]:
# Number of entities labelled DATE.
sum(1 for ent in tokens.ents if ent.label_ == 'DATE')

In [None]:
# Tally the entities by label in a single pass. The previous draft recomputed
# all four counts once per entity and left `table` unassigned (a syntax error).
label_counts = Counter(ent.label_ for ent in tokens.ents)

# Counts for the labels plotted above; a label that never occurs counts as 0.
people = label_counts['PERSON']
org = label_counts['ORG']
gpe = label_counts['GPE']
date = label_counts['DATE']

# Every label found in the corpus with its frequency, most frequent first,
# to show which entity types are worth plotting.
table = pd.DataFrame(label_counts.most_common(), columns=['label', 'count'])
table
    

