[![Baylor Libraries Digital Scholarship Logo](https://cpb-us-w2.wpmucdn.com/blogs.baylor.edu/dist/7/7192/files/2019/08/cropped-DigitalScholarshipblog_header-2019-08-30-1.jpg)](https://bit.ly/baylords)
## Location Entity Recognition using spaCy

#### Non-Default Libraries Used
* spaCy - https://spacy.io/
* Geocoder - https://geocoder.readthedocs.io/index.html
* Folium - https://github.com/python-visualization/folium
* Wordcloud - https://github.com/amueller/word_cloud
* Squarify - https://github.com/laserson/squarify

---
<div style="text-align: right"> Joshua Been, Baylor University Libraries, 11/08/2019 </div>


#### Step 1 - Import Libraries

In [0]:
import spacy, en_core_web_sm, operator, collections, itertools, folium, zipfile
from spacy import displacy
try:
    import geocoder
except:
    !pip install geocoder
    import geocoder
try:
    import squarify
except:
    !pip install squarify
    import squarify
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Load the en_core_web_sm spaCy pipeline once; later cells call `nlp(...)` on
# slices of the text to extract named entities.
nlp = en_core_web_sm.load()
# Signal that the setup cell finished running
print('\n >> Completed!')

#### Step 2 - Browse for File or URL
#### Files can be .txt or .zip of .txt files
#### URLs must point to a webpage; links to files hosted online will not work.

In [0]:
# Load the text to analyze into `contents`, either from a webpage (when `url`
# is set) or from an uploaded .txt file / .zip archive of .txt files.
# Leave url empty to browse for local file
# url should point to webpage, not a file hosted online
url=''

if url!='':
    import requests
    from bs4 import BeautifulSoup

    # define header or pages may refuse connection
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    htmlContent = requests.get(url, headers=header)
    # remove tags using beautiful soup
    contents = BeautifulSoup(htmlContent.text, "lxml").text
    # remove line breaks and tab symbols
    contents=contents.replace('\n',' ').replace('\r','').replace('\t','')
    print(url, 'web content obtained.')
else:
    from google.colab import files

    # Browse/Upload File
    up=files.upload()
    # First key of the upload result (presumably the uploaded file's name)
    doc=next(iter(up))

    if doc.endswith('.txt'):
        # Context manager closes the file even if reading/decoding fails
        with open(doc, 'r', encoding='utf-8-sig') as f:
            contents=f.read()
    elif doc.endswith('.zip'):
        contents=''
        with zipfile.ZipFile(doc, 'r') as zip_ref:
            # Extract into the working directory, then read every .txt member
            # while the archive is still open
            zip_ref.extractall()
            for txt in zip_ref.namelist():
                if txt.endswith('.txt'):
                    with open(txt, 'r', encoding='utf-8-sig') as f:
                        contents=contents+'\n'+f.read()
    else:
        # Define `contents` anyway so later cells neither hit a NameError nor
        # silently reuse text left over from a previous run
        contents=''
        print('Sorry, unknown file extension. Please try again')

#### Step 4 - Frequencies for All Entities

https://spacy.io/api/annotation#section-named-entities

In [0]:
# Run spaCy over `contents` one section at a time, collect every named entity
# and its label, then tabulate how often each GPE/LOC location occurs.

# Accumulators shared by all sections
full_entities=[]
full_labels=[]
full_locations=[]
start = 0
i = 1
# spaCy can process up to 1,000,000 characters at once, so each section is at
# most this many characters long
increment = 1000000
while start<len(contents):
    # never let the section end run past the end of the text
    increment=min(increment,len(contents))
    print(' >> Section #',i,'(character',start,'-',increment,'/',len(contents),')')
    # Hand this section to the nlp processor
    docs = nlp(contents[start:increment])
    # (text, label) pair for every entity spaCy found in the section
    entities = [(ent.text, ent.label_) for ent in docs.ents]
    full_entities.extend(entities)
    # The labels alone, in the same order as the entities
    labels = [label for _, label in entities]
    full_labels.extend(labels)
    print(len(entities), 'entities found')
    print(Counter(labels),'\n______\n')
    # advance to the next section
    i+=1
    start+=1000000
    increment+=1000000
# Totals are only worth printing when more than one section was processed
if i>2:
    print('\n >> Cumulative Results','\n',len(full_entities),'entities found\n',Counter(full_labels))

# Cumulative Frequencies of GPE and LOC Locations
print('Cumulative Frequencies of GPE and LOC Locations')
# Keep only the entities labelled GPE or LOC, as (text, label) pairs
locations=[pair for pair in full_entities if pair[1] in ('GPE','LOC')]
# The location text alone, used for the per-name frequency table
locations_list=[pair[0] for pair in locations]
locations_d=Counter(locations_list)
display(Counter(locations).most_common())

print('\n >> Completed!')

#### Step 5 - View Entities Highlighted In-Line
#### PLEASE NOTE the `displacy.render` line: it controls whether all entities or only location entities (GPE, LOC) are highlighted

In [0]:
# Render the text with its named entities highlighted in-line, one section of
# at most 1,000,000 characters at a time (the same sectioning as Step 4).

# Restricts highlighting to location entities when passed to displacy.render
options={'ents':['GPE','LOC']}
start = 0
i = 1
increment = 1000000
while start<len(contents):
    # Clamp the section end before it is printed or used for slicing, so the
    # first section header reports the real range
    if increment>len(contents):
        increment=len(contents)
    print(' >> Section #',i,'(character',start,'-',increment,'/',len(contents),')')
    tmp_text = contents[start:increment]

    # displaCy is spaCy's primary visualization method
    ############################################
    # To view only location entities, the following line should be:
    # displacy.render(nlp(tmp_text), jupyter=True, style='ent', options=options)
    #
    # To view all entities, the line should be:
    # displacy.render(nlp(tmp_text), jupyter=True, style='ent')
    ############################################
    # Render only the current section: passing the whole of `contents` would
    # repeat the full text for every section and exceed the per-call
    # character limit on long inputs
    displacy.render(nlp(tmp_text), jupyter=True, style='ent')

    start+=1000000
    increment+=1000000
    i+=1

#### Step 6 - Visualizations: Map, Wordcloud, Bar chart, Treemap

#### Required: Bing Maps API key (free)

* Go to http://www.bingmapsportal.com/, and sign in with your Microsoft account (or create a new account).
* Under My Account, select My Keys.
* Click to create a new key.
* Enter your name or company as the Application name, and select "Basic" as your key type and "Dev/Test" as the application type.
* Your new key will appear at the bottom of the "Create or view keys" page; it will be 64 characters long and will probably begin with "A".

In [0]:
# Visualize the location frequencies from `locations_d`: an optional folium
# map (only when a Bing Maps key is supplied for geocoding), followed by a
# wordcloud, a bar chart and a treemap.
# Specify Options Below
#######################
## ALL VISUALIZATION OPTIONS
min_location_counts_map = 1 # minimum frequency for inclusion
## MAP OPTIONS
bkey=''
map_base_type = 'Open Street Map' #options: Stamen Toner, Stamen Terrain, Stamen Watercolor, Open Street Map 
zoom_default=4
start_location=[31.51073, -96.4247]  # U.S. 31.51073, -96.4247, World 0,0, Texas 31.1351682, -99.3350552
geocode_confidence=['High'] # options: High, Medium, Low - Format: ['High','Medium','Low']
radius_multiplier=1
outline_color='#154734'
fill_polygon_color='#154734'
## BAR CHART, TREEMAP OPTIONS
Number_Top_Locations=20
#######################

filtered_locations={}

if bkey!='':

    m = folium.Map(
        location=start_location,
        zoom_start=zoom_default,
        tiles=map_base_type
    )

    for key, value in locations_d.items():
        # Locations below the minimum frequency are neither geocoded nor charted
        if value<min_location_counts_map:
            continue
        try:
            g = geocoder.bing(key, key=bkey)
            # Keep only matches whose geocoding confidence is acceptable
            if g.raw['confidence'] in geocode_confidence:
                filtered_locations[key]=value
                folium.CircleMarker(
                    location=[g.lat, g.lng],
                    radius=value*radius_multiplier,
                    popup=key+' '+str(value),
                    tooltip=key+' '+str(value),
                    color=outline_color,
                    fill=True,
                    fill_color=fill_polygon_color
                ).add_to(m)
        except Exception as err:
            # Best effort: one failed lookup must not abort the whole map,
            # but report it instead of hiding it
            print(' >> Skipped', repr(key), '-', type(err).__name__+':', err)

    display(m)
    m.save('index.html')

else:
    # Without a key nothing is geocoded, so every location is charted.
    # NOTE(review): min_location_counts_map is not applied on this path - confirm intended
    filtered_locations=locations_d

# Sort List
sorted_x = sorted(filtered_locations.items(), key=operator.itemgetter(1), reverse=True)
filtered_locations=collections.OrderedDict(sorted_x)

if not filtered_locations:
    # Nothing to draw; skip the charts rather than hand them empty data
    print('\n >> No locations available to visualize.')
else:
    # Wordcloud
    print('\n\n')
    wordcloud = WordCloud()
    wordcloud.generate_from_frequencies(frequencies=filtered_locations)

    # Display the generated image:
    plt.figure( figsize=(20,10), facecolor='k')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.tight_layout(pad=0)
    plt.axis("off")

    # Bar Chart
    print('\n\n')
    # Slice for top n locations
    top_locations=dict(itertools.islice(filtered_locations.items(), Number_Top_Locations))

    # double-check to ensure sorted
    sorted_x = sorted(top_locations.items(), key=operator.itemgetter(1), reverse=True)
    top_locations=collections.OrderedDict(sorted_x)

    plt.figure(num=None, figsize=(8, 6), dpi=150, facecolor='w', edgecolor='k')
    plt.bar(range(len(top_locations)), list(top_locations.values()), align='center')
    plt.xticks(range(len(top_locations)), list(top_locations.keys()), rotation=75)

    plt.show()

    # Treemap
    print('\n\n')
    squarify.plot(sizes=list(top_locations.values()), label=list(top_locations.keys()), alpha=1, color=['red','orange','yellow','green','blue','indigo','violet'] )

    fig = plt.gcf()
    fig.set_size_inches(20, 8)
    plt.axis('off')
    plt.show()