## Location Entity Recognition using spaCy

#### Non-Default Libraries Used
* spaCy - https://spacy.io/
* Wordcloud - https://github.com/amueller/word_cloud
* Geocoder - https://geocoder.readthedocs.io/index.html
* Folium - https://github.com/python-visualization/folium

---
<div style="text-align: right"> Joshua Been 05/10/2019 </div>
<div style="text-align: right"> version 3 </div>

#### Step 1 - Import Libraries

In [0]:
import spacy, en_core_web_sm, operator, folium, zipfile
from spacy import displacy
try:
    import geocoder
except:
    !pip install geocoder
    import geocoder
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
nlp = en_core_web_sm.load()
print('\n >> Completed!')

#### Step 2 - Browse for File
##### Ensure file is in project directory
##### Ensure file is saved as UTF-8 (on Windows, an "ANSI" file is only safe if it contains plain ASCII characters)

In [0]:
from google.colab import files

# Browse/Upload File
up = files.upload()
# The first key of the upload result is used as the file name
doc = next(iter(up))

# Start from an empty string so `contents` is always defined, even when the
# upload has an unsupported extension (otherwise later cells would fail with
# a NameError or silently reuse text from a previous run).
contents = ''

if doc.endswith('.txt'):
    # Single text file. 'utf-8-sig' also strips a leading byte-order mark.
    with open(doc, 'r', encoding='utf-8-sig') as f:
        contents = f.read()
elif doc.endswith('.zip'):
    # Archive: extract everything into the working directory, then append the
    # text of every .txt member, each one preceded by a newline.
    with zipfile.ZipFile(doc, 'r') as zip_ref:
        zip_ref.extractall()
        member_names = zip_ref.namelist()
    parts = []
    for txt in member_names:
        if txt.endswith('.txt'):
            with open(txt, 'r', encoding='utf-8-sig') as f:
                parts.append('\n' + f.read())
    contents = ''.join(parts)
else:
    print('Sorry, unknown file extension. Please try again')

#### Step 4 - Frequencies for All Entities

https://spacy.io/api/annotation#section-named-entities

In [0]:
# Run spaCy entity recognition over `contents` section by section, print the
# per-section and cumulative label counts, then collect the GPE/LOC entities
# into `locations`, `locations_list` and `locations_d` for the later cells.

# These hold cumulative results from all sections
full_entities = []    # (entity text, entity label) tuples
full_labels = []      # entity labels only
full_locations = []   # kept for compatibility; not populated in this cell

# spaCy can process up to 1,000,000 characters at once, so the text is fed to
# the pipeline in sections of at most this many characters.
# NOTE: sections are cut by character offset, so a boundary may fall in the
# middle of a word or entity.
chunk_size = 1000000
total_chars = len(contents)
section_count = 0

for i, start in enumerate(range(0, total_chars, chunk_size), 1):
    # The last section is shorter when the text is not a multiple of chunk_size
    end = min(start + chunk_size, total_chars)
    print(' >> Section #',i,'(character',start,'-',end,'/',total_chars,')')
    # Pass this section of characters to the nlp processor
    docs = nlp(contents[start:end])
    # List of (text, label) tuples for this section, and just the labels
    entities = [(ent.text, ent.label_) for ent in docs.ents]
    labels = [label for _, label in entities]
    # Append in place instead of rebuilding the cumulative lists each pass
    full_entities.extend(entities)
    full_labels.extend(labels)
    print(len(entities), 'entities found')
    print(Counter(labels),'\n______\n')
    section_count = i

# Cumulative totals are only worth printing when there was more than one section
if section_count > 1:
    print('\n >> Cumulative Results','\n',len(full_entities),'entities found\n',Counter(full_labels))

# Cumulative Frequencies of GPE and LOC Locations
print('Cumulative Frequencies of GPE and LOC Locations')
locations = [entity for entity in full_entities if entity[1] in ('GPE', 'LOC')]
locations_list = [text for text, _ in locations]
# Frequency of each location text (used by the map, wordcloud and bar chart)
locations_d = Counter(locations_list)
# Frequencies of (text, label) pairs, most common first
display(Counter(locations).most_common())

print('\n >> Completed!')

#### Step 5 - View Entities Highlighted In-Line
#### PLEASE NOTE the options line

In [0]:
# Render the text with its entities highlighted in-line, one section at a time.

# Restricts highlighting to location entities. It only takes effect when it is
# passed to displacy.render(..., options=options) as described below.
options={'ents':['GPE','LOC']}

# Same sectioning as the frequency step: at most 1,000,000 characters per pass
chunk_size = 1000000
total_chars = len(contents)

for i, start in enumerate(range(0, total_chars, chunk_size), 1):
    # Clamp the end offset so the printed range is correct for short texts too
    end = min(start + chunk_size, total_chars)
    print(' >> Section #',i,'(character',start,'-',end,'/',total_chars,')')
    tmp_text = contents[start:end]

    # displaCy is spaCy's primary visualization method
    ############################################
    # To view only location entities, the following line should be:
    # displacy.render(nlp(tmp_text), jupyter=True, style='ent', options=options)
    #
    # To view all entities, the line should be:
    # displacy.render(nlp(tmp_text), jupyter=True, style='ent')
    ############################################
    # Render only the current section. Passing the whole of `contents` here
    # would repeat the full text for every section and would exceed the
    # per-call character limit on long inputs.
    displacy.render(nlp(tmp_text), jupyter=True, style='ent')

#### Step 6 - Visualizations: Map, Wordcloud, Bar chart

#### Required: Bing Maps API key (free)

* Go to http://www.bingmapsportal.com/, and sign in with your Microsoft account (or create a new account).
* Under My Account, select My Keys.
* Click to create a new key.
* Enter your name or company as the Application name, and select "Basic" as your key type and "Dev/Test" as the application type.
* Your new key will appear at the bottom of the "Create or view keys" page; it will be 64 characters long and will probably begin with "A".

In [0]:
# Map: geocode each sufficiently frequent location with Bing and draw it as a
# circle whose radius grows with its frequency. Skipped when no key is entered.

# Enter Bing Maps Key Below
bkey=''

if bkey:

    # Minimum frequency a location needs in order to be mapped
    min_location_counts_map = 10

    m = folium.Map(
        location=[27.4, -40.3],
        zoom_start=2,
        tiles='Stamen Toner'
    )

    for key, value in locations_d.items():
        if value < min_location_counts_map:
            continue
        # Geocoding is best-effort: one failing place must not abort the map,
        # but the failure is reported instead of being silently discarded.
        try:
            g = geocoder.bing(key, key=bkey)
            if g.raw['confidence']=='High':   # confidence is high, medium, low
                folium.CircleMarker(
                    location=[g.lat, g.lng],
                    radius=value/5,     # Can decrease or increase sizes by /n or *n
                    popup=key,
                    tooltip=key,
                    color='#154734',
                    fill=True,
                    fill_color='#154734'
                ).add_to(m)
        except Exception as err:
            print(' >> Skipped', repr(key), '-', type(err).__name__ + ':', err)

    display(m)
    # Also write the map to a standalone HTML file
    m.save('index.html')
else:
    print(' >> No Bing Maps key entered; skipping map')


# Wordcloud
# Word sizes come directly from the location counts in locations_d; no text
# is tokenized here, so the collocations (two-word) setting does not apply.
if locations_d:
    wordcloud = WordCloud()
    wordcloud.generate_from_frequencies(frequencies=locations_d)

    # Display the generated image:
    plt.figure( figsize=(20,10), facecolor='k')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.tight_layout(pad=0)
    plt.axis("off")
else:
    # generate_from_frequencies raises on an empty mapping, so skip the plot
    print(' >> No locations found; skipping wordcloud')



# Bar Chart of the most frequent locations
# Adjust number as needed
Number_Top_Locations=10

# Rank every location from most to least frequent
ranked_locations = Counter(locations_list).most_common()

# Rebuild locations_d so that its keys follow the ranked order
locations_d = dict(ranked_locations)

# Keep only the leading entries for the chart
top_locations = dict(ranked_locations[:Number_Top_Locations])

# One bar per location, labelled with the location text
bar_positions = range(len(top_locations))
plt.figure(num=None, figsize=(8, 6), dpi=150, facecolor='w', edgecolor='k')
plt.bar(bar_positions, list(top_locations.values()), align='center')
plt.xticks(bar_positions, list(top_locations.keys()), rotation=75)

plt.show()