<a href="https://colab.research.google.com/github/JWackerow/Named-Entity-Recognition/blob/main/Named_Entity_Recognition_With_spaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [92]:
%%capture
import numpy as np
import pandas as pd

import spacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy import displacy
from spacy.tokens import Span

!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from gensim.utils import simple_preprocess

# library to clean up spelling variations in named entities
#!pip install fuzzywuzzy
#from fuzzywuzzy import process, fuzz

import pprint

from google.colab import drive

In [2]:
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [5]:
data = pd.read_csv(r'/content/gdrive/My Drive/Datasets/Tweets.csv')

In [7]:
data.head(2)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [11]:
nlp = spacy.load("en_core_web_sm")

# Drop na if they exist and pull out tweet text
texts = data.loc[data['text'].isna() == False, 'text']

# Preprocess texts using gensim
#tweets = [' '.join(simple_preprocess(doc, deacc=True, )) for doc in tweets]

# Create list of documents
docs = [nlp(doc.lower()) for doc in texts]

# Create a list of documents that contain a named entity
docs_with_ents = list(filter(lambda x: x.ents, docs))

# Print total number of docs and 
print(len(docs), len(docs_with_ents))

In [15]:
def show_ents(doc):
  if doc.ents: 
    for ent in doc.ents: print(ent.text+' - ' +str(ent.start_char) +' - '+ str(ent.end_char) +' - '+ent.label_+ ' - '+str(spacy.explain(ent.label_))) 
  else: print('No named entities found.')

In [28]:
# Print and display named entities in one of the documents
sample_doc = docs_with_ents[11]
show_ents(sample_doc)

displacy.render(sample_doc, style='ent', jupyter=True)

@virginamerica - 0 - 14 - ORG - Companies, agencies, institutions, etc.
2nd - 68 - 71 - ORDINAL - "first", "second", etc.
@australia &amp - 80 - 95 - ORG - Companies, agencies, institutions, etc.
1st - 123 - 126 - ORDINAL - "first", "second", etc.


In [37]:
# Print inside, outside, beginning status of each token
for token in sample_doc:
  print(token.text, token.ent_iob_, token.ent_type_, spacy.explain(token.ent_type_))

@virginamerica B ORG Companies, agencies, institutions, etc.
this O  None
is O  None
such O  None
a O  None
great O  None
deal O  None
! O  None
already O  None
thinking O  None
about O  None
my O  None
2nd B ORDINAL "first", "second", etc.
trip O  None
to O  None
@australia B ORG Companies, agencies, institutions, etc.
& I ORG Companies, agencies, institutions, etc.
amp I ORG Companies, agencies, institutions, etc.
; O  None
i O  None
have O  None
n't O  None
even O  None
gone O  None
on O  None
my O  None
1st B ORDINAL "first", "second", etc.
trip O  None
yet O  None
! O  None
; O  None
p O  None


In [107]:
# Get sentment scores for each document a named entity appears in
def entity_sentiment(docs, ent_labels_to_analyze=["ORG"]):
  '''
  Calculates the mean sentiment score and frequency count for each named entity. 

  Parameters
  ----------
  docs : list
    List of spaCy documents.
  
  ent_labels_to_analyze : list
    List of named entity labels to analyze.

  Returns
  -------
  dict
    A dictionary with named entity text as keys and the frequency count and mean sentiment score as values.

  '''
  vader = SentimentIntensityAnalyzer()
  ent_sentiment = dict()
  for doc in docs_with_ents:
    for ent in doc.ents:
      if ent.label_ in ent_labels_to_analyze:
        if ent.text in ent_sentiment:
          ent_sentiment[ent.text][0] += 1
          ent_sentiment[ent.text][1].append(vader.polarity_scores(doc.text)['compound'])
        else:
          ent_sentiment[ent.text] = [1, [vader.polarity_scores(doc.text)['compound']]]
  for k, v in ent_sentiment.items():
    ent_sentiment[k][1] = np.mean(ent_sentiment[k][1])
  return ent_sentiment

In [108]:
def show_sentiment_scores(scores, sort_by='score', min_count=5, max_shown=30):
  '''
  Prints entities, their frequency count, and their mean sentiment scores.

  Parameters
  ----------
  scores : dict
    Dictionary containing named entities and the keys and counts and mean sentiment scores as values.
  
  sort_by : str
    Value to sort by. Can either be 'count' or 'score'.
  
  min_count : int
    Minimum number of times an entity must appear in the tweets to be included.

  max_shown : int
    Maximum number of entities to be shown.

  '''
  sort_index = {'count':0, 'score':1}[sort_by]
  sentiment_scores = sorted(scores.items(), key=lambda x: x[1][sort_index], reverse=True)
  sentiment_scores = list(filter(lambda x: x[1][0] > min_count, sentiment_scores))
  max_shown = np.min([len(sentiment_scores), max_shown])
  pprint.pprint(sentiment_scores[:max_shown])

In [97]:
# Get the sentiment for named entities in tweets (ORGs)
named_entity_sentiment = entity_sentiment(docs_with_ents)

In [102]:
# Print list of entities with top sentiment scores
show_sentiment_scores(named_entity_sentiment)

[('☺', [8, 0.6623249999999999]),
 ('@southwestair @fortunemagazine', [11, 0.6298727272727273]),
 ('united airlines', [13, 0.5018923076923076]),
 ('@virginamerica @ladygaga @carrieunderwood', [6, 0.49804999999999994]),
 ('united', [29, 0.4650275862068964]),
 ('pdx', [13, 0.37922307692307694]),
 ('mci', [6, 0.3641833333333333]),
 ('ios', [9, 0.28865555555555555]),
 ('aus', [7, 0.2809714285714286]),
 ('iad', [25, 0.22350799999999998]),
 ('&amp', [34, 0.2085558823529412]),
 ('@virginamerica', [452, 0.1597466814159292]),
 ('cont', [6, 0.15163333333333331]),
 ('@southwestair', [1899, 0.14443712480252763]),
 ('@delta', [56, 0.13226428571428572]),
 ('nyc', [7, 0.11657142857142858]),
 ('app', [17, 0.11309411764705883]),
 ('@jetblue airways', [14, 0.1015642857142857]),
 ('american airlines', [10, 0.08681000000000001]),
 ('delta', [7, 0.0775]),
 ('flight &amp', [7, 0.051328571428571425]),
 ('mia', [8, 0.0442375]),
 ('@jetblue', [549, 0.043553005464480876]),
 ('time &amp', [6, 0.032116666666666675