In [None]:
! pip install transformers

In [None]:
# install Spacy and a language model

!pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm


# IF YOU ARE USING A NEW APPLE (M1 CPU) COMPUTER use these lines instead.
# pip install -U pip setuptools wheel
# pip install -U 'spacy[apple]'
# python -m spacy download en_core_web_sm

# SC207 Text Mining
## Sentiment Analysis and Entity Recognition
### Using Pre-Trained Models for quick text insights

The two methods we're using today rely on pre-trained models to quickly pull apart and analyse pieces of text with a high degree of complexity. Trained on millions of examples of text from the internet, archives, books etc., these models go beyond simply looking at what words are being used: they consider the placement of words, their immediate and distant context, their role within sentence structure and more, to make inferences about what the text says, what matters in the text, and what it could imply.

#### Tools
Today we're using two packages.
- [Transformers](https://pypi.org/project/transformers/): Allows us to quickly download a pre-trained model designed specifically for sentiment analysis using the HuggingFace 🤗 [AI model repository](https://huggingface.co/)
- [SpaCy](https://spacy.io/): A natural language processing package that relies on its own pre-trained models to provide a large set of text analysis features. Today we'll be using its powerful entity recognition system.

In [None]:
import pandas as pd
from transformers import pipeline
import spacy
import seaborn as sns

# Sentiment Analysis

A tricky area to get right. Prior to pre-trained models, sentiment was determined by matching specific words to a predefined table that gave each word a score depending on how positive/negative the designers felt the word was. Whilst this worked for simple text, sentiment is often context dependent, can be morphed by sarcasm, and changes over time as language evolves. Regularly updated models are shown examples of text that have been labelled as either positive or negative by human annotators, and tested to see if they can accurately predict what a human would label a brand new piece of text.

We can initialise one of those super complex incredibly difficult to build models, but it will take every bit of our coding skills to do so...

In [None]:
get_sentiment = pipeline('sentiment-analysis')

Done.

In [None]:
get_sentiment("I love every brilliant thing right now. Super happy")

In [None]:
get_sentiment("My name is James")

In [None]:
get_sentiment("I am very angry")

It is worth noting two things...
1. Text can ONLY be positive or negative under this model, there is no neutral.
2. The score does not indicate strength of sentiment. It indicates how confident the model is in its prediction. We'll look at this more later.

## Applying it to a whole dataset

In [None]:
def flatten_nested_dicts(text_data):
    """Return a copy of ``text_data`` with dict-valued cells expanded into columns.

    The frame is turned into a list of per-row dicts and rebuilt with
    ``pd.json_normalize``, so a nested value such as ``user -> screen_name``
    ends up in a dotted column (``user.screen_name``). The original index is
    not carried over; the result is numbered from 0.
    """
    return pd.json_normalize(text_data.to_dict(orient='records'))

We will use our Twitter dataset and also the community assignments we generated using NetworkX. We can use these later to examine whether sentiment differs between different groups in our retweet network.

In [None]:
# NOTE(review): unpickling can execute arbitrary code on load — only read a
# pickle file that comes from a source you trust.
tweets = pd.read_pickle('example_twitter_data.pkl')

In [None]:
tweets = tweets[tweets['retweeted_status'].isna()] # remove retweets
tweets = flatten_nested_dicts(tweets)

communities = pd.read_csv('communities.csv', index_col=0)
communities.head()

In [None]:
# Merging together tweet data and community assignments
tweets = tweets.merge(communities,how='left', left_on='user.screen_name', right_index=True).dropna(subset='community')
tweets.shape

In [None]:
# Draw a reproducible sample: a fixed random_state returns the same tweets on every
# run, and capping n at len(tweets) avoids a ValueError when fewer than 500 remain.
sample = (
    tweets[['full_text', 'community']]
    .sample(n=min(500, len(tweets)), random_state=42)
    .reset_index(drop=True)
)

In [None]:
sample['sentiment'] = get_sentiment(sample['full_text'].tolist())
sample

In [None]:
label_score = pd.json_normalize(sample['sentiment'])
sample = pd.concat([sample,label_score], axis=1)

sample['community'] = sample['community'].astype(int)

sample.head()

## Visuals 
### Distribution of Sentiment

In [None]:
sample.groupby('label').count()

In [None]:
sns.catplot(data=sample, x='label', kind='count')

In [None]:
sample.groupby(['community','label']).count()

In [None]:
sns.catplot(data=sample, y='community',hue='label', kind='count')

In [None]:
order = sample['community'].value_counts().index
sns.catplot(data=sample, y='community',hue='label', kind='count',order=order).set(title='Sentiment of Tweets ordered by community tweet freq')


### Score Confidence

In [None]:
sample.groupby('community')['score'].describe()

In [None]:
sns.catplot(data=sample, x='community', y='score',kind='box', hue='label', aspect=2,order=order)

In [None]:
# Mean model confidence per community (rows) and label (columns).
# 'score' is selected before averaging so the text and dict columns are never passed
# to mean(), which raises a TypeError on non-numeric columns in pandas >= 2.0.
confidence_data = sample.groupby(['community','label'])['score'].mean().unstack()
confidence_data

In [None]:
sns.heatmap(data=confidence_data, annot=True, linewidths=0.3,  cmap='coolwarm')

# Named Entity Recognition

Named entity recognition (NER) is the technique of extracting key entities within a piece of text, such as:
- people
- places
- organisations
- dates
- values
- currencies etc.

SpaCy's processing examines each word in context and uses this to predict which tokens likely refer to particular types of entities like people, organisations, dates etc. It is not using any limited list or reference to "look up" these entities, but instead identifies them based on contextual cues.


In [None]:
text_data = pd.read_csv('sample_news_large.csv')

In [None]:
text_data.head()

In [None]:
# Load the small English model: 'en_core_web_sm' is the model the install cell
# downloads, so this works on a fresh environment without a further download.
nlp = spacy.load('en_core_web_sm')

In [None]:
trump = nlp("""A New York judge has ordered President Donald Trump to pay $2m (£1.6m)"""\
            """ for misusing funds from his charity to finance his 2016 political campaign."""\
            """ The Donald J Trump Foundation closed down in 2018. Prosecutors had accused it"""\
            """ of working as "little more than a chequebook" for Mr Trump's interests."""\
            """ Charities such as the one Mr Trump and his three eldest children headed cannot"""\
            """ engage in politics, the judge ruled.""")

# Source: https://www.bbc.co.uk/news/world-us-canada-50338231

In [None]:
# we can access the entities with the .ents attribute
trump.ents

In [None]:
# every object in the entities list has a text attribute and a label attribute to tell you the type of entity it is.

for entity in trump.ents:
    print(entity.text, entity.label_)

In [None]:
# as we're in Jupyter we can also use SpaCy's built in visualiser

spacy.displacy.render(trump,style='ent', jupyter=True)

In [None]:
# if you want to save the annotated version of the
# text you can save to html using this function.

def save_displacy_to_html(doc, filename, style='ent'):
    """Render ``doc`` with displaCy and write the result to an HTML file.

    Parameters
    ----------
    doc : the processed document handed to ``spacy.displacy.render``.
    filename : path of the HTML file to create (overwritten if it exists).
    style : displaCy visualisation style, forwarded to ``render``.
        Defaults to ``'ent'`` (entity highlighting).
    """
    # Forward the caller's style; jupyter=False makes render return the markup
    # and page=True wraps it in a full HTML page.
    html_data = spacy.displacy.render(doc, style=style, jupyter=False, page=True)
    # Plain 'w' is enough: the file is only written, never read back.
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(html_data)

save_displacy_to_html(trump, 'test.html', style='ent')

In [None]:
# let's create a function that can extract specific types of entities from a text

def entity_extractor(nlp_doc, entity_type=None, count_all=True):
    """Collect the named entities found in a processed document.

    Parameters
    ----------
    nlp_doc : object exposing ``.ents``; each entity must provide ``.text``
        and ``.label_``.
    entity_type : str, optional
        Entity label to keep. It is upper-cased before being compared with
        ``label_``, so ``'person'`` and ``'PERSON'`` are equivalent. When
        ``None``, every entity is returned.
    count_all : bool
        If True (default) repeated mentions are kept, so the result can be
        used for frequency counts. If False each entity appears once.

    Returns
    -------
    list
        ``(text, label)`` tuples when ``entity_type`` is None, otherwise the
        entity texts only. Entities are listed in document order.
    """
    if entity_type is None:
        ents = [(ent.text, ent.label_) for ent in nlp_doc.ents]
    else:
        wanted_label = entity_type.upper()  # normalise once, not once per entity
        ents = [ent.text for ent in nlp_doc.ents if ent.label_ == wanted_label]
    if not count_all:
        # dict.fromkeys drops duplicates but keeps first-seen order, so the
        # output is the same on every run (a set gives an arbitrary order).
        ents = list(dict.fromkeys(ents))
    return ents

In [None]:
entity_extractor(trump)

In [None]:
entity_extractor(trump, 'person')

In [None]:
docs = nlp.pipe(text_data['text'])
people = [entity_extractor(doc,'person') for doc in docs]

In [None]:
text_data['people'] = people
text_data['people']

In [None]:
people_data = text_data.explode('people')[['query','people','title']]
people_data


In [None]:
# most mentioned people
people_data['people'].value_counts()[:10]

In [None]:
# top ten people per group
for query,data in people_data.groupby('query'):
    print(f"****{query}****")
    print(data['people'].value_counts()[:10])
    print()

In [None]:
top_people = people_data.groupby('people',as_index=False).count().nlargest(5,'query')
top_people


In [None]:
sns.catplot(data=top_people, y='people',x='query', kind='bar',height=5, aspect=2).set(xlabel='Freq', ylabel='Person', title='5 Most Mentioned People')

In [None]:

for query,data in people_data.groupby('query'):
    top_for_query = data.groupby('people', as_index=False).count().nlargest(5,'title')
    sns.catplot(data=top_for_query,x='title',y='people', kind='bar', aspect=2).set(title=f'{query.title()}: Top 5 People',
                                                                         xlabel='freq',
                                                                         ylabel='Person')