In [1]:
import pandas as pd
import numpy

# Sentiment analysis

- Preprocessing
    - remove URLs
    - remove Stopwords
    - Lemmatization
- Sentiment Analysis with nltk

In [2]:
clean_vaccine_tweets = pd.read_csv("../data/interim/cleaned_vaccine_tweets.csv", index_col=0)

In [3]:
clean_vaccine_tweets.head()

Unnamed: 0,id,created_at,user,geo,full_text,hashtags,user_id,PfizerBiontech,SputnikV,Sinopharm,Sinovac,Moderna,AstraZeneca,Covaxin,JandJ,user_location,coordinates
0,1338158543359250432,2020-12-13 16:27:13+00:00,"{'id': 76052772, 'id_str': '76052772', 'name':...",,While the world has been on the wrong side of ...,"['covid19', 'supplychain', 'logistics', 'vacci...",76052772,1,0,0,0,0,0,0,0,,
1,1337840331522453504,2020-12-12 19:22:45+00:00,"{'id': 1300382181605494800, 'id_str': '1300382...",,@cnnbrk #COVID19 #CovidVaccine #vaccine #Coron...,"['covid19', 'covidvaccine', 'vaccine', 'corona...",1300382181605494800,1,0,0,0,0,0,0,0,,
2,1338544403795881984,2020-12-14 18:00:29+00:00,"{'id': 1164717209253552000, 'id_str': '1164717...",,The FDA Authorizes Emergency Use Of The Pfizer...,"['pfe', 'pfizer', 'pfizervaccine', 'pfizerbion...",1164717209253552000,1,0,0,0,0,0,0,0,,
3,1337735595704115200,2020-12-12 12:26:34+00:00,"{'id': 1316036067754205200, 'id_str': '1316036...",,The #FDA finally issues #EUA now comes the pro...,"['fda', 'eua', 'pfizerbiontech', 'vaccinated']",1316036067754205200,1,0,0,0,0,0,0,0,,
4,1337850832256176128,2020-12-12 20:04:29+00:00,"{'id': 1110032180237852700, 'id_str': '1110032...",,There have not been many bright days in 2020 b...,"['bidenharris', 'election2020', 'pfizerbiontec...",1110032180237852700,1,0,0,0,0,0,0,0,,


---

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer

# NLP
## Preprocessing

In [5]:
clean_vaccine_tweets["corpus"] = ""

In [6]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to /Users/ayman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ayman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ayman/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

Set lowercase, remove punctuation, remove url:
- "no","nor","not" removed from stopwords because they may be relevant for sentiment
- add "vaccine" to stopwords because it has little information value

In [7]:
def clean_dataset(dataset):
    for i in range(0, len(dataset)):
        #Tokenize and set words to lowercase 
        review = dataset["full_text"][i]
        review = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",review).split())
        review = re.sub("[^a-zA-Z]", " ", review)
        review = review.lower()
        review = review.split()

        #stopwords: 
        all_stopwords = [word for word in stopwords.words("english") if word not in ["no","nor","not"]]
        all_stopwords.extend(["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","&amp","amp","amp;","&amp;","nhs","vaccine","covidvaccine"])
        #lemmatization:
        lemma = nltk.wordnet.WordNetLemmatizer()
        review = " ".join([lemma.lemmatize(word) for word in review if word not in set(all_stopwords)])    
        dataset["corpus"][i] = review


In [8]:
clean_dataset(clean_vaccine_tweets)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["corpus"][i] = review


In [9]:
clean_vaccine_tweets.head()

Unnamed: 0,id,created_at,user,geo,full_text,hashtags,user_id,PfizerBiontech,SputnikV,Sinopharm,Sinovac,Moderna,AstraZeneca,Covaxin,JandJ,user_location,coordinates,corpus
0,1338158543359250432,2020-12-13 16:27:13+00:00,"{'id': 76052772, 'id_str': '76052772', 'name':...",,While the world has been on the wrong side of ...,"['covid19', 'supplychain', 'logistics', 'vacci...",76052772,1,0,0,0,0,0,0,0,,,world wrong side history year hopefully bigges...
1,1337840331522453504,2020-12-12 19:22:45+00:00,"{'id': 1300382181605494800, 'id_str': '1300382...",,@cnnbrk #COVID19 #CovidVaccine #vaccine #Coron...,"['covid19', 'covidvaccine', 'vaccine', 'corona...",1300382181605494800,1,0,0,0,0,0,0,0,,,covid corona pfizerbiontech bbcnews nytimes bb...
2,1338544403795881984,2020-12-14 18:00:29+00:00,"{'id': 1164717209253552000, 'id_str': '1164717...",,The FDA Authorizes Emergency Use Of The Pfizer...,"['pfe', 'pfizer', 'pfizervaccine', 'pfizerbion...",1164717209253552000,1,0,0,0,0,0,0,0,,,fda authorizes emergency use pfizer pfe pfizer...
3,1337735595704115200,2020-12-12 12:26:34+00:00,"{'id': 1316036067754205200, 'id_str': '1316036...",,The #FDA finally issues #EUA now comes the pro...,"['fda', 'eua', 'pfizerbiontech', 'vaccinated']",1316036067754205200,1,0,0,0,0,0,0,0,,,fda finally issue eua come problem transportin...
4,1337850832256176128,2020-12-12 20:04:29+00:00,"{'id': 1110032180237852700, 'id_str': '1110032...",,There have not been many bright days in 2020 b...,"['bidenharris', 'election2020', 'pfizerbiontec...",1110032180237852700,1,0,0,0,0,0,0,0,,,not many bright day best bidenharris winning e...


## Sentiment Analysis with nltk

In [10]:
clean_vaccine_tweets["sentiment"] = dict
clean_vaccine_tweets["sentiment_compound"] = 0.0

Determine sentiment via NLTK.Sentiment:
- Values range from [-1, 1]
- -1 is negative, 0 is neutral, 1 is positive

In [11]:
def sentiment_score(dataset):
    sia = SentimentIntensityAnalyzer()
    for i in range(len(dataset)):
        dataset["sentiment"][i] = sia.polarity_scores(dataset["corpus"][i])
        dataset["sentiment_compound"][i] = dataset["sentiment"][i]["compound"]

In [12]:
sentiment_score(clean_vaccine_tweets)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["sentiment"][i] = sia.polarity_scores(dataset["corpus"][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["sentiment_compound"][i] = dataset["sentiment"][i]["compound"]


In [13]:
clean_vaccine_tweets

Unnamed: 0,id,created_at,user,geo,full_text,hashtags,user_id,PfizerBiontech,SputnikV,Sinopharm,Sinovac,Moderna,AstraZeneca,Covaxin,JandJ,user_location,coordinates,corpus,sentiment,sentiment_compound
0,1338158543359250432,2020-12-13 16:27:13+00:00,"{'id': 76052772, 'id_str': '76052772', 'name':...",,While the world has been on the wrong side of ...,"['covid19', 'supplychain', 'logistics', 'vacci...",76052772,1,0,0,0,0,0,0,0,,,world wrong side history year hopefully bigges...,"{'neg': 0.125, 'neu': 0.766, 'pos': 0.109, 'co...",-0.1027
1,1337840331522453504,2020-12-12 19:22:45+00:00,"{'id': 1300382181605494800, 'id_str': '1300382...",,@cnnbrk #COVID19 #CovidVaccine #vaccine #Coron...,"['covid19', 'covidvaccine', 'vaccine', 'corona...",1300382181605494800,1,0,0,0,0,0,0,0,,,covid corona pfizerbiontech bbcnews nytimes bb...,"{'neg': 0.117, 'neu': 0.405, 'pos': 0.477, 'co...",0.8402
2,1338544403795881984,2020-12-14 18:00:29+00:00,"{'id': 1164717209253552000, 'id_str': '1164717...",,The FDA Authorizes Emergency Use Of The Pfizer...,"['pfe', 'pfizer', 'pfizervaccine', 'pfizerbion...",1164717209253552000,1,0,0,0,0,0,0,0,,,fda authorizes emergency use pfizer pfe pfizer...,"{'neg': 0.126, 'neu': 0.874, 'pos': 0.0, 'comp...",-0.3818
3,1337735595704115200,2020-12-12 12:26:34+00:00,"{'id': 1316036067754205200, 'id_str': '1316036...",,The #FDA finally issues #EUA now comes the pro...,"['fda', 'eua', 'pfizerbiontech', 'vaccinated']",1316036067754205200,1,0,0,0,0,0,0,0,,,fda finally issue eua come problem transportin...,"{'neg': 0.137, 'neu': 0.863, 'pos': 0.0, 'comp...",-0.4019
4,1337850832256176128,2020-12-12 20:04:29+00:00,"{'id': 1110032180237852700, 'id_str': '1110032...",,There have not been many bright days in 2020 b...,"['bidenharris', 'election2020', 'pfizerbiontec...",1110032180237852700,1,0,0,0,0,0,0,0,,,not many bright day best bidenharris winning e...,"{'neg': 0.096, 'neu': 0.6, 'pos': 0.304, 'comp...",0.7347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118267,1407688257177935872,2021-06-23 13:13:28+00:00,"{'id': 1263779139397382100, 'id_str': '1263779...",,#SputnikV Paid #Hyderabad https://t.co/oklatcuWLh,"['sputnikv', 'hyderabad']",1263779139397382100,0,1,0,0,0,0,0,0,India,"[22.3511148, 78.6677428]",sputnikv paid hyderabad,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000
118268,1407699323035557888,2021-06-23 13:57:27+00:00,"{'id': 40623001, 'id_str': '40623001', 'name':...",,The @WHO said its review of how #Russia produc...,"['russia', 'sputnikv', 'coronavirus']",40623001,0,1,0,0,0,0,0,0,,,said review russia produce sputnikv coronaviru...,"{'neg': 0.0, 'neu': 0.877, 'pos': 0.123, 'comp...",0.4215
118269,1407699835856330752,2021-06-23 13:59:29+00:00,"{'id': 61611674, 'id_str': '61611674', 'name':...",,#WHO Finds Production Infringements at #Sputni...,"['who', 'sputnikv', 'russia', 'covid19', 'coro...",61611674,0,1,0,0,0,0,0,0,,,find production infringement sputnikv manufact...,"{'neg': 0.256, 'neu': 0.744, 'pos': 0.0, 'comp...",-0.4767
118270,1407682599515000832,2021-06-23 12:50:59+00:00,"{'id': 126591034, 'id_str': '126591034', 'name...",,When was the #SputnikV\n\n1. Exploratory Stage...,['sputnikv'],126591034,0,1,0,0,0,0,0,0,,,sputnikv exploratory stage animal eff muppets ...,"{'neg': 0.06, 'neu': 0.692, 'pos': 0.248, 'com...",0.7184


Sort dataset by sentiment and create a sub-dataset with location only:

In [14]:
clean_vaccine_tweets = clean_vaccine_tweets.sort_values("sentiment_compound", ascending=False, ignore_index=True)

In [15]:
geo_vaccine_tweets = clean_vaccine_tweets[clean_vaccine_tweets["user_location"].isnull() == False].reset_index(drop=True)

---

Export:

In [16]:
clean_vaccine_tweets.to_csv("../data/processed/vaccine_tweets_with_sentiment.csv")

In [17]:
geo_vaccine_tweets.to_csv("../data/processed/geo_vaccine_tweets_with_sentiment.csv")