In [1]:
# Load the tweet dataset from CSV into a DataFrame.
import pandas as pd
ruok_f = 'RUOK_Combined_Location.csv'  # path is relative to the notebook's working directory
ruok=pd.read_csv(ruok_f)

In [2]:
# Report the dataset dimensions as (rows, columns).
print(ruok.shape)
# List the column names available in the tweet dataset.
print(ruok.columns)

(24798, 28)
Index(['query', 'query_type', 'id', 'created_at', 'user_screen_name', 'source',
       'lang', 'retweet_count', 'favorite_count', 'in_reply_to_screen_name',
       'place', 'coordinates', 'user.time_zone', 'full_text', 'hashtags',
       'user_mentions', 'in_reply_to_status_id', 'place_type', 'place_name',
       'place_full_name', 'country_code', 'country', 'contained_within',
       'coordinate_box', 'x_coor', 'y_coor', 'reply_count', 'level'],
      dtype='object')


In [3]:
# Preview the raw tweet text before any cleaning is applied.
ruok['full_text'].head(3)

0    RT @ruokday: #RUOKDay2019 is Thursday 12 Septe...
1    RT @ruokday: #RUOKDay2019 is September 12. \xe...
2    RT @ruokday: #RUOKDay2019 is September 12. \xe...
Name: full_text, dtype: object

# Data Pre-processing

1. Remove Twitter handles and URLs
2. Remove punctuation and digits
3. Tokenisation
4. Remove Stopwords
5. Stemming

In [4]:
import re
import string
import warnings

import numpy as np
import nltk

# Download every NLTK resource this notebook loads so it runs on a fresh machine:
#   stopwords      -> nltk.corpus.stopwords.words('english')
#   punkt          -> nltk.word_tokenize
#   vader_lexicon  -> SentimentIntensityAnalyzer
# 'wordnet' is kept because the notebook has always fetched it.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of 'punkt';
# add it here if word_tokenize raises LookupError.
for _nltk_resource in ('wordnet', 'stopwords', 'punkt', 'vader_lexicon'):
    nltk.download(_nltk_resource)

# NOTE(review): this silences *all* warnings, including pandas' SettingWithCopyWarning.
warnings.filterwarnings("ignore")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/fengyifan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
#remove Twitter Handles (@user)
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    The pattern itself is substituted directly. Re-using each matched string
    as a new, unescaped regex had two problems: a short match such as ``@ab``
    also clipped the front of a longer one (``@abc`` -> ``c``), and a match
    containing regex metacharacters raised ``re.error``.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to delete.

    Returns
    -------
    str
        The text with all matches deleted; surrounding whitespace is kept.
    """
    return re.sub(pattern, '', input_txt)

In [6]:
# Strip @handles from every tweet. The pattern is a raw string so that "\w" reaches
# the regex engine verbatim instead of relying on an invalid string escape sequence.
ruok['tidy_tweet'] = np.vectorize(remove_pattern)(ruok['full_text'], r"@[\w]*")
ruok['tidy_tweet'].head(3)

0    RT : #RUOKDay2019 is Thursday 12 September. Re...
1    RT : #RUOKDay2019 is September 12. \xe2\x9d\xa...
2    RT : #RUOKDay2019 is September 12. \xe2\x9d\xa...
Name: tidy_tweet, dtype: object

In [7]:
#remove url
def remove_url(tweet):
    """Delete every token that starts with ``http`` (up to the next whitespace) from ``tweet``."""
    url_pattern = r"http\S+"
    return re.sub(url_pattern, "", tweet)

In [8]:
# Drop URLs from the handle-free text, one tweet at a time.
ruok['tidy_tweet'] = ruok['tidy_tweet'].map(remove_url)

In [9]:
print(string.punctuation)
#remove punctuations
def remove_punct(text):
    """Strip ASCII punctuation characters and runs of the digits 0-9 from ``text``."""
    # A translation table with an empty mapping and a delete-set removes each
    # punctuation character in a single pass.
    without_punct = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub('[0-9]+', '', without_punct)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [10]:
# Remove punctuation and digits from every tweet.
ruok['tidy_tweet'] = ruok['tidy_tweet'].apply(remove_punct)

In [11]:
# Preview the text after handle, URL, punctuation and digit removal.
ruok['tidy_tweet'].head(3)

0    RT  RUOKDay is Thursday  September Retweet thi...
1    RT  RUOKDay is September  xexdxaxefxbxf this t...
2    RT  RUOKDay is September  xexdxaxefxbxf this t...
Name: tidy_tweet, dtype: object

In [12]:
#tokenisation
def tokenization(text):
    """Split ``text`` into word tokens on runs of non-word characters.

    ``re.split`` yields an empty string whenever the text starts or ends with
    a separator (or is empty). Those empty tokens are dropped so they do not
    flow into stop-word removal, stemming and the re-joined string.

    Parameters
    ----------
    text : str
        Text to tokenise.

    Returns
    -------
    list of str
        Non-empty tokens in their original order.
    """
    # Raw string: "\W" must reach the regex engine as-is, not as a string escape.
    return [token for token in re.split(r'\W+', text) if token]

In [13]:
# Lower-case each cleaned tweet, then split it into tokens.
ruok['Tweet_tokenized'] = ruok['tidy_tweet'].map(lambda tweet: tokenization(tweet.lower()))
ruok['Tweet_tokenized'].head(3)

0    [rt, ruokday, is, thursday, september, retweet...
1    [rt, ruokday, is, september, xexdxaxefxbxf, th...
2    [rt, ruokday, is, september, xexdxaxefxbxf, th...
Name: Tweet_tokenized, dtype: object

In [14]:
#remove stop words
stopword = nltk.corpus.stopwords.words('english')
# Extra tokens to discard on top of NLTK's English list: mostly fragments of escaped
# byte sequences (e.g. 'xe2', 'x80'), URL remnants and a few dataset-specific words.
stopword.extend(['xe2','x80','xa6','b','b\'rt','xf0','x9f','n','thi','x91','x99','wa','ha','x89','x9d','x8f','his.w',
               'nhttp','x98','x8e','xb8','x92','xa4','x8c','xa8','x94','x9c','xa5','see','xa7','v\'','x86','xb6','...',
               'x90','xb7','x8d','x87','xec','x83','xa9','xb0','x89http','TRUE','x8b','x85','x88','x82','x96','x94a',
               'x9a','xb5','x9e','xbf','xaf','xeb','xac','xed','xbc','xad','x98lover','x99m','x93','xc2','x8a','x97',
               'xa2','x95','xb9','xbd','x81','xa1','x99re','x99re','xba','x9cthi','x99ve','xa3','x99all','xc3','xb3',
               'x9cthe','x99ll','xaa','x9clover','x9cyou','x98new','x9c16','xef','x99t','xa0','spotify','https','x99s',
               'nthis','htt','nhttps','co','hi','xaq','rt','also','us','go','sp','let','put','get','got','x84','ncheck',
               'xbb','x9c','x94','x9','th','gladysb','xexdxaxefxbxf','httxexxa',"ruoxexxa",
               '5v9mup3qzt','x9care','xgdvgco','x8fthis'])
# Frozen snapshot used for lookups: set membership is O(1), whereas testing every token
# against the list above is a linear scan. `stopword` stays a list for any other code
# that reads it, but changes made to it after this cell are not picked up below.
_stopword_lookup = frozenset(stopword)
def remove_stopwords(text):
    """Return the tokens of ``text`` (an iterable of str) that are not stop words.

    Token order is preserved; matching is exact and case-sensitive.
    """
    return [word for word in text if word not in _stopword_lookup]

In [15]:
# Filter the stop words out of each token list.
ruok['Tweet_nonstop'] = ruok['Tweet_tokenized'].apply(remove_stopwords)
ruok['Tweet_nonstop'].head(3)

0    [ruokday, thursday, september, retweet, tweet,...
1    [ruokday, september, tweet, reminder, day, inf...
2    [ruokday, september, tweet, reminder, day, inf...
Name: Tweet_nonstop, dtype: object

In [16]:
#stemming
ps = nltk.PorterStemmer()
def stemming(text):
    """Return a list holding the Porter stem of each token in ``text``, in order."""
    return list(map(ps.stem, text))

In [17]:
# Stem every remaining token of each tweet.
ruok['Tweet_stem'] = ruok['Tweet_nonstop'].apply(stemming)
ruok['Tweet_stem'].head(3)

0    [ruokday, thursday, septemb, retweet, tweet, r...
1    [ruokday, septemb, tweet, remind, day, info, h...
2    [ruokday, septemb, tweet, remind, day, info, h...
Name: Tweet_stem, dtype: object

# Vader Sentiment

In [18]:
#convert stemmed tweets to strings
def list_to_string(text):
    """Join the tokens in ``text`` into one string separated by single spaces."""
    separator = ' '
    return separator.join(text)

In [19]:
# Re-join each stemmed token list into a single string for the sentiment scorers.
ruok['Tweet_string'] = ruok['Tweet_stem'].apply(list_to_string)
ruok[['Tweet_stem','Tweet_string']].head(3)

Unnamed: 0,Tweet_stem,Tweet_string
0,"[ruokday, thursday, septemb, retweet, tweet, r...",ruokday thursday septemb retweet tweet remind ...
1,"[ruokday, septemb, tweet, remind, day, info, h...",ruokday septemb tweet remind day info help lea...
2,"[ruokday, septemb, tweet, remind, day, info, h...",ruokday septemb tweet remind day info help lea...


In [20]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [21]:
#Vader sentiment analysis
analyzer = SentimentIntensityAnalyzer()
# Score each tweet exactly once and keep all four components, instead of
# re-running polarity_scores separately for every output column.
# NOTE(review): the scored text is lower-cased, punctuation-free and stemmed;
# confirm that is the intended input for the VADER lexicon.
vader_scores = pd.DataFrame(
    [analyzer.polarity_scores(text) for text in ruok['Tweet_string']],
    index=ruok.index,
    columns=['neg', 'neu', 'pos', 'compound'],
)
#Compound score between -1 (most negative) and 1 (most positive)
ruok['sentiment_compound_polarity'] = vader_scores['compound']
ruok['sentiment_neutral'] = vader_scores['neu']
ruok['sentiment_negative'] = vader_scores['neg']
ruok['sentiment_pos'] = vader_scores['pos']
ruok['sentiment_type']=''
#mark the sentiment type for each tweet
ruok.loc[ruok.sentiment_compound_polarity>0,'sentiment_type']='Positive'
ruok.loc[ruok.sentiment_compound_polarity==0,'sentiment_type']='Neutral'
ruok.loc[ruok.sentiment_compound_polarity<0,'sentiment_type']='Negative'

In [22]:
# Count tweets per sentiment label (Positive / Neutral / Negative).
ruok['sentiment_type'].value_counts()

Positive    16371
Neutral      5059
Negative     3368
Name: sentiment_type, dtype: int64

In [23]:
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise Plotly's offline notebook mode before any figure is shown.
init_notebook_mode(connected=True)

In [86]:
#pie chart
labels = ['Positive', 'Neutral', 'Negative']
colors = ['blue', 'gray', 'red']
# Take the slice sizes from the data rather than hard-coding them, so the chart
# always agrees with the sentiment_type counts; labels with no tweets become 0.
values = ruok['sentiment_type'].value_counts().reindex(labels, fill_value=0).tolist()
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.update_traces(marker=dict(colors=colors))
fig.update_layout(height=500,width=500)
fig.show()

In [25]:
# Parse the timestamp column once, then derive the calendar date and hour of day from it.
created_ts = pd.to_datetime(ruok['created_at'])
ruok['created_at'] = created_ts
ruok['created_at_date'] = created_ts.dt.date
ruok['created_at_hour'] = created_ts.dt.hour

In [26]:
# Average compound polarity for each calendar date (Series indexed by date).
senti_date = ruok.groupby("created_at_date")["sentiment_compound_polarity"].mean()

In [27]:
# Average compound polarity for each hour of the day (Series indexed by hour).
senti_hour = ruok.groupby("created_at_hour")["sentiment_compound_polarity"].mean()

In [87]:
from plotly.subplots import make_subplots

# Left panel: mean compound score per date; right panel: mean compound score per hour.
fig = make_subplots(rows=1, cols=2)

# The trace names appear in hover labels, so they describe the series
# (they previously read "citations", a leftover from other code).
fig.add_trace(go.Scatter(x = senti_date.index,
                         y = senti_date.values,
                         mode = "lines+markers",
                         name = "Mean compound score by date"), row=1, col=1)

fig.add_trace(go.Scatter(x = senti_hour.index,
                         y = senti_hour.values,
                         mode = "lines+markers",
                         name = "Mean compound score by hour"), row=1, col=2)

fig.update_xaxes(title_text="Hour",range=[-0.5,24],row=1, col=2)
fig.update_xaxes(title_text="Date",range=['2019-09-02','2019-10-06'], row=1, col=1)

# Both panels share the same y-range so they can be compared directly.
fig.update_yaxes(title_text="Average Compound Scores", range=[-0.1,0.65], row=1, col=1)
fig.update_yaxes(range=[-0.1,0.65], row=1, col=2)

fig.update_layout(
    shapes=[
        # Dotted vertical marker at 2019-09-12, spanning the full y-range.
        go.layout.Shape(
            type="line",
            x0="2019-09-12",
            y0=-0.1,
            x1="2019-09-12",
            y1=0.65,
            line=dict(color="RoyalBlue",width=3,dash="dot"),
            layer='above')
    ]
)
fig.update_layout(height=500, width=1000,showlegend=False)
fig.show()

In [29]:
# Heat map of mean polarity by location: load the table that maps place names
# to latitude/longitude, which is joined to the tweets in the following cell.
geo = pd.read_csv("location_geocode.csv")
geo.head(10)

Unnamed: 0,name,lat,long
0,"Brisbane, Queensland",-27.469771,153.025124
1,"Wollongong, NSW, AUSTRALIA",-34.427812,150.893061
2,Melbourne,-37.813628,144.963058
3,Philippines,12.879721,121.774017
4,Australia,-25.274398,133.775136
5,"Nevada, USA",38.80261,-116.419389
6,Melbourne Australia,-37.813628,144.963058
7,United States,37.09024,-95.712891
8,"California,orlando",28.485102,-81.4323
9,"Lower Saxony, Germany",52.636704,9.845077


In [30]:
# Keep only tweets that carry a place name, then attach coordinates by exact name match.
has_place = ruok['place_full_name'].notna()
ruok_place = ruok[has_place].merge(geo, how='inner', left_on='place_full_name', right_on='name')

In [31]:
#ruok_place.to_csv('ruok_place.csv')

In [32]:
#citygraph
# Work on an explicit copy so the rename below modifies this frame itself rather
# than a temporary view of ruok_place (an in-place replace on a selected column
# is a chained assignment and is not guaranteed to stick).
city_df = ruok_place[['place_name','created_at_date','sentiment_compound_polarity','sentiment_type']].copy()
city_df['place_name'] = city_df['place_name'].replace('Perth (WA)','Perth')
#select cities with the most number of tweets
cities = ['Sydney', 'Melbourne','Brisbane','Canberra','Perth','Gold Coast','Adelaide']
city_df= city_df[city_df.place_name.isin(cities)]
print(city_df['place_name'].value_counts())

Sydney        124
Melbourne     111
Brisbane       61
Perth          37
Canberra       22
Gold Coast     20
Adelaide       11
Name: place_name, dtype: int64


In [33]:
import plotly.express as px
# Only polar tweets are plotted; each tweet contributes one stacked segment to its city's bar.
is_polar = city_df.sentiment_type.isin(['Positive','Negative'])
city_senti = city_df[is_polar]
fig = px.bar(city_senti, x="place_name", y="sentiment_compound_polarity", color='sentiment_type')
fig.update_layout(
    title_text="Vader Sentiment Analysis: Sum of Sentiment Compound Scores",
    xaxis_title='City',
    yaxis_title='Sum of Compound Sentiment Scores',
)
fig.show()

In [34]:
# Collapse to one row per (date, city) holding the mean compound polarity.
city_df = city_df.groupby(['created_at_date','place_name'], as_index=False)['sentiment_compound_polarity'].mean()

In [35]:
# One frame per city for the four-panel chart.
bris_df = city_df.loc[city_df['place_name'].eq('Brisbane')]
melb_df = city_df.loc[city_df['place_name'].eq('Melbourne')]
sy_df = city_df.loc[city_df['place_name'].eq('Sydney')]
can_df = city_df.loc[city_df['place_name'].eq('Canberra')]

In [36]:
fig = make_subplots(rows=2, cols=2, subplot_titles=("Melbourne", "Sydney", "Brisbane",
                                                    "Canberra"))

# (frame, row, col) for each panel, matching the subplot titles above.
# add_trace with row/col replaces the deprecated append_trace.
city_panels = [
    (melb_df, 1, 1),
    (bris_df, 2, 1),
    (sy_df, 1, 2),
    (can_df, 2, 2),
]
for panel_frame, panel_row, panel_col in city_panels:
    fig.add_trace(go.Scatter(
        x = panel_frame.created_at_date,
        y = panel_frame.sentiment_compound_polarity,
        mode="lines"), row=panel_row, col=panel_col)

# With no row/col given, the same range is applied to every panel.
fig.update_yaxes(range=[-1,1])
fig.update_xaxes(range=['2019-09-03','2019-10-05'])

fig.update_layout(height=800, width=900, title_text="Vader Sentiment Analysis: Average Compound Scores by City",showlegend=False)
fig.show()

In [37]:
# Explicit copy so the rename below is applied to this frame and not to a
# temporary view of ruok_place (in-place replace on a selected column is a chained assignment).
map_df = ruok_place[['place_full_name','lat','long','sentiment_compound_polarity']].copy()
map_df['place_full_name'] = map_df['place_full_name'].replace('Perth, Perth (WA)','Perth, Western Australia')
# Mean compound polarity per distinct (place, lat, long) combination.
map_df = map_df.groupby(['place_full_name','lat', 'long']).agg({'sentiment_compound_polarity': 'mean'}).reset_index()
#create a new column to indicate the sentiment type
map_df.loc[map_df.sentiment_compound_polarity>0,'sentiment_type']='Positive'
map_df.loc[map_df.sentiment_compound_polarity==0,'sentiment_type']='Neutral'
map_df.loc[map_df.sentiment_compound_polarity<0,'sentiment_type']='Negative'

In [38]:
# Number of places whose mean polarity is positive, neutral or negative.
map_df['sentiment_type'].value_counts()

Positive    60
Neutral     10
Negative    10
Name: sentiment_type, dtype: int64

In [39]:
# Neutral places are left off the map; the marker radius is |mean polarity| scaled by 20.
non_neutral = map_df.sentiment_type != 'Neutral'
map_df = map_df[non_neutral]
map_df['polarity_abs'] = map_df['sentiment_compound_polarity'].abs() * 20
map_df.shape

(70, 6)

In [40]:
import folium 
import webbrowser

In [41]:
# Initial centre of the base map (close to the 'Australia' entry in the geocode table).
latitude = -25.734968
longitude = 134.489563
sentiment_map = folium.Map(location=[latitude, longitude], zoom_start=3)

In [42]:
# Fill colour per sentiment label; neutral places were filtered out beforehand.
colordict = {'Positive': 'blue','Negative': 'red'}
for lat, lon,sentiment_type, polarity_abs,place, polarity_original in zip(map_df['lat'], map_df['long'],
                         map_df['sentiment_type'],map_df['polarity_abs'],map_df['place_full_name'],
                         map_df['sentiment_compound_polarity']):
    folium.CircleMarker(
        [lat, lon],
        radius=polarity_abs,
        popup = (str(place)+":"+str(polarity_original)),
        # 'b' is matplotlib shorthand and not a colour name a browser understands;
        # the outline colour is spelled out instead.
        # NOTE(review): confirm a blue outline is wanted on the red markers too.
        color='blue',
        fill_color=colordict[sentiment_type],
        fill=True,
        fill_opacity=0.7
        ).add_to(sentiment_map)
sentiment_map
#click on a circle marker to see the place name and the compound polarity score

# NRC Sentiment

In [43]:
# Add a running integer ID (0 .. len-1) that later cells use as the merge key.
ruok['UniqueID'] = np.arange(len(ruok))
# Persist the processed frame; the NRC cell below reads this file back in.
ruok.to_csv('ruok.csv')

In [44]:
from tqdm import tqdm_notebook as tqdm
import csv
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer

In [45]:
input_csv_filename='ruok.csv'

data = {'text':[],'UniqueID': []}

# newline='' lets the csv module handle line breaks inside quoted fields, and the
# explicit encoding avoids depending on the platform default.
with open(input_csv_filename, newline='', encoding='utf-8') as csv_file:
    # Columns are read by header name rather than by position, so the result no
    # longer depends on how many columns precede the text in the export.
    csv_reader = csv.DictReader(csv_file, delimiter=',')
    # Count the header line too, as the progress message always has.
    line_count = 0 if csv_reader.fieldnames is None else 1
    for row in csv_reader:
        data['text'].append(row['Tweet_string'])
        data['UniqueID'].append(row['UniqueID'])
        line_count += 1
    print(f'Processed {line_count} lines.')
df = pd.DataFrame(data=data)


def text_emotion(df, column):
    '''
    Takes a DataFrame and a specified column of text and adds 10 columns to the
    DataFrame for each of the 10 emotions in the NRC Emotion Lexicon, with each
    column containing the value of the text in that emotions
    INPUT: DataFrame, string
    OUTPUT: a copy of the original DataFrame with ten new columns

    Each document is tokenised, every token is lower-cased and Snowball-stemmed,
    and the lexicon associations of every stem found in the lexicon are summed
    per emotion. The lexicon file is read from the working directory.

    NOTE(review): stems are compared against the lexicon words as written in the
    file, so a stem that differs from its dictionary form finds no match;
    confirm this is the intended matching rule.
    '''

    new_df = df.copy()
    #read the nrc lexicon file
    filepath = ('NRC-Emotion-Lexicon-Wordlevel-v0.92.txt')
    emolex_df = pd.read_csv(filepath,
                            names=["word", "emotion", "association"],
                            sep='\t')
    # One row per lexicon word, one column per emotion.
    emolex_words = emolex_df.pivot(index='word',
                                   columns='emotion',
                                   values='association')
    emotions = emolex_words.columns
    emolex_values = emolex_words.to_numpy()
    # word -> vector of associations: a constant-time lookup instead of scanning
    # the whole lexicon table for every token.
    emolex_lookup = dict(zip(emolex_words.index, emolex_values))
    # Running totals, one row per document, in the same order as new_df.
    totals = np.zeros((len(new_df), len(emotions)), dtype=emolex_values.dtype)

    stemmer = SnowballStemmer("english")

    # len(new_df) gives the row count without materialising every row first.
    with tqdm(total=len(new_df)) as pbar:
        for position, text in enumerate(new_df[column]):
            pbar.update(1)
            for word in word_tokenize(text):
                word_scores = emolex_lookup.get(stemmer.stem(word.lower()))
                if word_scores is not None:
                    totals[position] += word_scores

    emo_df = pd.DataFrame(totals, index=df.index, columns=emotions)
    new_df = pd.concat([new_df, emo_df], axis=1)

    return new_df

# Score every tweet's text against the NRC lexicon (iterates over all rows, with a progress bar).
result_df = text_emotion(df, 'text')

Processed 24799 lines.


HBox(children=(IntProgress(value=0, max=24798), HTML(value='')))




In [46]:
# Preview the per-tweet emotion totals next to the scored text.
result_df.head(3)

Unnamed: 0,text,UniqueID,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,ruokday thursday septemb retweet tweet remind ...,0,0,0,0,0,0,0,0,0,0,0
1,ruokday septemb tweet remind day info help lea...,1,0,0,0,0,0,0,1,0,0,0
2,ruokday septemb tweet remind day info help lea...,2,0,0,0,0,0,0,1,0,0,0


In [47]:
# Save the NRC scores to disk (the DataFrame index is written as the first column).
result_df.to_csv('nrc_result.csv')

In [48]:
# Reload the saved NRC scores from disk.
nrc_f = 'nrc_result.csv'
nrc_result=pd.read_csv(nrc_f)

In [49]:
# Attach the NRC lexicon scores to the tweet data through the shared UniqueID key.
nrc = ruok.merge(nrc_result, how='inner', on='UniqueID')

In [50]:
# List the columns available after the merge.
nrc.columns

Index(['query', 'query_type', 'id', 'created_at', 'user_screen_name', 'source',
       'lang', 'retweet_count', 'favorite_count', 'in_reply_to_screen_name',
       'place', 'coordinates', 'user.time_zone', 'full_text', 'hashtags',
       'user_mentions', 'in_reply_to_status_id', 'place_type', 'place_name',
       'place_full_name', 'country_code', 'country', 'contained_within',
       'coordinate_box', 'x_coor', 'y_coor', 'reply_count', 'level',
       'tidy_tweet', 'Tweet_tokenized', 'Tweet_nonstop', 'Tweet_stem',
       'Tweet_string', 'sentiment_compound_polarity', 'sentiment_neutral',
       'sentiment_negative', 'sentiment_pos', 'sentiment_type',
       'created_at_date', 'created_at_hour', 'UniqueID', 'Unnamed: 0', 'text',
       'anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative',
       'positive', 'sadness', 'surprise', 'trust'],
      dtype='object')

In [51]:
# Keep the location, text, date and emotion columns. The explicit copy makes nrc an
# independent frame, because later cells add and overwrite columns on it.
nrc = nrc[['place_full_name','place_name','Tweet_string','created_at_date','anger',
       'anticipation', 'disgust', 'fear', 'joy', 'negative', 'positive',
       'sadness', 'surprise', 'trust']].copy()

In [52]:
# Save the trimmed NRC table (raw per-tweet emotion totals) to disk.
nrc.to_csv('nrc.csv')

In [53]:
from nltk import tokenize
# Number of tokens in each processed tweet; used as the denominator for the emotion ratios.
nrc['word_count'] = nrc['Tweet_string'].apply(lambda tweet: len(tokenize.word_tokenize(tweet)))

In [54]:
# Preview the emotion totals together with the new word_count column.
nrc.head(3)

Unnamed: 0,place_full_name,place_name,Tweet_string,created_at_date,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust,word_count
0,,,ruokday thursday septemb retweet tweet remind ...,2019-09-07,0,0,0,0,0,0,0,0,0,0,10
1,,,ruokday septemb tweet remind day info help lea...,2019-09-07,0,0,0,0,0,0,1,0,0,0,10
2,,,ruokday septemb tweet remind day info help lea...,2019-09-07,0,0,0,0,0,0,1,0,0,0,10


In [55]:
# All ten NRC columns: eight emotions plus the negative/positive polarity flags.
emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative', 'positive', 'sadness', 'surprise', 'trust']

In [56]:
# Convert each emotion total into a per-tweet ratio: emotion total / words in the tweet.
# This overwrites the columns in place, so running the cell twice divides twice.
nrc[emotions] = nrc[emotions].div(nrc['word_count'], axis=0)

In [57]:
# Preview the emotion columns after conversion to per-tweet ratios.
nrc.head()

Unnamed: 0,place_full_name,place_name,Tweet_string,created_at_date,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust,word_count
0,,,ruokday thursday septemb retweet tweet remind ...,2019-09-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
1,,,ruokday septemb tweet remind day info help lea...,2019-09-07,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,10
2,,,ruokday septemb tweet remind day info help lea...,2019-09-07,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,10
3,,,ruokday septemb tweet remind day info help lea...,2019-09-07,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,10
4,,,ruokday septemb tweet remind day info help lea...,2019-09-07,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,10


In [58]:
#calculate the proportions of words associated with each emotion
word_total = nrc['word_count'].sum()

def _pooled_share(emotion):
    """Share of all words in the corpus that are associated with ``emotion``.

    The emotion columns hold per-tweet ratios (total / word_count) once the
    ratio cell above has run, so each ratio is multiplied back by the tweet's
    word count to recover the total before pooling. Summing the ratios
    themselves and dividing by the corpus word count does not give a proportion.
    Tweets with zero words produce NaN ratios, which the sum skips.
    """
    return (nrc[emotion] * nrc['word_count']).sum() / word_total

anger = _pooled_share('anger')
anticipation = _pooled_share('anticipation')
disgust = _pooled_share('disgust')
fear = _pooled_share('fear')
joy = _pooled_share('joy')
sadness = _pooled_share('sadness')
surprise = _pooled_share('surprise')
trust = _pooled_share('trust')
#positive = _pooled_share('positive')
#negative = _pooled_share('negative')

In [85]:
# Bar chart of the eight emotion shares, largest on the left.
emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy','sadness', 'surprise', 'trust']
value = [anger,anticipation,disgust,fear,joy,sadness,surprise,trust]
data = [go.Bar(x=emotions, y=value)]

# Categories are listed in ascending order of value; the reversed axis then
# displays them in descending order.
ascending_pairs = sorted(zip(value, emotions))
layout = dict(
    xaxis=dict(
        categoryorder='array',
        autorange='reversed',
        categoryarray=[name for _, name in ascending_pairs],
    )
)
fig = go.FigureWidget(data=data, layout=layout)

fig.update_layout(
    height=500,
    width=700,
    showlegend=False,
    xaxis_title_text="Emotion",
    yaxis_title_text="Ratio",
)
fig

FigureWidget({
    'data': [{'type': 'bar',
              'uid': '0977e640-8b8d-4a0f-aa2e-fbea638a848e',
     …

In [60]:
# Long format (one row per tweet and emotion), then the mean ratio per date and emotion.
emotion_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']
nrc_date = (
    nrc[['created_at_date'] + emotion_columns]
    .melt(id_vars='created_at_date', var_name='emotion', value_name='average_sentiment')
    .groupby(['created_at_date', 'emotion'])
    .agg({'average_sentiment': 'mean'})
    .reset_index()
)
# Dates are converted to datetime for the time-series plots.
nrc_date['created_at_date'] = pd.to_datetime(nrc_date['created_at_date'])

In [61]:
# Preview the per-date, per-emotion averages.
nrc_date.head(5)

Unnamed: 0,created_at_date,emotion,average_sentiment
0,2019-09-03,anger,0.041667
1,2019-09-03,anticipation,0.0
2,2019-09-03,disgust,0.03125
3,2019-09-03,fear,0.0
4,2019-09-03,joy,0.0


In [62]:
import plotly.express as px
# One line per emotion: mean per-tweet ratio by date.
fig = px.line(nrc_date, x="created_at_date", y="average_sentiment", color='emotion')
fig.update_xaxes(title_text="Date",range=['2019-09-02','2019-10-06'])
# The y-axis is numeric, so it is left to auto-range; the date range that was
# copied over from the x-axis does not apply to it.
fig.update_yaxes(title_text="Average Sentiment")
fig.update_layout(title_text="NRC Lexicon: Emotions over Date")
fig.show()

In [90]:
# Sadness and fear only: mean per-tweet ratio by date.
fig = px.line(nrc_date[nrc_date.emotion.isin (['sadness','fear'])],
              x="created_at_date", y="average_sentiment", color='emotion')
fig.update_xaxes(title_text="Date",range=['2019-09-02','2019-10-06'])
# The y-axis is numeric, so it is left to auto-range; the date range that was
# copied over from the x-axis does not apply to it.
fig.update_yaxes(title_text="Ratio")
fig.update_layout(title_text="Sadness and Fear")
fig.show()

In [93]:
# One per-date frame for each of the eight emotions.
nrc_anger = nrc_date.loc[nrc_date['emotion'].eq('anger')]
nrc_anticipation = nrc_date.loc[nrc_date['emotion'].eq('anticipation')]
nrc_disgust = nrc_date.loc[nrc_date['emotion'].eq('disgust')]
nrc_fear = nrc_date.loc[nrc_date['emotion'].eq('fear')]
nrc_joy = nrc_date.loc[nrc_date['emotion'].eq('joy')]
nrc_sadness = nrc_date.loc[nrc_date['emotion'].eq('sadness')]
nrc_surprise = nrc_date.loc[nrc_date['emotion'].eq('surprise')]
nrc_trust = nrc_date.loc[nrc_date['emotion'].eq('trust')]

In [94]:
#multi-faceted line charts
fig = make_subplots(rows=4, cols=2, subplot_titles=("Anger", "Joy", "Anticipation",
                                                    "Sadness","Disgust","Surprise",
                                                   "Fear","Trust"))

# (emotion, row, col) for each panel, matching the subplot titles above.
emotion_panels = [
    ('anger', 1, 1),
    ('anticipation', 2, 1),
    ('disgust', 3, 1),
    ('fear', 4, 1),
    ('joy', 1, 2),
    ('sadness', 2, 2),
    ('surprise', 3, 2),
    ('trust', 4, 2),
]
for panel_emotion, panel_row, panel_col in emotion_panels:
    # Filter nrc_date here instead of reading the nrc_<emotion> variables: those
    # names are rebound to per-city frames further down the notebook.
    panel_rows = nrc_date[nrc_date.emotion == panel_emotion]
    # add_trace with row/col replaces the deprecated append_trace.
    fig.add_trace(go.Scatter(
        x = panel_rows.created_at_date,
        y = panel_rows.average_sentiment,
        mode = "lines"), row=panel_row, col=panel_col)

# With no row/col given, the same range is applied to every panel.
fig.update_yaxes(range=[0,.1])

fig.update_layout(height=800, width=700,showlegend=False)
fig.show()

In [66]:
# Keep only tweets whose place name appears in the geocode table (inner join on the name).
nrc_city = nrc.merge(geo, how='inner', left_on='place_full_name', right_on='name')
# The result is reduced to the emotion columns and major cities in the next cell.

In [67]:
nrc_city=nrc_city[['place_name','anger', 'anticipation', 'disgust', 'fear', 'joy','sadness', 'surprise', 'trust']]
#keep only the major cities for further visualisations
cities = ['Sydney', 'Melbourne','Brisbane','Canberra','Adelaide','Perth (WA)','Perth']
# Explicit copy so the rename below changes this frame itself; an in-place replace
# on a column of a filtered frame is a chained assignment and may not stick.
nrc_city = nrc_city[nrc_city.place_name.isin(cities)].copy()
nrc_city['place_name'] = nrc_city['place_name'].replace('Perth (WA)','Perth')
nrc_city['place_name'].value_counts()

Sydney       124
Melbourne    111
Brisbane      61
Perth         37
Canberra      22
Adelaide      11
Name: place_name, dtype: int64

In [68]:
# Long format (one row per tweet and emotion), then the mean ratio per city and emotion.
nrc_city = (
    nrc_city
    .melt(id_vars='place_name', var_name='emotion', value_name='average_sentiment')
    .groupby(['place_name', 'emotion'])
    .agg({'average_sentiment': 'mean'})
    .reset_index()
)

In [69]:
# One per-city frame for each emotion. These reuse (and overwrite) the names that
# held the per-date frames earlier in the notebook.
nrc_anger = nrc_city.loc[nrc_city['emotion'].eq('anger')]
nrc_anticipation = nrc_city.loc[nrc_city['emotion'].eq('anticipation')]
nrc_disgust = nrc_city.loc[nrc_city['emotion'].eq('disgust')]
nrc_fear = nrc_city.loc[nrc_city['emotion'].eq('fear')]
nrc_joy = nrc_city.loc[nrc_city['emotion'].eq('joy')]
nrc_sadness = nrc_city.loc[nrc_city['emotion'].eq('sadness')]
nrc_surprise = nrc_city.loc[nrc_city['emotion'].eq('surprise')]
nrc_trust = nrc_city.loc[nrc_city['emotion'].eq('trust')]

In [70]:
# Stacked bars: one trace per emotion, one bar per city.
fig = go.Figure()
for bar_emotion in ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']:
    emotion_rows = nrc_city[nrc_city.emotion == bar_emotion]
    # The x labels come from the same rows as the y values, so each bar is tied to
    # its own city instead of relying on a hand-written list being in the same order.
    fig.add_trace(go.Bar(x=emotion_rows.place_name,
                         y=emotion_rows.average_sentiment,
                         name=bar_emotion.capitalize()))

fig.update_yaxes(title='Ratio')

fig.update_layout(barmode='stack', xaxis={'categoryorder':'category ascending'},
                 title_text='NRC Lexicon: Emotions by City')
fig.show()

In [71]:
# Sadness Map
# Only the sadness column is melted. Melting every column first would push text,
# dates and coordinates into a single object column just to throw them away.
# The resulting rows therefore carry a fresh 0-based index.
sad_map = (
    nrc.merge(geo, how='inner', left_on='place_full_name', right_on='name')
       .melt(id_vars='place_full_name', value_vars=['sadness'],
             var_name='emotion', value_name='average_sentiment')
)
sad_map['average_sentiment']=pd.to_numeric(sad_map['average_sentiment'])
sad_map.head(3)

Unnamed: 0,place_full_name,emotion,average_sentiment
5160,"Cronulla, Sydney",sadness,0.0
5161,"Sydney, New South Wales",sadness,0.0
5162,"Sydney, New South Wales",sadness,0.0


In [72]:
# Mean sadness ratio per place.
sad_map = sad_map.groupby('place_full_name', as_index=False)['average_sentiment'].mean()

In [73]:
# Re-attach latitude/longitude, which the per-place aggregation dropped.
sad_map = sad_map.merge(geo, how='inner', left_on='place_full_name', right_on='name')
sad_map.shape

(81, 5)

In [74]:
# Drop places whose mean sadness ratio is exactly zero.
sad_map = sad_map.loc[sad_map['average_sentiment'].ne(0)]

In [75]:
# Initial centre of the base map.
latitude = -25.734968
longitude = 134.489563
sentiment_map_sad = folium.Map(location=[latitude, longitude], zoom_start=3)
for lat, lon,sentiment,place in zip(sad_map['lat'], sad_map['long'],
                                         sad_map['average_sentiment'],sad_map['place_full_name']):
    folium.CircleMarker(
        [lat, lon],
        # The marker radius is the mean sadness ratio scaled by 50.
        radius=sentiment*50,
        popup = (str(place)+":"+str(sentiment)),
        # 'b' is matplotlib shorthand and not a colour name a browser understands;
        # the outline colour is spelled out instead.
        # NOTE(review): confirm a blue outline is wanted around the red fill.
        color='blue',
        fill_color='red',
        fill=True,
        fill_opacity=0.7
        ).add_to(sentiment_map_sad)
#click on the circle marker to see the location name and the average proportion of sadness words in the tweets
sentiment_map_sad