In [246]:
import pandas as pd
import numpy as np
import re
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [265]:
# Read the input CSV files
pp_data = pd.read_csv('pp_data.csv')
tweet_data = pd.read_csv('tweets.csv')
news_data = pd.read_csv('google_news_headlines.csv')

# Polling means: 'date' is converted to datetime so it can act as the merge key
pp_mean = pd.read_csv('pp_mean.csv')
pp_mean = pp_mean.assign(date=pd.to_datetime(pp_mean['date']))

In [266]:
# ASCII punctuation characters, with the exclamation point left out on purpose
punct_chars = '"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'


def remove_punct(text):
    """Return `text` without the characters in `punct_chars` and without digits.

    Exclamation points survive because `punct_chars` does not contain them.
    """
    kept = []
    for ch in text:
        if ch in punct_chars:
            continue
        kept.append(ch)
    without_punct = "".join(kept)
    # Remove every run of the digits 0-9
    return re.sub('[0-9]+', '', without_punct)

# Strip punctuation and digits from the text column of both corpora
tweet_data['tweet'] = tweet_data['tweet'].apply(remove_punct)
news_data['headline'] = news_data['headline'].apply(remove_punct)

In [267]:
# Remove newline characters from tweets and headlines.
# ('#' needs no separate handling here: it is one of the characters in punct_chars.)
# The pattern is a real newline ("\n") matched literally with regex=False, so the
# result does not depend on the pandas version's default for `regex`.
tweet_data['tweet'] = tweet_data['tweet'].str.replace("\n", "", regex=False)

# Work on the whole headline string: indexing with .str[0] first would reduce
# every headline to its first character.
news_data['headline'] = news_data['headline'].str.replace("\n", "", regex=False)

Tweets

In [268]:
# Preview the first rows of the tweet data
tweet_data.head()

Unnamed: 0,id,date,tweet
0,1377047151558070277,2021-03-30 23:56:39+00:00,How do you think Scott Morrison is going auspo...
1,1377046083403059204,2021-03-30 23:52:25+00:00,It seems he made the same announcement in June...
2,1377045549220696066,2021-03-30 23:50:17+00:00,AFR TLDR “How dare these women journalists get...
3,1377045473970679808,2021-03-30 23:49:59+00:00,I’ve pondered lately and Australians were know...
4,1377044770728538112,2021-03-30 23:47:12+00:00,Im a healthcare worker and I have just had my ...


In [269]:
# Plain Python list of the tweet texts, in row order
tweet_texts = tweet_data['tweet']
tweets_list = tweet_texts.tolist()

# Example tweet
tweets_list[150]

'LetUsBackToAus JulianHillMP ScottMorrisonMP AlanTudgeMP Plz allow us back  aus plz 🇦🇺 when wil u allow us backsafe to do so whenwhen wil ur safe comeafter every  days u will do lockdownwhen it wil finishgives us date nd timeits an humble request 🙏😓'

In [270]:
# Score every tweet with VADER.
# https://www.nltk.org/api/nltk.sentiment.html - there is a publication about vader
# compound is normalised between -1 and 1 with -1 being very negative
#
# The analyzer is created once and reused for all tweets; constructing a new
# SentimentIntensityAnalyzer per tweet repeats the same setup on every iteration.
analyzer = SentimentIntensityAnalyzer()
tweet_scores = [analyzer.polarity_scores(text) for text in tweets_list]

neg_tweets = [score['neg'] for score in tweet_scores]
comp_tweets = [score['compound'] for score in tweet_scores]

# Add negative and comp scores to tweets df
tweets = pd.DataFrame({
    'date': tweet_data['date'],
    'tweet': tweet_data['tweet'],
    'neg': neg_tweets,
    'comp': comp_tweets,
})

In [99]:
# Derive year and month from the tweet dates, then combine them into a
# 'month-year' string that is used as the grouping key
tweet_dates = pd.DatetimeIndex(tweets['date'])
tweets['year'] = tweet_dates.year
tweets['month'] = tweet_dates.month

month_str = tweets['month'].astype(str)
year_str = tweets['year'].astype(str)
tweets['month_year'] = month_str + '-' + year_str

In [129]:
def _tweet_neg_count(neg_scores):
    """Number of entries in `neg_scores` that are not equal to zero."""
    return (neg_scores != 0).sum()


def _tweet_neg_share(neg_scores):
    """Non-zero entries divided by the number of non-null entries."""
    return (neg_scores != 0).sum() / neg_scores.count()


# Per month: mean compound score, share and number of tweets with a non-zero
# negative score, and the total number of tweets
tweet_summary = tweets.groupby('month_year').agg(
    tweets_mean_comp=('comp', 'mean'),
    tweets_prop_neg=('neg', _tweet_neg_share),
    tweets_num_total=('neg', 'count'),
    tweets_num_neg=('neg', _tweet_neg_count),
)

In [146]:
# Move the 'month_year' group key back into a column, parse it into a datetime
# 'date' column for merging, then drop the string version.
# month_year is built as '<month>-<year>' (e.g. '6-2019'); the explicit format
# pins that layout instead of leaving the parse to format inference.
tweet_summary = (
    tweet_summary
    .reset_index()
    .assign(date=lambda df: pd.to_datetime(df['month_year'], format='%m-%Y'))
    .drop(columns='month_year')
)

In [147]:
# Save the monthly tweet summary to CSV, without the row index
tweet_summary.to_csv('tweet_sentiment_by_month.csv', index=False)

News Headlines

In [179]:
# Preview the first rows of the headline data
news_data.head()

Unnamed: 0,date,headline
0,2019-05-18,Election 2019: Scott Morrison says 'I have alw...
1,2020-01-02,PM Scott Morrison defends climate policies and...
2,2020-01-02,Australian PM Scott Morrison angers residents
3,2020-01-02,Prime Minister Scott Morrison confronted by an...
4,2020-07-30,Prime Minister Scott Morrison 'very concerned'...


In [180]:
# make headlines a list
news_list = news_data['headline'].tolist()

# Example headline
news_list[150]

'Scott Morrison rejects calls for independent inquiry into rape allegation against Christian Porter'

In [183]:
# Score every headline with VADER.
# The analyzer is created once and reused for all headlines; constructing a new
# SentimentIntensityAnalyzer per headline repeats the same setup on every iteration.
analyzer = SentimentIntensityAnalyzer()
headline_scores = [analyzer.polarity_scores(text) for text in news_list]

neg_headlines = [score['neg'] for score in headline_scores]
comp_headlines = [score['compound'] for score in headline_scores]

# Add negative and comp scores to news df
news = pd.DataFrame({
    'date': news_data['date'],
    'headline': news_data['headline'],
    'neg': neg_headlines,
    'comp': comp_headlines,
})

In [184]:
# Preview the first rows of the scored headlines
news.head()

Unnamed: 0,date,headline,neg,comp
0,2019-05-18,Election 2019: Scott Morrison says 'I have alw...,0.0,0.0
1,2020-01-02,PM Scott Morrison defends climate policies and...,0.0,0.0
2,2020-01-02,Australian PM Scott Morrison angers residents,0.398,-0.5106
3,2020-01-02,Prime Minister Scott Morrison confronted by an...,0.317,-0.6249
4,2020-07-30,Prime Minister Scott Morrison 'very concerned'...,0.0,0.0


In [186]:
# get month and year from date to group by
news['year'] = pd.DatetimeIndex(news['date']).year
news['month'] = pd.DatetimeIndex(news['date']).month

news['month_year'] = news['month'].astype(str) + '-' + news['year'].astype(str)

In [187]:
# want mean sentiment and proportion of negative for each month
news_summary = news.groupby('month_year').agg(
    headlines_mean_comp = ('comp', 'mean'),
    headlines_prop_neg = ('neg', lambda x: (x != 0).sum() / x.count()),
    headlines_num_total = ('neg', 'count'),
    headlines_num_neg = ('neg', lambda x: (x != 0).sum())
)

In [189]:
# Move the 'month_year' group key back into a column, parse it into a datetime
# 'date' column for merging, then drop the string version.
# month_year is built as '<month>-<year>' (e.g. '6-2019'); the explicit format
# pins that layout instead of leaving the parse to format inference.
news_summary = (
    news_summary
    .reset_index()
    .assign(date=lambda df: pd.to_datetime(df['month_year'], format='%m-%Y'))
    .drop(columns='month_year')
)

Now join tweets and headlines to polling data for modelling

In [196]:
# Left-join the monthly tweet and headline summaries onto the polling means by
# 'date', so every polling row is kept
data = (
    pp_mean
    .merge(tweet_summary, how='left', on='date')
    .merge(news_summary, how='left', on='date')
)

In [197]:
# Row counts before and after the joins, one per line
print(len(pp_mean), len(data), sep='\n')

22
22


In [198]:
# Write the merged table to CSV (row index omitted), then preview its first rows
data.to_csv('modelling_dataset.csv', index=False)
data.head()

Unnamed: 0,date,size,mor_sat,mor_dis,tweets_mean_comp,tweets_prop_neg,tweets_num_total,tweets_num_neg,headlines_mean_comp,headlines_prop_neg,headlines_num_total,headlines_num_neg
0,2019-06-01,,48.0,36.0,0.16921,0.515057,4815,2480.0,0.0,0.0,1,0.0
1,2019-07-01,,49.5,35.0,0.328865,0.638125,7019,4479.0,0.0103,1.0,2,2.0
2,2019-08-01,,48.0,39.5,0.314253,0.659883,7862,5188.0,-0.3389,1.0,2,2.0
3,2019-09-01,,48.333333,39.333333,0.146324,0.568913,8801,5007.0,-0.171733,0.666667,3,2.0
4,2019-10-01,,47.0,41.5,0.056276,0.482146,5629,2714.0,-0.05305,0.5,2,1.0
