In [1]:
# utilities
import re
import pickle
import numpy as np
import pandas as pd

# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# nltk
from nltk.stem import WordNetLemmatizer

In [3]:
# Importing the dataset

dataset=pd.read_csv("/Users/zhaomengshan/Desktop/sample.csv",encoding = "ISO-8859-1", engine='c')


In [5]:
for col in dataset.columns: 
    print(col) 

coordinates
created_at
hashtags
media
urls
favorite_count
id
in_reply_to_screen_name
in_reply_to_status_id
in_reply_to_user_id
lang
place
possibly_sensitive
retweet_count
retweet_id
retweet_screen_name
source
text
tweet_url
user_created_at
user_screen_name
user_default_profile_image
user_description
user_favourites_count
user_followers_count
user_friends_count
user_listed_count
user_location
user_name
user_screen_name.1
user_statuses_count
user_time_zone
user_urls
user_verified


In [None]:
dataset['lang'].describe()

In [13]:
# Only select the English Tweets
eng = dataset[(dataset["lang"]=="en")]

In [15]:
# Defining dictionary containing all emojis with their meanings.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

# Defining set containing all stopwords in english.
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from', 
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're',
             's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

In [17]:
def preprocess(textdata):
    processedText = []
    
    # Create Lemmatizer and Stemmer.
    wordLemm = WordNetLemmatizer()
    
    # Defining regex patterns.
    urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern       = '@[^\s]+'
    alphaPattern      = "[^a-zA-Z0-9]"
    sequencePattern   = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"
    
    for tweet in textdata:
        tweet = tweet.lower()
        
        # Replace all URls with 'URL'
        tweet = re.sub(urlPattern,' URL',tweet)
        # Replace all emojis.
        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])        
        # Replace @USERNAME to 'USER'.
        tweet = re.sub(userPattern,' USER', tweet)        
        # Replace all non alphabets.
        tweet = re.sub(alphaPattern, " ", tweet)
        # Replace 3 or more consecutive letters by 2 letter.
        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

        tweetwords = ''
        for word in tweet.split():
            # Checking if the word is a stopword.
            #if word not in stopwordlist:
            if len(word)>1:
                # Lemmatizing the word.
                word = wordLemm.lemmatize(word)
                tweetwords += (word+' ')
            
        processedText.append(tweetwords)
        
    return processedText

In [21]:
import nltk
nltk.download('wordnet')
import time

text = list(eng['text'])

t = time.time()
processedtext = preprocess(text)
print(f'Text Preprocessing complete.')
print(f'Time Taken: {round(time.time()-t)} seconds')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhaomengshan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Text Preprocessing complete.
Time Taken: 71 seconds


In [64]:
text[1]

'RT @business: JUST IN: U.S. CDC recommends events of 50 people or more to be delayed for about 2 months  https://t.co/M5j5yKUygU'

In [71]:
# Calculate the Sentiment Score

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def calculate_sentiment_scores(sentence):
    sntmnt = analyzer.polarity_scores(sentence)['compound']
    return(sntmnt)


score = calculate_sentiment_scores("today is bad.")

In [73]:
score

-0.5423

In [75]:
covid_snt_score = []

for comment in text:
    snts_score = calculate_sentiment_scores(comment)
    covid_snt_score.append(snts_score)

In [77]:
covid_snt_score

[-0.25,
 -0.0679,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.25,
 0.3818,
 0.0,
 0.8834,
 0.0,
 0.5574,
 0.5719,
 0.5255,
 -0.2023,
 0.0,
 -0.6447,
 0.5526,
 0.2263,
 0.0,
 0.0258,
 0.4404,
 0.0,
 0.0,
 0.0,
 0.5106,
 0.128,
 0.2235,
 -0.0258,
 0.3612,
 -0.6249,
 0.6124,
 -0.6249,
 0.8398,
 -0.3818,
 0.5574,
 -0.6249,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.3612,
 -0.34,
 0.0516,
 -0.5574,
 -0.8555,
 -0.6705,
 0.7783,
 -0.5106,
 0.6249,
 0.0,
 -0.0516,
 -0.2808,
 0.0,
 0.0,
 -0.296,
 0.3182,
 0.0,
 0.0,
 -0.2732,
 -0.1531,
 0.0,
 0.0258,
 -0.3716,
 0.0,
 0.0,
 0.4019,
 0.0,
 -0.4329,
 -0.34,
 0.3182,
 -0.7798,
 -0.1419,
 0.0,
 -0.0772,
 0.0,
 0.0,
 0.0,
 -0.6786,
 0.0,
 0.0,
 -0.25,
 0.3254,
 0.6757,
 0.0,
 -0.1833,
 0.5023,
 0.0,
 -0.34,
 0.4588,
 0.0,
 0.0,
 0.816,
 0.0,
 0.3802,
 -0.5267,
 0.0,
 0.0,
 -0.2263,
 -0.34,
 0.3612,
 0.0,
 0.4019,
 0.5859,
 0.0,
 0.0,
 0.0,
 0.7351,
 -0.6872,
 0.0,
 -0.6124,
 0.7351,
 0.0,
 -0.5423,
 0.4019,
 -0.2481,
 0.0,
 -0.128,
 -0.5267,
 -0.3612,
 0.6486,
 0.5334,
 -0.296,
 0

In [78]:
eng['sentiment_score'] = np.array(covid_snt_score)
eng.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,...,user_friends_count,user_listed_count,user_location,user_name,user_screen_name.1,user_statuses_count,user_time_zone,user_urls,user_verified,sentiment_score
0,,Mon Mar 16 00:00:01 +0000 2020,,,,0,1239340608478740481,,,,...,359,10,,Christine MacDonald,chrismac2562,183377,,,False,-0.25
1,,Mon Mar 16 00:00:00 +0000 2020,,,https://trib.al/HPjfbo4,0,1239340604913356800,,,,...,1803,598,"California, USA",Olga Sixta,OlgaSixta,163699,,,False,-0.0679
3,,Mon Mar 16 00:00:01 +0000 2020,COVIDã¼19,,https://twitter.com/JasonYanowitz/status/12389...,0,1239340608856240128,,,,...,362,49,Malaysia/ Asia/Europe/USA,Francissca Peter,FranticKL,66760,,https://www.facebook.com/FrancisscaPeterOfficial/,False,0.0
4,,Mon Mar 16 00:00:00 +0000 2020,,,,0,1239340604753981440,,,,...,842,10,,Black Lives Matter,HufflepuffOdair,60211,,,False,0.0
5,,Mon Mar 16 00:00:00 +0000 2020,,,,0,1239340604804521984,,,,...,3170,5,England,Ronnie Robinson,Goodisongreats,31494,,,False,0.0


In [83]:
# Change the score to sentiment
i = 0

vader_sentiment = [ ]

while(i<len(eng)):
    if ((eng.iloc[i]['sentiment_score'] >= 0.05)):
        vader_sentiment.append('positive')
        i = i+1
    elif ((eng.iloc[i]['sentiment_score'] > -0.05) & (eng.iloc[i]['sentiment_score'] < 0.05)):
        vader_sentiment.append('neutral')
        i = i+1
    elif ((eng.iloc[i]['sentiment_score'] <= -0.05)):
        vader_sentiment.append('negative')
        i = i+1


In [85]:
eng['vader_sentiment_labels'] = vader_sentiment

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [91]:
# Output the result
eng.to_csv(r'/Users/zhaomengshan/Desktop/sentiment.csv', index = False)

In [89]:
# Calculate the Percentage
percentage= eng.groupby(['created_at'])['vader_sentiment_labels'].describe()

In [90]:
percentage

Unnamed: 0_level_0,count,unique,top,freq
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri Apr 03 00:01:03 +0000 2020,31,3,positive,15
Fri Apr 03 00:01:04 +0000 2020,24,3,negative,11
Fri Apr 03 00:01:05 +0000 2020,28,3,neutral,12
Fri Apr 03 00:01:06 +0000 2020,33,3,positive,12
Fri Apr 03 00:01:07 +0000 2020,32,3,positive,11
...,...,...,...,...
Wed Mar 25 00:46:20 +0000 2020,23,3,positive,8
Wed Mar 25 00:46:21 +0000 2020,25,3,negative,10
Wed Mar 25 00:46:22 +0000 2020,24,3,positive,11
Wed Mar 25 00:46:23 +0000 2020,19,3,positive,8
