In [38]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [39]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [40]:
#read in only the Tweet and Sentiment column from csv
df = pd.read_csv('fifa_world_cup_2022_tweets.csv', usecols=['Tweet','Sentiment'])
df

Unnamed: 0,Tweet,Sentiment
0,What are we drinking today @TucanTribe \n@MadB...,neutral
1,Amazing @CanadaSoccerEN #WorldCup2022 launch ...,positive
2,Worth reading while watching #WorldCup2022 htt...,positive
3,Golden Maknae shinning bright\n\nhttps://t.co/...,positive
4,"If the BBC cares so much about human rights, h...",negative
...,...,...
22519,Here We go World cup 2022 #WorldCup2022,positive
22520,Anderlecht confirms former Viborg FF's Jesper ...,neutral
22521,Great thread to read before the start of #Worl...,positive
22522,Raphinha wants Brazil to be united at the #Wor...,positive


In [41]:
#change all tweets to lower case
df['Tweet Lower'] = df['Tweet'].str.lower()
df

Unnamed: 0,Tweet,Sentiment,Tweet Lower
0,What are we drinking today @TucanTribe \n@MadB...,neutral,what are we drinking today @tucantribe \n@madb...
1,Amazing @CanadaSoccerEN #WorldCup2022 launch ...,positive,amazing @canadasocceren #worldcup2022 launch ...
2,Worth reading while watching #WorldCup2022 htt...,positive,worth reading while watching #worldcup2022 htt...
3,Golden Maknae shinning bright\n\nhttps://t.co/...,positive,golden maknae shinning bright\n\nhttps://t.co/...
4,"If the BBC cares so much about human rights, h...",negative,"if the bbc cares so much about human rights, h..."
...,...,...,...
22519,Here We go World cup 2022 #WorldCup2022,positive,here we go world cup 2022 #worldcup2022
22520,Anderlecht confirms former Viborg FF's Jesper ...,neutral,anderlecht confirms former viborg ff's jesper ...
22521,Great thread to read before the start of #Worl...,positive,great thread to read before the start of #worl...
22522,Raphinha wants Brazil to be united at the #Wor...,positive,raphinha wants brazil to be united at the #wor...


In [42]:
#remove all URLs and whitespace in tweet lower column
df['Tweet Clean'] = df['Tweet Lower'].str.replace(r'https?:\/\/[^\s]*', '', regex=True,flags=re.MULTILINE).str.strip()

df

Unnamed: 0,Tweet,Sentiment,Tweet Lower,Tweet Clean
0,What are we drinking today @TucanTribe \n@MadB...,neutral,what are we drinking today @tucantribe \n@madb...,what are we drinking today @tucantribe \n@madb...
1,Amazing @CanadaSoccerEN #WorldCup2022 launch ...,positive,amazing @canadasocceren #worldcup2022 launch ...,amazing @canadasocceren #worldcup2022 launch ...
2,Worth reading while watching #WorldCup2022 htt...,positive,worth reading while watching #worldcup2022 htt...,worth reading while watching #worldcup2022
3,Golden Maknae shinning bright\n\nhttps://t.co/...,positive,golden maknae shinning bright\n\nhttps://t.co/...,golden maknae shinning bright\n\n\n#jeonjungko...
4,"If the BBC cares so much about human rights, h...",negative,"if the bbc cares so much about human rights, h...","if the bbc cares so much about human rights, h..."
...,...,...,...,...
22519,Here We go World cup 2022 #WorldCup2022,positive,here we go world cup 2022 #worldcup2022,here we go world cup 2022 #worldcup2022
22520,Anderlecht confirms former Viborg FF's Jesper ...,neutral,anderlecht confirms former viborg ff's jesper ...,anderlecht confirms former viborg ff's jesper ...
22521,Great thread to read before the start of #Worl...,positive,great thread to read before the start of #worl...,great thread to read before the start of #worl...
22522,Raphinha wants Brazil to be united at the #Wor...,positive,raphinha wants brazil to be united at the #wor...,raphinha wants brazil to be united at the #wor...


In [43]:
#verify URLS have been removed
print(df.iloc[0]['Tweet Clean'])

what are we drinking today @tucantribe 
@madbears_ 
@lkinc_algo 
@al_goanna 

#worldcup2022


In [44]:
#remove all username mentions from Tweet Clean
df['Tweet Clean'] = df['Tweet Clean'].str.replace(r'@[\w]*', '', regex=True,flags=re.MULTILINE).str.strip()

df

Unnamed: 0,Tweet,Sentiment,Tweet Lower,Tweet Clean
0,What are we drinking today @TucanTribe \n@MadB...,neutral,what are we drinking today @tucantribe \n@madb...,what are we drinking today \n \n \n \n\n#worl...
1,Amazing @CanadaSoccerEN #WorldCup2022 launch ...,positive,amazing @canadasocceren #worldcup2022 launch ...,amazing #worldcup2022 launch video. shows ho...
2,Worth reading while watching #WorldCup2022 htt...,positive,worth reading while watching #worldcup2022 htt...,worth reading while watching #worldcup2022
3,Golden Maknae shinning bright\n\nhttps://t.co/...,positive,golden maknae shinning bright\n\nhttps://t.co/...,golden maknae shinning bright\n\n\n#jeonjungko...
4,"If the BBC cares so much about human rights, h...",negative,"if the bbc cares so much about human rights, h...","if the bbc cares so much about human rights, h..."
...,...,...,...,...
22519,Here We go World cup 2022 #WorldCup2022,positive,here we go world cup 2022 #worldcup2022,here we go world cup 2022 #worldcup2022
22520,Anderlecht confirms former Viborg FF's Jesper ...,neutral,anderlecht confirms former viborg ff's jesper ...,anderlecht confirms former viborg ff's jesper ...
22521,Great thread to read before the start of #Worl...,positive,great thread to read before the start of #worl...,great thread to read before the start of #worl...
22522,Raphinha wants Brazil to be united at the #Wor...,positive,raphinha wants brazil to be united at the #wor...,raphinha wants brazil to be united at the #wor...


In [45]:
#verify mentions have been removed
print(df.iloc[0]['Tweet Clean'])

what are we drinking today  
 
 
 

#worldcup2022


In [46]:
#remove hastags from Tweet Clean
df['Tweet Clean'] = df['Tweet Clean'].str.replace(r'#[\w]*', '', regex=True,flags=re.MULTILINE).str.strip()

df

Unnamed: 0,Tweet,Sentiment,Tweet Lower,Tweet Clean
0,What are we drinking today @TucanTribe \n@MadB...,neutral,what are we drinking today @tucantribe \n@madb...,what are we drinking today
1,Amazing @CanadaSoccerEN #WorldCup2022 launch ...,positive,amazing @canadasocceren #worldcup2022 launch ...,amazing launch video. shows how much the fa...
2,Worth reading while watching #WorldCup2022 htt...,positive,worth reading while watching #worldcup2022 htt...,worth reading while watching
3,Golden Maknae shinning bright\n\nhttps://t.co/...,positive,golden maknae shinning bright\n\nhttps://t.co/...,golden maknae shinning bright
4,"If the BBC cares so much about human rights, h...",negative,"if the bbc cares so much about human rights, h...","if the bbc cares so much about human rights, h..."
...,...,...,...,...
22519,Here We go World cup 2022 #WorldCup2022,positive,here we go world cup 2022 #worldcup2022,here we go world cup 2022
22520,Anderlecht confirms former Viborg FF's Jesper ...,neutral,anderlecht confirms former viborg ff's jesper ...,anderlecht confirms former viborg ff's jesper ...
22521,Great thread to read before the start of #Worl...,positive,great thread to read before the start of #worl...,great thread to read before the start of
22522,Raphinha wants Brazil to be united at the #Wor...,positive,raphinha wants brazil to be united at the #wor...,raphinha wants brazil to be united at the 👊🇧🇷


In [47]:
#verify mentions have been removed
print(df.iloc[0]['Tweet Clean'])

what are we drinking today


In [48]:
#Create the list tokenized_words by applying the function word_tokenize to the values of the column Tweet Clean
tokenized_tweets = [word_tokenize(x) for x in df['Tweet Clean']]
tokenized_tweets[:5]

[['what', 'are', 'we', 'drinking', 'today'],
 ['amazing',
  'launch',
  'video',
  '.',
  'shows',
  'how',
  'much',
  'the',
  'face',
  'of',
  'canada',
  'and',
  'our',
  'men',
  '’',
  's',
  'national',
  'team',
  'have',
  'changed',
  'since',
  'our',
  'last',
  'world',
  'cup',
  'entry',
  'in',
  '1986.',
  'can',
  '’',
  't',
  'wait',
  'to',
  'see',
  'these',
  'boys',
  'in',
  'action',
  '!',
  'this',
  'is',
  'canada',
  ':',
  'fifa',
  'world',
  'cup',
  'opening',
  'video'],
 ['worth', 'reading', 'while', 'watching'],
 ['golden', 'maknae', 'shinning', 'bright'],
 ['if',
  'the',
  'bbc',
  'cares',
  'so',
  'much',
  'about',
  'human',
  'rights',
  ',',
  'homosexual',
  'rights',
  ',',
  'and',
  'women',
  'rights',
  'then',
  'why',
  'not',
  'say',
  'these',
  'before',
  'the',
  'opening',
  'ceremony',
  '?',
  '?',
  'why',
  'are',
  'they',
  'saying',
  'these',
  'during',
  'the',
  'opening',
  'ceremony',
  '?',
  '?',
  'why',
 

In [49]:
#filter stop words
nltk.download('stopwords')
stop_words_english = set(stopwords.words('english'))
filtered_tokenized_tweets = [
    [word for word in tweet if word not in stop_words_english]
    for tweet in tokenized_tweets
]
filtered_tokenized_tweets[:5]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[['drinking', 'today'],
 ['amazing',
  'launch',
  'video',
  '.',
  'shows',
  'much',
  'face',
  'canada',
  'men',
  '’',
  'national',
  'team',
  'changed',
  'since',
  'last',
  'world',
  'cup',
  'entry',
  '1986.',
  '’',
  'wait',
  'see',
  'boys',
  'action',
  '!',
  'canada',
  ':',
  'fifa',
  'world',
  'cup',
  'opening',
  'video'],
 ['worth', 'reading', 'watching'],
 ['golden', 'maknae', 'shinning', 'bright'],
 ['bbc',
  'cares',
  'much',
  'human',
  'rights',
  ',',
  'homosexual',
  'rights',
  ',',
  'women',
  'rights',
  'say',
  'opening',
  'ceremony',
  '?',
  '?',
  'saying',
  'opening',
  'ceremony',
  '?',
  '?',
  'bbc',
  'censor',
  'opening',
  'ceremony',
  '?',
  '?']]

In [50]:
#Performing Sentiment Analysis
#Testing and downloading Vader
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores("DataWars is awesome! I love it so much!")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


{'neg': 0.0, 'neu': 0.36, 'pos': 0.64, 'compound': 0.8715}

In [51]:
cleaned_tweets = [' '.join(tweet) for tweet in filtered_tokenized_tweets]
cleaned_tweets

['drinking today',
 'amazing launch video . shows much face canada men ’ national team changed since last world cup entry 1986. ’ wait see boys action ! canada : fifa world cup opening video',
 'worth reading watching',
 'golden maknae shinning bright',
 'bbc cares much human rights , homosexual rights , women rights say opening ceremony ? ? saying opening ceremony ? ? bbc censor opening ceremony ? ?',
 "like , mexican fans able scream `` puto '' ? homophobic qatar ?",
 'look like jamaican football team naw follow worldcup',
 'really ? football monday morning 9 12 3 ? need pinch . really happening ?',
 'world cup starts qatar , ’ black awareness day brazil✊🏽 . despite atrocities linked year ’ host fifa , soccer fundamental lower classes , mostly black brown , vini jr identifies',
 '& amp ; go together well ‘ hungry love ’ \U0001faf6 ⚽️👌',
 'tried help useless soccer team 5 minutes inaugural game , denying undoubtedly perfectly good goal world cup host',
 'happy night $ pfl',
 'let ’ go

In [52]:
#Apply Vader to all words
tweet_sentiment_scores = [analyzer.polarity_scores(x)
 for x in cleaned_tweets]

In [53]:
tweet_sentiment_scores

[{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.0, 'neu': 0.864, 'pos': 0.136, 'compound': 0.6239},
 {'neg': 0.0, 'neu': 0.513, 'pos': 0.487, 'compound': 0.2263},
 {'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404},
 {'neg': 0.13, 'neu': 0.739, 'pos': 0.13, 'compound': 0.0},
 {'neg': 0.226, 'neu': 0.59, 'pos': 0.184, 'compound': -0.1431},
 {'neg': 0.0, 'neu': 0.737, 'pos': 0.263, 'compound': 0.3612},
 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.087, 'neu': 0.913, 'pos': 0.0, 'compound': -0.296},
 {'neg': 0.0, 'neu': 0.442, 'pos': 0.558, 'compound': 0.743},
 {'neg': 0.2, 'neu': 0.423, 'pos': 0.377, 'compound': 0.6808},
 {'neg': 0.0, 'neu': 0.351, 'pos': 0.649, 'compound': 0.5719},
 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.231, 'neu': 0.769, 'pos': 0.0, 'compound': -0.5994},
 {'neg': 0.0, 'neu': 0.301, 'pos': 0.699, 'compound': 0.8519},
 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0},
 {'neg': 0.176, 'neu'

In [54]:
#Calculate the sentiment of each tweet based on the following rule analyzer.polarity_scores compound
def get_sentiment(res):
    comp = res['compound']
    if comp > .05:
        return "positive"
    elif comp < -.05:
        return "negative"
    return "neutral"

tweet_sentiment_results = [get_sentiment(score) for score in tweet_sentiment_scores]
tweet_sentiment_results[:5]

['neutral', 'positive', 'positive', 'positive', 'neutral']

In [55]:
#drop Tweet Lower and Tweet Clean, store results of tweet_sentiment_analysis in new column
df.drop(columns=['Tweet Lower','Tweet Clean'], inplace=True)
df = pd.concat([df, pd.Series(tweet_sentiment_results, name='Calculated Sentiment')], axis=1)
df

Unnamed: 0,Tweet,Sentiment,Calculated Sentiment
0,What are we drinking today @TucanTribe \n@MadB...,neutral,neutral
1,Amazing @CanadaSoccerEN #WorldCup2022 launch ...,positive,positive
2,Worth reading while watching #WorldCup2022 htt...,positive,positive
3,Golden Maknae shinning bright\n\nhttps://t.co/...,positive,positive
4,"If the BBC cares so much about human rights, h...",negative,neutral
...,...,...,...
22519,Here We go World cup 2022 #WorldCup2022,positive,neutral
22520,Anderlecht confirms former Viborg FF's Jesper ...,neutral,positive
22521,Great thread to read before the start of #Worl...,positive,positive
22522,Raphinha wants Brazil to be united at the #Wor...,positive,positive


In [56]:
#How many tweets were incorrectly classified
(df['Sentiment'] != df['Calculated Sentiment']).sum()

9508

In [57]:
#How many negative tweets were incorrectly classified
((df['Sentiment'] == 'negative') & (df['Calculated Sentiment'] != 'negative')).sum()


3026