### **The Traitors UK Analysis**

In [None]:
!pip3 -q install snscrape

In [None]:
!pip install neattext

In [None]:
#Import Libraries
import snscrape.modules.twitter as sntwitter
import pandas as pd
import numpy as np
import textblob
from textblob import TextBlob
import nltk as nlp
nlp.download('all')
nlp.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import re
import requests
import string
import neattext as nt
import neattext.functions as nfx
import matplotlib.pyplot as plt
from collections import Counter




**Data Gathering**

In [None]:
#Build the search scraper; the query string carries the hashtag plus the
#lang / until / since filters
scraper = sntwitter.TwitterSearchScraper("#TheTraitorsUK lang:en until:2022-12-30 since:2022-11-03")

# Column labels, listed in the same order as the fields collected per tweet
tweet_columns = ['tweet_id','date','TweetText','location','likes','retweets']

tweets = []
for i, tweet in enumerate(scraper.get_items()):
  tweets.append([
      tweet.id,
      tweet.date,
      tweet.content,
      tweet.user.location,
      tweet.likeCount,
      tweet.retweetCount,
  ])
  # Stop right after the item at index 11 has been stored (12 rows at most).
  # NOTE(review): the shape output recorded further down shows 5152 rows, so
  # the saved results were presumably produced with a different cap - confirm.
  if i > 10:
    break

#Create DataFrame
tweet_df = pd.DataFrame(tweets, columns=tweet_columns)

tweet_df.head()

In [None]:
#Check data shape: (number of rows, number of columns)
tweet_df.shape

(5152, 6)

### Data Exploration

In [None]:
# Hashtag extraction helper
def getHashtags(tweet):
    """Return every hashtag in `tweet`, lowercased and joined by single spaces.

    A hashtag is a '#' followed by one or more word characters. The result is
    an empty string when the tweet contains none.
    """
    return " ".join(re.findall(r'\#\w+', tweet.lower()))

# Store each tweet's hashtags (one space-separated string per row) in a new
# 'Hashtags' column, then display the frame
tweet_df['Hashtags'] = tweet_df['TweetText'].apply(getHashtags)
tweet_df

In [None]:
#Get the count of hashtags
hashtagPattern = re.compile(r'#(\w+)')

# Flatten the per-tweet "#a #b" strings into one list of individual hashtags
hashtags_list = tweet_df['Hashtags'].tolist()
hashtags = [tag for entry in hashtags_list for tag in entry.split()]

# Keep only the text that follows each '#'
words = []
for ht in hashtags:
    words.extend(hashtagPattern.findall(ht))

# Tally the tags and show them from most to least frequent
counted = Counter(words)
hashtags_df = pd.DataFrame.from_dict(counted, orient='index').reset_index()
hashtags_df.columns = ['Hashtags', 'Count']
hashtags_df = hashtags_df.sort_values(by='Count', ascending=False)
hashtags_df

In [None]:
#Extract Contestants
# Lowercase names that getcast() looks for among a tweet's tokens.
# 'wilf' and 'wilfred' are listed separately (presumably two spellings of the
# same contestant - confirm), so they are also counted separately.
cast =['amos', 'maddy', 'fay', 'ivan', 'john', 'theo', 'kieran', 'andrea','wilf','wilfred', 
          'meryl', 'alyssa', 'tom', 'aisha', 'imran', 'alex', 'claire', 'nicky', 'matt', 'amanda', 'rayan','hannah', 'aaron']



def getcast(tweet):
    """Return the cast names found in `tweet`, space-separated.

    The tweet is lowercased and tokenized; each token that appears in the
    module-level `cast` list is kept, in order of appearance and with repeats.
    """
    mentioned = []
    for token in word_tokenize(tweet.lower()):
        if token in cast:
            mentioned.append(token)
    return " ".join(mentioned)

# Store the cast names mentioned in each tweet (one space-separated string per
# row) in a new 'Traitors_Cast' column, then display the frame
tweet_df['Traitors_Cast'] = tweet_df['TweetText'].apply(getcast)
tweet_df

In [None]:
#Count Cast member references
cast_list = tweet_df['Traitors_Cast'].tolist()

# Flatten the per-tweet strings into one list of individual mentions.
# This list gets its own name: storing it in `cast` would replace the roster
# that getcast() checks tokens against, so re-running the extraction cell
# afterwards would match against the mentions instead of the roster.
cast_mentions = [name for entry in cast_list for name in entry.split()]

# Tally the mentions and show the ten most referenced cast members
counts = Counter(cast_mentions)
cast_df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
cast_df.columns = ['Traitors_Cast', 'Count']
cast_df.sort_values(by='Count', ascending=False, inplace=True)
cast_df.head(10)

**Data Cleaning**


In [None]:
#Data Cleaning
def cleanTweets(twt):
  """Normalise a raw tweet to lowercase words separated by single spaces.

  Steps, in order: drop a leading "RT" retweet marker, lowercase, remove
  hyperlinks, remove @mentions and #hashtags, delete apostrophes, replace
  every remaining non-letter with a space, then collapse whitespace.

  Args:
    twt: raw tweet text.

  Returns:
    The cleaned text; an empty string when nothing but noise was present.
  """
  # The marker is matched before lowercasing and only at the start of the
  # text, so words that merely contain "rt" are left alone.
  twt = re.sub(r'^\s*RT\b[\s:]*', '', twt)
  twt = twt.lower()
  twt = re.sub(r'http\S+|www\S+', ' ', twt) #Remove hyperlinks
  twt = re.sub(r'[@#]\w+', ' ', twt) #Remove @mentions and hashtags
  # Apostrophes are deleted rather than spaced out so "don't" stays one word.
  twt = re.sub(r"['\u2019]", '', twt)
  # Digits, punctuation, emoji and newlines become spaces; substituting an
  # empty string here would glue neighbouring words together.
  twt = re.sub(r'[^a-z\s]', ' ', twt)
  twt = re.sub(r'\s+', ' ', twt).strip() #Collapse and trim whitespace
  return twt

In [None]:
#Remove Emojis
def strip_emoji(twt):
    """Return `twt` with characters from three Unicode ranges deleted.

    The ranges are U+2600-U+27BF, U+1F300-U+1F64F and U+1F680-U+1F6FF;
    characters outside them are left untouched.
    """
    emoji_pattern = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    return emoji_pattern.sub('', twt)

In [None]:
#Create Column for Cleaned Tweets
# NOTE(review): both assignments read from 'TweetText', so the second one
# replaces the cleanTweets() output and 'Cleaned_Tweets' ends up holding only
# the emoji-stripped raw text. If both steps are meant to apply, the second
# call should probably start from 'Cleaned_Tweets' - confirm before changing,
# since the later processing cells consume this column.
tweet_df['Cleaned_Tweets'] = tweet_df['TweetText'].apply(cleanTweets)
tweet_df['Cleaned_Tweets'] = tweet_df['TweetText'].apply(strip_emoji)
tweet_df

In [None]:
#Noise scan
# For each cleaned tweet, wrap it in a neattext TextFrame and show the
# 'text_noise' entry of its noise_scan() result (displayed only, not stored)
tweet_df['Cleaned_Tweets'].apply(lambda x: nt.TextFrame(x).noise_scan()['text_noise'])

In [None]:
#Extract Stopwords
# For each cleaned tweet, show what neattext's TextExtractor returns from
# extract_stopwords(lang='en'); the result is displayed only, not stored
tweet_df['Cleaned_Tweets'].apply(lambda x: nt.TextExtractor(x).extract_stopwords(lang='en'))

In [None]:
#List Stopwords
# Hand-maintained stopword list used by processedTweets(): common English
# function words, a few fragments ('co', 's', 't', 'n'), and show-specific
# terms ('traitor', 'faithful', 'bbc', 'thetraitorsuk', ...).
stop_words = ['co','s','t','n',"they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", 
                   "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", 
                   "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", 
                   "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", 
                   "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", 
                   "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
                   "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", 
                   "own", "same", "so", "than", "too", "very","can", "will", "just","should",
                   "now",'anyone','today','yesterday','day', 'already',"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", 
                   "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
                   'traitor','thetraitor','faithful','u', 'uk','bbc','traitors','thetraitors',
                   'thetraitorsuk']
# Every lowercase ASCII letter, so single-character tokens are filtered too
# (this already covers the one-letter entries in stop_words above).
letters = list(string.ascii_lowercase)
# Combined list; the concatenation keeps any duplicates.
all_stopwords = stop_words + letters

### Data Preprocessing

In [None]:
#Data PreProcessing
def processedTweets(tweet):
  """Lowercase, tokenize and lemmatize a tweet, dropping stopwords and punctuation.

  Args:
    tweet: text to process.

  Returns:
    The surviving lemmatized tokens joined by single spaces.
  """
  # Sets give constant-time membership checks for every token.
  stopword_set = set(all_stopwords)
  punctuation_chars = set(string.punctuation)
  lemmatizer = WordNetLemmatizer()

  lemma_tokens = []
  for token in word_tokenize(tweet.lower()):
    if token in stopword_set:
      continue
    # Drop tokens made up solely of punctuation characters. Checking
    # `token in string.punctuation` is a substring test, which lets
    # multi-character tokens such as "..." or "!!" slip through.
    if all(ch in punctuation_chars for ch in token):
      continue
    lemma_tokens.append(lemmatizer.lemmatize(token))
  return ' '.join(lemma_tokens)


In [None]:
#Apply processedTweets() to the 'Cleaned_Tweets' column and keep the result
#in a new 'Processed_Tweets' column, then display the frame
tweet_df['Processed_Tweets'] = tweet_df['Cleaned_Tweets'].apply(processedTweets)
tweet_df

In [None]:
#Get full content
# Concatenate every processed tweet into one space-separated string
Fulltweet = ' '.join(tweet_df['Processed_Tweets'].tolist())
tweet_df

In [None]:
#Noise scan
# Same check as on the cleaned text, now on the processed tweets: show the
# 'text_noise' entry of each TextFrame's noise_scan() result (not stored)
tweet_df['Processed_Tweets'].apply(lambda x: nt.TextFrame(x).noise_scan()['text_noise'])

In [None]:
#Drop non-required columns
# With inplace=False, drop() returns a new frame and leaves the original
# untouched, so the result must be assigned back for the columns to go away.
# errors='ignore' keeps the cell re-runnable once they have been removed.
tweet_df = tweet_df.drop(columns=['TweetText','Cleaned_Tweets'], errors='ignore')
tweet_df.head()

### Sentiment Analysis

In [None]:
#Polarity Score
def getPolarity(tweet):
    """Return the polarity component of TextBlob's sentiment for `tweet`."""
    blob = TextBlob(tweet)
    return blob.sentiment.polarity

#Sentiment category
def getSentimentTextBlob(polarity):
    """Map a polarity score to "Negative", "Neutral" or "Positive".

    Scores below zero are negative, exactly zero is neutral, and anything
    else falls through to positive.
    """
    if polarity < 0:
        return "Negative"
    return "Neutral" if polarity == 0 else "Positive"

In [None]:
#Create columns for Polarity Score and Sentiment
tweet_df['Polarity']=tweet_df['Processed_Tweets'].apply(getPolarity)
tweet_df['Sentiment']=tweet_df['Polarity'].apply(getSentimentTextBlob)
# Only a cell's last expression is displayed, so the class counts are printed
# explicitly instead of being computed and thrown away.
print(tweet_df['Sentiment'].value_counts())
tweet_df.head()

In [None]:
#Save the frame to a CSV file (to_csv writes CSV text, not an Excel workbook)
tweet_df.to_csv('TheTraitors.csv')

In [None]:
#Create WordCloud
# Build the cloud from the concatenated processed tweets
wc = WordCloud(
    collocations=False,
    max_words=500,
    background_color='black',
)
wc.generate(Fulltweet)

# Draw the cloud on an 8x8 figure with the axes hidden and no padding
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wc)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
# Write the generated word cloud to an image file next to the notebook
wc.to_file('wordcloud.png')

<wordcloud.wordcloud.WordCloud at 0x7f8f06c7bc10>

### Conclusion
The exported data frame will be used in Tableau to create a dashboard and visualise results.
