# An Analysis of Twitter Users' Perception of the Harry and Meghan Documentary Series

<a id = 'introduction'></a>

### 1. Introduction

The Harry and Meghan documentary series explores the early days of the courtship between the Duke and Duchess of Sussex and the events that led them to feel forced to relinquish their duties in the royal family.
I carried out a sentiment analysis to measure Twitter users' perception of, and conversation about, the documentary series, using Twitter's API and the Python library Tweepy to collect the tweets.

### Contents

<ol>
    <li><a href='#introduction'>Introduction</a></li>
    <li><a href='#data_collection'>Data Collection</a></li>
    <li><a href='#data_preprocessing'>Data Preprocessing</a></li>
    <li><a href='#sentiment_analysis'>Sentiment Analysis</a></li>
    <li><a href='#sentiment_visualisation'>Sentiment Visualisations</a></li>
</ol>

<a id = 'data_collection'></a>

### 2. Data Collection

In [4]:
# Import the libraries

import tweepy 
from wordcloud import WordCloud, STOPWORDS
from textblob import TextBlob
import pandas as pd
import numpy as np
import re
import csv
import matplotlib.pyplot as plt
from advertools.emoji import EMOJI
plt.style.use('fivethirtyeight')

from collections import Counter

import nltk
from nltk.corpus import stopwords

In [None]:
# Twitter API credentials (the values below are placeholders; supply real keys locally and do not commit them)

apiKey = '########################'

apiKeySecret = '##################################################'

accessToken = 'XXXXXXXXX-########################################'

accessTokenSecret = '#############################################'

In [None]:
# Create the authentication object from the API key and its secret
authenticate = tweepy.OAuthHandler(apiKey, apiKeySecret)

# Set the access token and access token secret on the authentication object
authenticate.set_access_token(accessToken, accessTokenSecret)


# Create the API object while passing in the auth information
api = tweepy.API(authenticate, wait_on_rate_limit=True) # sleeps when the API rate limit is reached
# NOTE(review): the assignment below only creates a module-level variable; it
# is not passed to tweepy.API and is not referenced elsewhere in this notebook.
# It looks like a leftover that contradicts wait_on_rate_limit=True - confirm and remove.
sleep_on_rate_limit=False

In [None]:
# Accumulator for the downloaded tweets; get_tweets() appends one row per tweet
tweet_data = []

def get_tweets(search_query, limit_of_tweet):
    """Download tweets matching a search query into ``tweet_data``.

    Pages through ``api.search_tweets`` with a ``tweepy.Cursor`` (English
    tweets, up to 100 per request, extended tweet mode so ``full_text`` is
    available) and appends one row per tweet to the module-level
    ``tweet_data`` list.

    Args:
        search_query: Query string passed as ``q`` to the search endpoint.
        limit_of_tweet: Maximum number of tweets to iterate over.

    Returns:
        None. The rows are accumulated in ``tweet_data`` in the order
        [id, created_at, screen_name, location, full_text, retweet_count,
        favorite_count].
    """
    tweets = tweepy.Cursor(
        api.search_tweets,
        q=search_query,
        lang='en',
        count=100,
        tweet_mode='extended',
    ).items(limit_of_tweet)
    for tweet in tweets:
        # Fix: rows used to be appended to the undefined name ``data``,
        # which raised NameError; they belong in ``tweet_data``.
        tweet_data.append([
            tweet.id,               # id of the tweet
            tweet.created_at,       # date and time of tweet
            tweet.user.screen_name, # username
            tweet.user.location,    # location of the user
            tweet.full_text,        # tweet
            tweet.retweet_count,    # number of retweets on the tweet
            tweet.favorite_count,   # number of likes on the tweet
        ])
        
# Hashtags to look out for; the suffix added below filters out retweets and replies
queryTopic = '#HarryAndMeghanNetfix OR #HarryAndMeghanNetflix OR #MeghanandHarryNetflix OR #HarryandMeghanonNetflix OR #MeghanandHarryonNetflix OR #HarryandMeghan OR #MeghanandHarry OR #PrinceHarry OR #MeghanMarkle'
searchQuery = queryTopic + " -filter:retweets AND -filter:replies" 

# Pass in the parameters: the search query and the maximum number of tweets to collect
get_tweets(searchQuery, 50000)  

# Build the dataframe from the collected rows and name each column
# (column order matches the order of the fields appended in get_tweets)
tweet_df = pd.DataFrame(tweet_data, columns = ['Id', 'Date_time_of_time', 'Username', 'Location', 'Tweet',  'Retweet', 'Like']) 

In [None]:
# Print the dataframe (the whole frame is passed to print, not only its head)
print(tweet_df)

In [None]:
# Define a function to extract the hashtags of a tweet (the leading # is kept)
def getHashtags(tweet):
    """Return the lowercased hashtags of ``tweet`` joined by single spaces.

    A hashtag is a '#' followed by one or more word characters. An empty
    string is returned when the tweet contains no hashtag.
    """
    hashtags = re.findall(r'\#\w+', tweet.lower())
    return " ".join(hashtags)

# Extract the hashtags by applying the function to the Tweet column 
tweet_df['Hashtag'] = tweet_df['Tweet'].apply(getHashtags)
tweet_df.head()

In [None]:
# Pull the per-tweet hashtag strings out of the dataframe as a plain list
hashtag_list = tweet_df['Hashtag'].tolist()

# Flatten: every tweet may carry several space-separated hashtags,
# so split each string and keep one entry per hashtag occurrence
hashtag = [tag for tags in hashtag_list for tag in tags.split()]

# Tally how often each hashtag was used and tabulate the result,
# most frequent hashtag first
counts = Counter(hashtag)
hashtag_df = (
    pd.DataFrame.from_dict(counts, orient='index')
    .reset_index()
)
hashtag_df.columns = ['Hashtag', 'Count']
hashtag_df.sort_values(by='Count', ascending=False, inplace=True)

In [None]:
# Save the dataframe as a csv file
tweet_df.to_csv('HarryandMeghan.csv')

In [None]:
# Read in the file
tweet_df = pd.read_csv('HarryandMeghan.csv')

In [None]:
# Print the head of the dataframe
tweet_df.head()

<a id = 'data_preprocessing'></a>

### 3. Data Preprocessing

In [None]:
# Inspect dataframe
tweet_df.shape

In [None]:
# Check for missing values
tweet_df.isnull().sum()

In [None]:
# Fill missing locations with "Unspecified"
tweet_df["Location"] = tweet_df["Location"].fillna('Unspecified')
tweet_df.head()

In [None]:
# Confirm that the missing locations are filled
tweet_df["Location"].isnull().sum()

In [None]:
# Check for duplicates
tweet_df.duplicated(subset='Id').sum()

In [None]:
# Drop unnecessary columns by name: the index column that to_csv wrote and
# read_csv loaded back as 'Unnamed: 0', and the tweet 'Id'.
# Dropping by name instead of by position keeps this cell safe to re-run:
# the positional drop removed two more columns on every execution.
# errors='ignore' turns a repeated run into a no-op.
tweet_df.drop(columns=['Unnamed: 0', 'Id'], errors='ignore', inplace=True)

In [None]:
# Print the head of the dataframe
tweet_df.head()

In [None]:
# Define stopwords
stop_words = stopwords.words('english')
print(stop_words)

In [None]:
# Instantiate emoji
EMOJI_PATTERN = EMOJI

In [None]:
# Create a function to clean the tweets
def clean_text(text):
    """Normalise a tweet before sentiment scoring.

    Steps, in order: lowercase, remove hyperlinks, remove @mentions, remove
    selected symbols, remove emojis, replace brackets/parentheses and
    question marks with spaces, and drop stopwords.

    Relies on the notebook-level ``EMOJI_PATTERN`` and ``stop_words``.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned text with the remaining words joined by single spaces.
    """
    text = text.lower()
    # Remove hyperlinks first. They used to be removed last, after ':' and
    # '/' had already been stripped, so the 'https?://' pattern could never
    # match and URL residue stayed in the text.
    text = re.sub(r'https?://\S+', '', text)
    # The dot is escaped so only a literal 'www.' prefix is treated as a link
    text = re.sub(r'\bwww\.\S+', '', text)
    # Remove @ mentions; \w also covers the underscores allowed in handles
    text = re.sub(r'@\w+', '', text)
    # Remove symbols
    text = re.sub(r'[#:_!/]', '', text)
    # Remove emojis
    text = re.sub(EMOJI_PATTERN, r'', text)
    # Remove punctuations
    text = re.sub(r'[()?]', ' ', text)
    # Remove text enclosed in square brackets (raw string avoids invalid escapes)
    text = re.sub(r'\[.*?\]', ' ', text)
    # Remove stopwords
    words = [word for word in text.split() if word not in stop_words]

    return " ".join(words)
    
    
# Cleaning the text
tweet_df['Tweet'] = tweet_df['Tweet'].apply(clean_text)

In [None]:
# Show the head of the clean dataframe
tweet_df.head()

<a id = 'sentiment_analysis'></a>

### 4. Sentiment Analysis

In [None]:
# Create a function to obtain the subjectivity
def getSubjectivity(text):
    """Return the subjectivity that TextBlob reports for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

# Create a function to obtain the polarity
def getPolarity(text):
    """Return the polarity that TextBlob reports for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

# Create the subjectivity and polarity columns
tweet_df['Subjectivity'] = tweet_df['Tweet'].apply(getSubjectivity)
tweet_df['Polarity'] = tweet_df['Tweet'].apply(getPolarity)

In [None]:
# Show the new dataframe with the new columns
tweet_df.head()

In [None]:
# Create a function to label a polarity score as negative, neutral or positive
def getSentimentAnalysis(score):
    """Map a polarity score to a sentiment label.

    Scores below zero are 'Negative', a score of exactly zero is 'Neutral'
    and every other score is 'Positive'.
    """
    return 'Negative' if score < 0 else ('Neutral' if score == 0 else 'Positive')
    
# Create a sentiment column and append it to the original dataframe
tweet_df['Sentiment'] = tweet_df['Polarity'].apply(getSentimentAnalysis)

In [None]:
# Show the new dataframe with the new column
tweet_df.head()

<a id = 'sentiment_visualisation'></a>

### 5. Sentiment Visualisations

In [None]:
# Split the dataframe into one dataframe per sentiment label
positive_tweet = tweet_df.loc[tweet_df["Sentiment"].eq("Positive")]
neutral_tweet = tweet_df.loc[tweet_df["Sentiment"].eq("Neutral")]
negative_tweet = tweet_df.loc[tweet_df["Sentiment"].eq("Negative")]

In [None]:
# Create a function to count the values of a column and their percentage share in a single dataframe
def count_values(data, feature):
    """Tabulate how often each value of ``data[feature]`` occurs.

    Missing values are counted as their own category. The result has one row
    per distinct value and two columns: 'Total' (the count) and 'Percentage'
    (the share of all rows, times 100, rounded to two decimals).
    """
    column = data.loc[:, feature]
    totals = column.value_counts(dropna=False)
    shares = column.value_counts(dropna=False, normalize=True).mul(100).round(2)
    return pd.concat([totals, shares], axis=1, keys=['Total', 'Percentage'])

# Count_values for sentiment
count_values(tweet_df,"Sentiment")

In [None]:
# Select the text of the tweets labelled 'Positive' and show the first ten
ptweet = tweet_df.loc[tweet_df.Sentiment == 'Positive', 'Tweet']

ptweet.head(10)

In [None]:
# Plot a pie chart of the sentiment percentages, with a white centre so it reads as a ring
pie_chart = count_values(tweet_df,"Sentiment")


# Slice labels come from the index of the summary table, slice sizes from its Percentage column
names= pie_chart.index
size= pie_chart["Percentage"]

# Create a circle for the centre of the plot
my_circle=plt.Circle( (0,0), 0.7, color='white')
# NOTE(review): the colours are applied by slice position, i.e. in the row
# order of pie_chart, not by sentiment label - confirm this is intended.
plt.pie(size, labels=names, colors=['black','cyan','blue'])
# Add the white circle on top of the pie on the current figure's axes
p=plt.gcf()
p.gca().add_artist(my_circle)
plt.show()

In [None]:
# Show the value count
# NOTE(review): this expression is not the last one in the cell and is not
# printed, so its result is presumably discarded - confirm.
tweet_df['Sentiment'].value_counts()

# Plot and visualise the counts as a bar chart
plt.title('Sentiment Analysis')
plt.xlabel('Sentiment')
plt.ylabel('Counts')
tweet_df['Sentiment'].value_counts().plot(kind='bar')
plt.show()

In [None]:
# Join every tweet in the Tweet column into one string and generate the word cloud from it
everyword = ' '.join(tweet_df['Tweet'])
wordcloud = WordCloud(
    stopwords=STOPWORDS,
    background_color = 'black',
    width = 600,
    height = 400,
    collocations = False,
    max_words = 100,
    max_font_size = 400).generate(everyword)

# Draw the cloud without axes, give it a title and save the figure as a PNG file
plt.figure(figsize=(6,6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title('Word Cloud for Harry and Meghan Series')
plt.savefig("Harry_Meghan_wordcloud.png", format="png")

In [None]:
# Export the dataframes as csv files to build a dashboard with Power BI
tweet_df.to_csv('HarryandMeghan1.csv')

hashtag_df.to_csv('hashtags.csv')