In [2]:
## 1. Prepare (data for exploration)
#import all necessary libraries
import snscrape.modules.twitter as sntwitter
import pandas as pd
import os
import re
from collections import Counter
import nltk
#nltk.download('punkt')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer

In [None]:
#list of airing dates for all 10 episodes
episode_dates = (
    "2022-08-22", "2022-08-29", "2022-09-05", "2022-09-12", "2022-09-19",
    "2022-09-26", "2022-10-03", "2022-10-10", "2022-10-17", "2022-10-24"
                )
#list all necessary variables
tweets = []
limit = 10000
eps = 0

In [None]:
#Scraping relevant tweets using snscrape from twitter
for episode_date in episode_dates:
    #description of criteria to query twitter data
    query = '(#hod OR #houseofdragons OR #houseofthedragon OR "house of dragon" OR "house of the dragon")'
    query += 'lang:en' " "'until:'+episode_date

    for tweet in sntwitter.TwitterSearchScraper(query).get_items():
        # print(vars(tweet))
        # break
        if len(tweets) == limit:
            break
        else:
            tweets.append(
                [tweet.date, tweet.user.username, tweet.user.location, 
                tweet.content, tweet.replyCount, tweet.retweetCount, tweet.likeCount]
                )
        
    df = pd.DataFrame(tweets, columns=['Date', 'User', 'Location', 'Tweet', 'Replies', 'RTs', 'Likes'])
    eps = eps + 1

    # to save to csv
    df.to_csv(f'hod_tweets_E0{eps}_{episode_date}.csv', index = False)
    tweets.clear()

In [None]:
#combine all 10 CSV files into a single CSV file Master_HoD_dataset.csv
master_df = pd.DataFrame()

for file in os.listdir(os.getcwd()):
	if file.endswith(".csv"):
		master_df = master_df.append(pd.read_csv(file))

master_df.to_csv("Master_HoD_dataset.csv", index = False)

In [3]:
#read Master_HoD_dataset.csv into dataframe
df = pd.read_csv("C:/Users/kwame/Documents/twitter_analysis/Master_HoD_dataset.csv")

In [4]:
## 2. Process (data from dirty to clean
#Check first five rows
df.head()

Unnamed: 0,Date,User,Location,Tweet,Replies,RTs,Likes
0,2022-10-23 23:59:59+00:00,ItstWOTterTime,,"Toddler put to bed by the husband, kitchen cle...",0,0,4
1,2022-10-23 23:59:58+00:00,CustardCream02,UK,How do people ever support this psychopath wom...,0,0,1
2,2022-10-23 23:59:51+00:00,Melmaxxx,Rent free,House of the dragon time.....,0,0,0
3,2022-10-23 23:59:50+00:00,GoonerMcc,,Only just caught up on the penultimate episode...,0,0,0
4,2022-10-23 23:59:44+00:00,MohamadHowar,Kingdom of Saudi Arabia,@HouseofDragon The episode title “by accident”...,0,0,0


In [5]:
#Inspect DataFrame
df.shape

(100000, 7)

In [6]:
#Inspect DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   Date      100000 non-null  object
 1   User      100000 non-null  object
 2   Location  69757 non-null   object
 3   Tweet     100000 non-null  object
 4   Replies   100000 non-null  int64 
 5   RTs       100000 non-null  int64 
 6   Likes     100000 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 5.3+ MB


In [7]:
#Check for missing values
df.isna()

Unnamed: 0,Date,User,Location,Tweet,Replies,RTs,Likes
0,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
99995,False,False,True,False,False,False,False
99996,False,False,False,False,False,False,False
99997,False,False,True,False,False,False,False
99998,False,False,True,False,False,False,False


In [8]:
#Split time from date in the "Date" column
df["Date"]= df["Date"].str[0:10]

In [9]:
#indentifier to remove "#" from hashtags
pattern = r'#\w+'

#function for identifying and isolating all hashtags
def getHashtags(tweet):
    tweet = tweet.lower()
    tweet = re.findall(pattern, tweet)
    return " ".join(tweet)
        
#crate new column to contain hashtags
df["Hashtags"] = df["Tweet"].apply(getHashtags)

    

In [10]:
#convert "Hashtags" data frame into list
Hashtags = df["Hashtags"].tolist()

new_Hashtags = []

#split hashtags in the "Hashtags" list 
for hashtag in Hashtags:
    hashtag = hashtag.split()
    for each_item in hashtag:
        new_Hashtags.append(each_item)
        
#Find the number of times each hashtag was used and read to new dataframe
hashtags_count = Counter(new_Hashtags)
df_hashtags = pd.DataFrame.from_dict(hashtags_count, orient = "index").reset_index()

#create columns for df_hashtags dataframe and sort from highest to lowest
df_hashtags.columns = ["Hashtags", "Count"]
df_hashtags.sort_values(by = "Count", ascending = False, inplace = True)



In [11]:
#Identify top 5 trends of the show
df_hashtags.head()

Unnamed: 0,Hashtags,Count
0,#houseofthedragon,38486
1,#hotd,5125
2,#houseofthedragonhbo,4484
5,#gameofthrones,2647
3,#tvtime,2506


In [12]:
#Finding the number of hashtags used to tweet about the show
df_hashtags.shape

(9621, 2)

In [13]:
#define list of cast and dragons on the show with possible mispellings
Cast = ["rhaenyra", "rhanyra", "rhenyra", "raenyra", "renyra", "rhenera" "ranyra", "raenys", "renys", "ranys", 
        "rhanys", "rhenys", "rhaenys", "daemon", "damon", "alicent", "alisent", "aliscent", "viserys", "vicerys"
        "viscerys", "aemond", "amond", "criston", "otto", "aegon", "agon", "corlys", "laenor", "lenor" 
        "lanor", "larys", "vaemond", "vemond", "vamond"
        ]

Dragons = ["syrax", "cyrax", "sirax", "cirax" "caraxes", "karaxes", "seasmoke", "sea smoke", "sea-smoke", "Seasmoke",
           "arrax", "arax", "vhagar", "vhegar", "vagar", "vegar", "vhaegar" "meleys", "maelys", "malys"
            "vermax", "arrax", "vermithor"
          ]

In [14]:
#function for extracting cast of the show
def getCast(tweet):
    tweet = tweet.lower()
    tokenized_tweet = word_tokenize(tweet)
    cast = [char for char in tokenized_tweet if char in Cast]
    return " ".join(cast)
        
#function for extracting dragons of the show
def getDragons(tweet):
    tweet = tweet.lower()
    tokenized_tweet = word_tokenize(tweet)
    dragon = [char for char in tokenized_tweet if char in Dragons]
    return " ".join(dragon)

#create new column to contain cast
df["Cast"] = df["Tweet"].apply(getCast)

#create new column to contain dragons
df["Dragons"] = df["Tweet"].apply(getDragons)



In [15]:
#convert "Cast"  & "Dragons" data frame into list
cast_list = df["Cast"].tolist()
dragon_list = df["Dragons"].tolist()

#split cast and dragons in the "Cast" and "Dragons" lists
all_cast_list = []
for cast in cast_list:
    cast = cast.split()
    for mention in cast:
        all_cast_list.append(mention)
        
all_dragons_list = []
for dragon in dragon_list:
    dragon = dragon.split()
    for mention in dragon:
        all_dragons_list.append(mention)

In [16]:
#dictionary of mispelled Cast names and their correct spelling
Cast_names_dict = {
                            "Rhaenyra": ["rhaenyra", "rhanyra", "rhenyra", "raenyra", "renyra", "rhenera", "ranyra"],
                            "Rhaenys": ["raenys", "renys", "ranys", "rhanys", "rhenys", "rhaenys"],
                            "Daemon": ["daemon", "damon"],
                            "Alicent": ["alicent", "alisent", "aliscent"],
                            "Viserys": ["viserys", "vicerys", "viscerys"],
                            "Aemond": ["aemond", "amond"],
                            "Aegon": ["aegon", "agon"],
                            "Laenor": ["laenor", "lenor", "lanor"],
                            "Vaemond": ["vaemond", "vemond", "vamond"],
                            "Otto": ["otto"],
                            "Larys": ["larys"],
                            "Criston": ["criston"],
                            "Corlys": ["corlys"],
                      }

#dictionary of mispelled dragon names and their correct spelling
Dragons_names_dict = {
                            "Syrax": ["syrax", "cyrax", "sirax", "cirax"],
                            "Caraxes": ["caraxes", "karaxes"],
                            "Seasmoke": ["seasmoke", "sea smoke", "sea-smoke", "seasmoke"],
                            "Arrax": ["arrax", "arax"],
                            "Vhagar": ["vhagar", "vhegar", "vagar", "vegar", "vhaegar"],
                            "Meleys": ["meleys", "maelys", "malys"],
                            "Vermithor": ["vermithor", "vemithor"],
                            "Vermax": ["vermax"],
                      }

#loop through list of all cast names (including mispelled ones) and replace allnames with the appropriate 
#name (keys) in the Cast_names_dict dictionary
for name in range(len(all_cast_list)):
    for correct, wrong in Cast_names_dict.items():
        for each_name in wrong:
            if all_cast_list[name] == each_name:
                all_cast_list[name] = correct


#loop through list of all cast names (including mispelled ones) and replace allnames with the appropriate 
#name (keys) in the Cast_names_dict dictionary
for dragon in range(len(all_dragons_list)):
    for correct, wrong in Dragons_names_dict.items():
        for each_name in wrong:
            if all_dragons_list[dragon] == each_name:
                all_dragons_list[dragon] = correct

In [17]:
## 3. Analyze (data to answer questions)
#Find the number of times each cast member/dragon was mentioned and read to new dataframe
cast_count = Counter(all_cast_list)
df_Cast = pd.DataFrame.from_dict(cast_count, orient = "index").reset_index()

dragons_count = Counter(all_dragons_list)
df_Dragons = pd.DataFrame.from_dict(dragons_count, orient = "index").reset_index()

#create columns for df_Cast/df_Dragons dataframes and sort from highest to lowest
df_Cast.columns = ["Cast", "Count"]
df_Cast.sort_values(by = "Count", ascending = False, inplace = True)

df_Dragons.columns = ["Dragons", "Count"]
df_Dragons.sort_values(by = "Count", ascending = False, inplace = True)

In [18]:
#identify top 5 most talked about characters on the show
df_Cast.head()

Unnamed: 0,Cast,Count
7,Rhaenyra,3755
0,Daemon,3131
8,Alicent,1824
6,Viserys,1480
1,Aemond,772


In [19]:
#identify top 5 most talked about dragons on the show
df_Dragons.head()

Unnamed: 0,Dragons,Count
1,Vhagar,411
2,Syrax,175
4,Seasmoke,87
0,Arrax,67
3,Vermithor,19


In [20]:
del df["Location"]
del df["Hashtags"]
del df["Cast"]
del df["Dragons"]

In [21]:
#Find the number of times each user tweeted
user_list = df["User"].tolist()
user_count = Counter(user_list)
df_User = pd.DataFrame.from_dict(user_count, orient = "index").reset_index()

#create columns for df_Users dataframe and sort from highest to lowest
df_User.columns = ["User", "Count"]
df_User.sort_values(by = "Count", ascending = False, inplace = True)

In [22]:
df_User.head()

Unnamed: 0,User,Count
2536,bigdogXVI,690
26,tvsotherworlds,267
882,UomoeleganteIt,179
5686,oochotd,172
2355,Matt_SmithNews,168


In [23]:
#save all final files for analysis on tableau
df.to_csv("Hod_Final_dataset.csv", index = False)
df_hashtags.to_csv("Hod_Hashtags.csv", index = False)
df_Cast.to_csv("Hod_Cast.csv", index = False)
df_Dragons.to_csv("Hod_Dragons.csv", index = False)
df_User.to_csv("Hod_Tweeters.csv", index = False)