In [None]:
# IF you have downloaded the 'example_twitter_data.pkl' but 
# don't have a file called 'example_twitter_data_unpacked.pkl', run this cell

import pandas as pd

def extract_original_tweets(df):
    """Add the original tweet behind every retweet as a row of its own.

    Every non-null entry of ``df['retweeted_status']`` is turned into a row
    (the entries are expected to be dicts describing the original tweet),
    stacked underneath ``df`` and de-duplicated on ``id`` so that each tweet
    appears only once. When an id occurs more than once the first occurrence
    is the one that is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweets with at least an ``id`` and a ``retweeted_status`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. ``df`` itself is not modified.
    """
    rt_filter = df['retweeted_status'].notna()
    retweet_df = pd.DataFrame(df.loc[rt_filter, 'retweeted_status'].tolist())

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to stack frames. When there are no retweets there is nothing to
    # stack, so the empty frame is left out of the concat altogether.
    frames = [df] if retweet_df.empty else [df, retweet_df]
    combined = pd.concat(frames)
    return combined.drop_duplicates('id').reset_index(drop=True)

# Load the raw collected tweets.
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# you created yourself or that come from a source you trust.
temp = pd.read_pickle('example_twitter_data.pkl')

# Add the original tweet behind each retweet as its own row (de-duplicated on 'id').
temp = extract_original_tweets(temp)
# Convert to one dict per row so json_normalize can flatten nested dict
# values into dotted column names (e.g. 'user.screen_name').
df_dicts = temp.to_dict(orient='records')
temp = pd.json_normalize(df_dicts)
# Save the flattened frame under the name the rest of the notebook loads.
temp.to_pickle('example_twitter_data_unpacked.pkl')


# SC207 - Session 7
# APIs - Exploring and Summarising Twitter Data
<img src="https://github.com/Minyall/sc207_materials/blob/master/images/tweepy.jpg?raw=true" align="right" width="300">


What kinds of exploratory analysis can we run on social media data? This session covers various examples of the kinds of insights that can be gathered through the analysis of social media data, and how to present those results.

[Tweepy Documentation](http://docs.tweepy.org/en/stable/)

## Section b) Exploring your data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def tweet_url(row):
    """Build the twitter.com URL that points at a single tweet.

    ``row`` is anything that can be indexed by key (for example a DataFrame
    row) and that provides ``'user.screen_name'`` and ``'id'``.
    """
    screen_name = row['user.screen_name']
    tweet_id = row['id']
    return f"https://twitter.com/{screen_name}/status/{tweet_id}"

In [None]:
# Load the flattened tweets written by the preparation cell at the top of
# this notebook, then display how many rows were loaded.
df = pd.read_pickle('example_twitter_data_unpacked.pkl')
len(df)

In [None]:
# filter out retweets, so only original tweets are shown 
# - any retweets in our data would have had their original tweets extracted in the last session

original_tweets_filter = 
df = df[original_tweets_filter]

In [None]:
df.info()

## Explorations: Favourite Tweets
A simple one to begin with. Which tweets got the most 'favourites' or hearts?
Let's look at the top 10

In [None]:
subset = ['id','user.screen_name','user.followers_count','favorite_count','retweet_count','full_text']

top_favs = 

for index_number, row in top_favs.iterrows():
    
    print('*'*10)
    print("INDEX:", index_number)
    print("USER:", row['user.screen_name'])
    print("FAV:", row['favorite_count'])
    print("RT:", row['retweet_count'])
    print(row['full_text'])
    print(tweet_url(row))

If you prefer, you can always export subsets of your data like this one, to a csv file to easily view in Excel or similar.

In [None]:
top_favs

In [None]:
top_favs.to_csv('top_favs.csv')

## The Skewed World of Twitter Interactivity
Whilst there are approximately 6,000 tweets posted per second, the vast majority of them receive little attention. Tweet interaction tends to skew heavily such that the majority have 0 retweets/favorites/replies.

If we look above we can see a pretty swift drop in the number of interactions across the top tweets.

This can make exploring our data from social media difficult, particularly around these metrics, as often graphs will skew heavily around 0 with a few points then reaching 100,000 + interactions.

Because of the scale of the figures we're looking at, Pandas may use *scientific notation*, which is used to express very big numbers in a condensed format.

<img src="https://github.com/Minyall/sc207_materials/blob/master/images/scientific_notation.png?raw=true" align="left" width="300">


In [None]:
# check the data's overall distribution on our subset of columns using describe


In [None]:
# use seaborn lmplot to see retweet count vs favourite count


fig = plt.gcf()
fig.set_size_inches(10, 5)
plt.show()

If we add some extra percentiles to `describe` we can see the values needed to enter...
 - 0.9: the top 10%
 - 0.99: the top 1%
 - 0.999: the top 0.1%

In [None]:
df[subset].describe(percentiles=[0.9,0.99,0.999])

From the numbers above we can compare whether favorites or retweets are more evenly distributed in this topic. We can visualise this difference using a `boxen` plot, a type of box plot that breaks the box up so that the width indicates the number of tweets that fall into that value range.

In [None]:
# We melt our data so that our data is in a shape that seaborn can understand for the plot we want
cat_plot_data = 
cat_plot_data

In [None]:
# use a seaborn cat plot to examine the distribution of favourite and retweet counts


fig = plt.gcf()
fig.set_size_inches(10, 5)
plt.show()

Because of the heavy skew there isn't even a clear trend when it comes to followers_count vs retweet count - which we might have expected.

In [None]:
# use a seaborn lmplot to examine user followers count vs retweet count

fig = plt.gcf()
fig.set_size_inches(10, 5)
plt.show()

We can check this to see which are the top tweets for retweet count, and which are top for follower count. It is not always the case that the highest follower counts mean the greatest interaction. Often it is accounts with pre-existing social capital that have high follower counts. This doesn't necessarily always result in high engagement with their content.

In [None]:
# check to see who has high retweet count by sorting and examining the top 10



In [None]:
# check to see who has a high followers_count by sorting and examining the top 10




# #Hashtags
Examining the hashtags of your data can give you a sense of the discourses around a particular topic, and inform you of connectivity to other issues. The first step is to get the hashtags out of their nested data structure.

For each entry in `entities.hashtags` we see a list, which if it is not empty, contains a set of dictionaries, and one value in each dictionary, the `text` value, is what we actually want.

In [None]:
subset = ['id','entities.hashtags']

In [None]:
df[subset]

Each tweet contains a list, because each tweet could have one or more hashtags associated with it. The first step is to `.explode` the column, such that each hashtag gets its own row.

In [None]:
hashtag_data = 
hashtag_data

In [None]:
# We drop any rows without hashtags at all so that our data is just tweets with hashtags associated
not_empty = 
hashtag_data = 
hashtag_data

In [None]:
# if we examine one row... remember iloc indexes purely by row and 
# column position so first row, whatever its actual index number, is iloc[0]
example_row = hashtag_data.iloc[0]
example_row

In [None]:
# it is a dictionary, so we can get the 'text' by just using a key
example_row['entities.hashtags']['text']

Ultimately what we want is to be able to do that to each row and then save the result in a new column. Enter pandas `.apply`!

In [None]:
# First we create a function that does the job we want

def extract_entity(entity_dict, entity_key):
    return 

extract_entity(example_row['entities.hashtags'], 'text')

In [None]:
# Then we apply it to the column containing the data. We'll do it without assigning first just to check it works...



In [None]:
hashtag_data['tag'] = 
hashtag_data

In [None]:
# now we can ask how many times each tag is used...


In [None]:
# assign the value counts, with a reset index and limited to the top 20, to plot_tag_data
plot_tag_data = 
plot_tag_data

In [None]:
# use a seaborn barplot for the plot_tag_data
plot = 


plot.set_xticklabels(plot.get_xticklabels(), 
                          rotation=90, 
                          horizontalalignment='right')
plt.title("Top 20 Hashtags")
plt.xlabel('Tag')
plt.ylabel('Freqency')

fig = plt.gcf()
fig.set_size_inches(10, 5)
plt.show()

As usual, all the above can be condensed down into a few lines, and/or a small function...

In [None]:
# We made this one earlier
def extract_entity(entity_dict, entity_key):
    """Return the value stored under ``entity_key`` in one entity dictionary."""
    return entity_dict[entity_key]


def extract_entities(df, entity_column, entity_key, new_column_name):
    """Unpack a column of entity lists into one row per entity.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the tweets. It is not modified.
    entity_column : str
        Column whose cells are lists of entity dictionaries
        (for example ``'entities.hashtags'``).
    entity_key : str
        Key to read from every entity dictionary (for example ``'text'``).
    new_column_name : str
        Name of the column that receives the extracted values.

    Returns
    -------
    pandas.DataFrame
        One row per entity, keeping the original index labels (so a tweet
        with several entities appears several times). Rows whose list was
        empty or missing are dropped.
    """
    # explode() already returns a new frame: one row per list element, with
    # NaN in rows whose list was empty.
    entity_data = df.explode(entity_column)

    # Keep only the rows that actually hold an entity. The explicit copy makes
    # the result an independent frame, so adding a column below does not
    # trigger pandas' SettingWithCopyWarning.
    not_empty = entity_data[entity_column].notna()
    entity_data = entity_data[not_empty].copy()

    entity_data[new_column_name] = entity_data[entity_column].apply(extract_entity, entity_key=entity_key)
    return entity_data
    

In [None]:
hashtag_data = 

In [None]:
subset = ['id','entities.hashtags','tag']
hashtag_data[subset]

With the way we've designed our function, we can apply it to any entities column...

In [None]:
df['entities.user_mentions']

In [None]:
user_mentions = 

In [None]:
subset = ['id','entities.user_mentions','mentioned']
user_mentions[subset]

In [None]:
# Count how often each account is mentioned and keep the 20 most frequent.
# The two columns are named explicitly ('user', 'count') because the names
# produced by a bare reset_index() differ between pandas versions:
# 'index'/'mentioned' before 2.0, 'mentioned'/'count' from 2.0 onwards.
plot_data = (
    user_mentions['mentioned']
    .value_counts()
    .head(20)
    .rename_axis('user')
    .reset_index(name='count')
)

plot = sns.barplot(x='user', y='count', data=plot_data)
plot.set_xticklabels(plot.get_xticklabels(), 
                          rotation=90, 
                          horizontalalignment='right')
plt.title("Top 20 Users Mentioned")
plt.xlabel('User')
plt.ylabel('Frequency')

fig = plt.gcf()
fig.set_size_inches(10, 5)
plt.show()

# Finding text
Sometimes it is useful to filter, or find Tweets based on their text content. For example in this debate, some mentioned John Bercow, as another case where bullying was mentioned. Let's create a variable that allows us to split our data based on whether Bercow is mentioned or not.

In [None]:
# Boolean Series: True where the tweet text contains 'bercow' in any letter case.
bercow_filter = df['full_text'].str.contains('bercow', case=False)

In [None]:
df['bercow_mentioned'] = bercow_filter

In [None]:
# Count the tweet ids in each bercow_mentioned group and draw one bar per group.
subset=['id','bercow_mentioned']
df[subset].groupby('bercow_mentioned').count().plot(kind='bar')

## Time Series of Tweets
Sometimes it is really useful to get a sense of the time distribution of tweets. We can use Time series information to...

- See trends such as peak times for particular topics
- Detect potential co-ordinated disinformation campaigns by examining...
  - the account creation date of all the accounts pushing a particular hashtag. Were a significant proportion of the accounts created in a small window of time?
  - the rate at which accounts are tweeting. Some accounts might tweet hundreds of times per hour - upwards of 50 is considered highly unusual.

To ensure Pandas understands that the information in a column is a date, we convert it into date format...

In [None]:
#transform to date objects
df['created_at'] = 


In [None]:
# By changing to a list of datetime objects pandas can now tell us more useful information, such as the earliest and latest date in the dataset
df['created_at']

In [None]:
# we can also filter it, such as asking for dates only after a certain point

date_filter = "2020-11-20"
recent_tweets = 

We then want to group our data into periods of time. There is no point grouping our data just on the 'created_at' column, because every time stamp will be slightly different by a second or two. Grouping by time needs a special object called a `Grouper`.

First we create a grouper. We provide it two arguments
- The `key` which is the column you want to group by
- The `freq` which specifies the time period you want to group by for example 'd' for day, or 'h' for hour, or 'min' for minute.
- You can see all the options for freq [here in the Pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases)


In [None]:
time_grouper = 
plot_data = 
plot_data

In [None]:
# use a seaborn relplot to plot the time series data, x=created_at, y='id'
plot = 

plt.title("Tweet Frequency by Hour")
plt.xlabel('Time')
plt.ylabel('Freqency')

fig = plt.gcf()
fig.set_size_inches(10, 5)
plt.show()

In [None]:
# we can also group by other values, such as the presence of the name 'bercow'
# Hourly buckets on the tweet timestamp.
time_grouper = pd.Grouper(key='created_at', freq='h')
# Number of tweet ids per (bercow_mentioned, hour) pair, with the group keys
# turned back into ordinary columns.
plot_data = (
    recent_tweets
    .groupby(['bercow_mentioned', time_grouper])['id']
    .count()
    .reset_index()
)
plot_data.head()

In [None]:
# One line per value of bercow_mentioned; ci=None switches off the confidence band.
plot = sns.relplot(x='created_at', y='id', hue='bercow_mentioned', kind='line', data=plot_data,ci=None)

plt.title("Tweet Frequency by Hour")
plt.xlabel('Time')
plt.ylabel('Frequency')

# Resize the current figure (the one relplot has just drawn) before showing it.
fig = plt.gcf()
fig.set_size_inches(10, 5)
plt.show()

## A slightly more complex one putting together entities and time series

In [None]:
# One row per hashtag used in the recent tweets, with the hashtag text in a new 'tag' column.
hashtag_data = extract_entities(recent_tweets, entity_column='entities.hashtags',entity_key='text', new_column_name='tag')

# Keep only the columns needed to count tags over time.
subset=['id','created_at','tag']
tag_time_data = hashtag_data[subset]
tag_time_data

In [None]:
# value_counts() sorts the most frequent value first, so the first five index
# labels are the five most used tags. Reading them straight from the index
# (instead of from a reset_index() column) does not depend on how the
# installed pandas version names that column.
top_five_tags = tag_time_data['tag'].value_counts().head(5).index.tolist()
top_five_tags

In [None]:
# Keep only the rows whose tag is one of the values in top_five_tags.
top_filter = tag_time_data['tag'].isin(top_five_tags)
top_data = tag_time_data[top_filter]
top_data

In [None]:
# Hourly buckets on the tweet timestamp.
time_grouper = pd.Grouper(key='created_at', freq='h')
# Number of tweet ids per (tag, hour) pair, with the group keys turned back
# into ordinary columns.
plot_data = (
    top_data
    .groupby(['tag', time_grouper])['id']
    .count()
    .reset_index()
)
plot_data

In [None]:
# One line per tag; ci=None switches off the confidence band.
plot = sns.relplot(x='created_at', y='id', hue='tag', kind='line', data=plot_data,ci=None)

plt.title("Tweet Frequency by Hour")
plt.xlabel('Time')
plt.ylabel('Frequency')

# Resize the current figure (the one relplot has just drawn) before showing it.
fig = plt.gcf()
fig.set_size_inches(10, 5)
plt.show()