<img src="https://github.com/Minyall/sc207_materials/blob/master/images/gephi_network.png?raw=true" align="right" width="300">


# SC207 - Session 8
# Social Network Analysis with Gephi - EXTRA

This notebook demonstrates how to create a network linking Tweets to the entities they contain. This allows us to later project a simplified graph of entity co-occurrence in Gephi.

In [1]:
import pandas as pd

In [2]:
def flatten_nested_dicts(df):
    """Expand dict-valued columns of ``df`` into dotted top-level columns.

    Every row is converted to a plain dict and handed to ``pd.json_normalize``,
    so a column ``entities`` holding ``{'hashtags': [...]}`` comes back as
    ``entities.hashtags``. List values are left as they are, and the result
    carries a fresh 0..n-1 index rather than the index of ``df``.
    """
    return pd.json_normalize(df.to_dict(orient='records'))

In [None]:
# Load the pickled tweet data from a path relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load .pkl files from a trusted source.
df = pd.read_pickle('example_twitter_data.pkl')

In [5]:
# Keep only the tweet id and its nested `entities` dictionary.
subset = ['id', 'entities']

data = df.loc[:, subset]
data.head()

Unnamed: 0,id,entities
0,1330137025148817408,"{'hashtags': [], 'symbols': [], 'user_mentions..."
1,1330137020711235586,"{'hashtags': [], 'symbols': [], 'user_mentions..."
2,1330137019679432707,"{'hashtags': [], 'symbols': [], 'user_mentions..."
3,1330137016881860610,"{'hashtags': [], 'symbols': [], 'user_mentions..."
4,1330137014180737025,"{'hashtags': [], 'symbols': [], 'user_mentions..."


In [6]:
# Spread the `entities` dictionary out into dotted `entities.*` columns.
data = data.pipe(flatten_nested_dicts)
data.head()

Unnamed: 0,id,entities.hashtags,entities.symbols,entities.user_mentions,entities.urls,entities.media
0,1330137025148817408,[],[],"[{'screen_name': 'Keir_Starmer', 'name': 'Keir...",[],
1,1330137020711235586,[],[],"[{'screen_name': 'mrjamesob', 'name': 'James O...",[],
2,1330137019679432707,[],[],"[{'screen_name': 'BorisJohnson', 'name': 'Bori...","[{'url': 'https://t.co/kaxOSjnOIu', 'expanded_...",
3,1330137016881860610,[],[],[],[],
4,1330137014180737025,[],[],"[{'screen_name': 'MarinaHyde', 'name': 'Marina...","[{'url': 'https://t.co/55QvxSnZbH', 'expanded_...",


In [8]:
# Narrow the frame to the tweet id and its list of hashtags.
# Empty lists in the preview only mean that those particular tweets carry no
# hashtags; other rows in the dataset may still have some.
hashtag_data = data.loc[:, ['id', 'entities.hashtags']]
hashtag_data.head()


Unnamed: 0,id,entities.hashtags
0,1330137025148817408,[]
1,1330137020711235586,[]
2,1330137019679432707,[]
3,1330137016881860610,[]
4,1330137014180737025,[]


In [11]:
# One row per (tweet, hashtag): explode() gives every list element its own row
# and turns an empty list into NaN, which is then dropped.
# The frame is rebuilt from `data` instead of overwriting its own input, so the
# cell can be re-run safely; a second explode() on the already-exploded values
# is not guaranteed to be a no-op.
hashtag_data = (
    data[['id', 'entities.hashtags']]
    .explode('entities.hashtags')
    # Only a missing hashtag should remove a row, not a NaN in another column.
    .dropna(subset=['entities.hashtags'])
)
hashtag_data.head()

Unnamed: 0,id,entities.hashtags
13,1330136987232301058,"{'text': 'Johnson', 'indices': [0, 8]}"
14,1330136978604560386,"{'text': 'BullyPatel', 'indices': [106, 117]}"
68,1330136850980286465,"{'text': 'LoanCharge', 'indices': [117, 128]}"
80,1330136822849134593,"{'text': 'BullyingisNEVERok', 'indices': [218,..."
89,1330136798501220362,"{'text': 'saynotobullying', 'indices': [102, 1..."


In [12]:
# Flatten each hashtag dict so that its `text` and `indices` become their own columns.
hashtag_data = hashtag_data.pipe(flatten_nested_dicts)
hashtag_data.head()

Unnamed: 0,id,entities.hashtags.text,entities.hashtags.indices
0,1330136987232301058,Johnson,"[0, 8]"
1,1330136978604560386,BullyPatel,"[106, 117]"
2,1330136850980286465,LoanCharge,"[117, 128]"
3,1330136822849134593,BullyingisNEVERok,"[218, 236]"
4,1330136798501220362,saynotobullying,"[102, 118]"


In [16]:
# Build a tweet id -> hashtag edge list.
# The keys of `renaming` are the columns to keep, the values are their new names.
renaming = {'id':'source', 'entities.hashtags.text':'target'}

edges = hashtag_data[list(renaming)].rename(columns=renaming)
edges.head()

Unnamed: 0,source,target
0,1330136987232301058,Johnson
1,1330136978604560386,BullyPatel
2,1330136850980286465,LoanCharge
3,1330136822849134593,BullyingisNEVERok
4,1330136798501220362,saynotobullying


In [19]:
# Collapse repeated (tweet, hashtag) pairs into a single weighted edge.
# `edges.get('weight', 1)` starts every raw pair at weight 1, but reuses the
# existing weights when the cell is run again on the already-aggregated frame,
# so a re-run no longer resets every weight to 1.
edges = (
    edges
    .assign(weight=edges.get('weight', 1))
    # Sum only `weight`, so no other column is swept into the aggregation.
    .groupby(['source', 'target'], as_index=False)['weight']
    .sum()
    .assign(edge_type='tagged_with')
)

edges.head()

Unnamed: 0,source,target,weight,edge_type
0,1329691747597869061,KayBurley,1,tagged_with
1,1329691771757088768,Civilservice,1,tagged_with
2,1329691785254359042,BullyingAwarenessWeek,1,tagged_with
3,1329691810722181120,KayBurley,1,tagged_with
4,1329691875645775872,KayBurley,1,tagged_with


In [20]:
# The edges here run from a tweet to a hashtag rather than from user to user, so
# the node list is built in two halves and every node is given a 'type' value
# designating whether it is a tweet or a tag.


# Create two separate node frames, one for each end of the edge list.
tweets = edges['source'].to_frame('id')
tags = edges['target'].to_frame('id')

# Set the type column to either 'tweet' or 'tag' and give each node a Label equal to its id.
tweets['type'] = 'tweet'
tweets['Label'] = tweets['id']
tags['type'] = 'tag'
tags['Label'] = tags['id']

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the
# same way (tweets first, then tags, original index labels kept).
nodes = pd.concat([tweets, tags]).drop_duplicates('id')

nodes.head()

Unnamed: 0,id,type,Label
0,1329691747597869061,tweet,1329691747597869061
1,1329691771757088768,tweet,1329691771757088768
2,1329691785254359042,tweet,1329691785254359042
3,1329691810722181120,tweet,1329691810722181120
4,1329691875645775872,tweet,1329691875645775872


In [21]:
# Tags were stacked after the tweets, so the tail of the node list shows tag nodes.
nodes.tail()

Unnamed: 0,id,type,Label
1846,satire,tag,satire
1862,Astroturfers,tag,Astroturfers
1867,BullyingisNEVERok,tag,BullyingisNEVERok
1883,ethics,tag,ethics
1884,resigns,tag,resigns


In [22]:
# Merge in the retweet count and favorite count for the tweet nodes.
# Tag ids are hashtag strings, so the left join finds no match for them.
# Keep a single row per tweet id on the right-hand side: a repeated id in `df`
# would otherwise repeat that node in the node list (the first occurrence wins).
node_data = df[['id', 'retweet_count', 'favorite_count']].drop_duplicates('id')
nodes = nodes.merge(node_data, on='id', how='left')
nodes.head()

Unnamed: 0,id,type,Label,retweet_count,favorite_count
0,1329691747597869061,tweet,1329691747597869061,321.0,0.0
1,1329691771757088768,tweet,1329691771757088768,0.0,6.0
2,1329691785254359042,tweet,1329691785254359042,0.0,2.0
3,1329691810722181120,tweet,1329691810722181120,321.0,0.0
4,1329691875645775872,tweet,1329691875645775872,321.0,0.0


In [23]:
nodes.tail()
# Tags don't have retweet or favorite counts: the left merge finds no matching
# tweet id for them, so those two columns are NaN on tag rows.

Unnamed: 0,id,type,Label,retweet_count,favorite_count
1657,satire,tag,satire,,
1658,Astroturfers,tag,Astroturfers,,
1659,BullyingisNEVERok,tag,BullyingisNEVERok,,
1660,ethics,tag,ethics,,
1661,resigns,tag,resigns,,


In [24]:
# Write the node and edge tables to CSV, leaving out the pandas index column.
nodes.to_csv(path_or_buf='tag_node_list.csv', index=False)
edges.to_csv(path_or_buf='tag_edge-list.csv', index=False)