# Twitter Retweet Network

In this tutorial, we will be creating a Twitter retweet network from a set of tweets. 

Note that publicly available Twitter data sets contain only the Tweet IDs (as per Twitter's policy), so you will have to "rehydrate" each tweet by using the Twitter API to request the full text. 

We'll also be using Tweepy, a Python wrapper that allows us to use the Twitter API. Of course, the Twitter API can be accessed directly as well, but Tweepy has a few extras that make getting Tweets just a bit easier. 

We provide two ways of visualizing the graph itself - one through igraph and the other by exporting the networkx-generated network to a `.gexf` file that can be opened in Gephi (note that this file extension is deprecated by Gephi, but it is what networkx supports and it still works).  

### Resources and Links
https://gephi.org/

http://docs.tweepy.org/en/latest/


In [None]:
import json
import networkx as nx
import wget
import igraph as ig
import tweepy

# Save the location of the Tweet JSON file
TWEET_JSON = "covid.json"

In [None]:
# create our tweet dictionary that filters out all the information we are not interested in
# a cleaned tweet extracts and contains the following attributes from the raw tweet: 
# the tweet's own: id, user, created_at, text
# if the tweet is a retweet, the dictionary also contains the retweeted tweet's: rt_text, rt_user, rt_id
# along with a boolean value is_rt that indicates whether the current tweet is a retweet. 

def clean_tweet(tweet):
    """Reduce a raw tweet dictionary to the few fields this tutorial uses.

    The result always holds "id", "user" (the author's screen name),
    "created_at" and the boolean "is_rt". "text" is added when the raw tweet
    has a "full_text" or a "text" entry ("full_text" wins when both exist).
    When the raw tweet carries a "retweeted_status" entry, "rt_user" and
    "rt_id" describe the retweeted tweet, and "rt_text" is filled in by the
    same "full_text"-before-"text" rule.
    """
    text_keys = ("full_text", "text")  # listed in order of preference

    cleaned = {
        "id": tweet['id'],
        "user": tweet['user']['screen_name'],
        "created_at": tweet['created_at'],
    }
    for key in text_keys:
        if key in tweet:
            cleaned["text"] = tweet[key]
            break

    # not a retweet: nothing more to extract
    if 'retweeted_status' not in tweet:
        cleaned["is_rt"] = False
        return cleaned

    retweeted = tweet['retweeted_status']
    cleaned["is_rt"] = True
    cleaned["rt_user"] = retweeted['user']['screen_name']
    cleaned["rt_id"] = retweeted['id']
    for key in text_keys:
        if key in retweeted:
            cleaned["rt_text"] = retweeted[key]
            break
    return cleaned
            
# load the raw tweets into a list: the file holds one JSON-encoded tweet per line
with open(TWEET_JSON) as tweet_file:
    tweet_data = [json.loads(raw_line) for raw_line in tweet_file]

# filter all of the raw tweets by passing each one through clean_tweet,
# which keeps only the fields we are interested in
filtered_data = [clean_tweet(raw_tweet) for raw_tweet in tweet_data]

In [None]:
# displaying filtered_data, the list of dictionaries produced by clean_tweet
filtered_data

In [None]:
# printing the first element of the unfiltered data as an example
print(tweet_data[0])
print ("---")
# printing the second element, used here as the example of a raw retweet
# (a raw retweet carries a "retweeted_status" entry — check that your data file has one here)
print(tweet_data[1])

In [None]:
# printing the first element of the filtered data
print(filtered_data[0])
print ("---")
# printing the second element, used here as the example of a cleaned retweet
# (whether it really is a retweet depends on the data file — check its "is_rt" value)
print(filtered_data[1])


In [None]:
# how many total tweets we have (one cleaned entry per line of the input file)
len(filtered_data)

### Getting New Tweets

We now walk through how to use the search and stream Twitter APIs. Before you can do so, you'll need to create a developer account and create/register your application with Twitter to get the necessary authentication keys. 

https://developer.twitter.com/

In [None]:
# Twitter API credentials, passed to the Tweepy OAuth handler further down.
# Fill in the four values issued for your registered application
# (and avoid sharing a notebook that still contains them).
info = {
    "api_key": "",
    "api_secret": "",
    "access_token": "",
    "access_secret": "",
}

In [None]:
# Let's use the Twitter Search API to look up past tweets
# We save our tweets to a file called "covid_search.json" (one JSON object per line)
# NOTE: this cell is written for Tweepy 3.x. Tweepy 4 renamed API.search to
# API.search_tweets and removed wait_on_rate_limit_notify (see the Tweepy changelog).
MAX_SEARCH_TWEETS = 100  # upper bound on how many tweets this cell downloads

auth = tweepy.OAuthHandler(info['api_key'], info['api_secret'])
auth.set_access_token(info['access_token'], info['access_secret'])

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# count=100 is the page size of each request; items(limit) stops the cursor
# after MAX_SEARCH_TWEETS tweets (the earlier manual counter let 102 through)
search_results = tweepy.Cursor(api.search,
                               q="covid",
                               count=100,
                               result_type="recent",
                               include_entities=True,
                               lang="en").items(MAX_SEARCH_TWEETS)

# open the output file once; append mode keeps tweets saved by earlier runs
with open("covid_search.json", "a") as search_f:
    for tweet in search_results:
        search_f.write(json.dumps(tweet._json))
        search_f.write('\n')

In [None]:
# Let's use the Twitter Stream API to get tweets in real time
# We save our tweets to a file called "covid_stream.json"
class MyStreamListener(tweepy.StreamListener):
    """StreamListener subclass with our own on_status and on_error logic."""

    def on_status(self, status):
        """Print the raw JSON of an incoming status and append it to covid_stream.json."""
        raw_status = status._json
        print(raw_status)
        # one JSON object per line, so the file can be read back line by line
        with open("covid_stream.json", "a+") as stream_f:
            stream_f.write(json.dumps(raw_status) + '\n')

    def on_error(self, status_code):
        """Print the error code and return False.

        Per the Tweepy streaming documentation, returning False from
        on_error disconnects the stream.
        """
        print("Error detected!", status_code, sep="\n")
        return False


# create the listener, attach it to a stream that reuses the API's credentials,
# and start receiving tweets that match the tracked keyword
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth=api.auth, listener=myStreamListener)
myStream.filter(track=['covid'])

In [None]:
# load the raw tweets found from the search and stream apis into the same list for processing
# (search results first, then streamed tweets; each file holds one JSON object per line)
new_tweet_data = []
for source_path in ("covid_search.json", "covid_stream.json"):
    with open(source_path) as source_f:
        new_tweet_data.extend(json.loads(raw_line) for raw_line in source_f)

# filter all of the raw tweets by passing each one through clean_tweet,
# which keeps only the fields we are interested in
new_filtered_data = [clean_tweet(raw_tweet) for raw_tweet in new_tweet_data]

In [None]:
# example of one of our new tweets (the first cleaned entry of new_filtered_data)
new_filtered_data[0]

In [None]:
# Append the two lists together to form a full list to send into our graph:
# the tweets from the original data set first, followed by the newly collected ones
total_filtered_data = filtered_data + new_filtered_data

In [None]:
# construct a directed graph for retweet graph: one node per user,
# one edge per (retweeting user -> retweeted user) pair
G = nx.DiGraph()

for tweet in total_filtered_data:
    # every tweet contributes its author as a node
    author = tweet["user"]
    G.add_node(author)
    print(author)

    # a retweet additionally contributes the user who wrote the retweeted tweet,
    # linked by an edge pointing from the current user to that user
    if tweet["is_rt"]:
        retweeted_user = tweet["rt_user"]
        G.add_node(retweeted_user)
        print(retweeted_user)
        G.add_edge(author, retweeted_user)
    print("--")

In [None]:
# printing out the number of nodes (users) in our graph
print (len(G.nodes()))
# printing out the number of edges/links in our graph
# (G is a DiGraph, so repeated retweets between the same pair of users count as one edge)
print (len(G.edges()))

In [None]:
# writing our graph to a GEXF file ("covid-total.gexf") that can be opened in Gephi
nx.write_gexf(G, "covid-total.gexf")

In [None]:
# Visualizing our retweet network
# This code was borrowed from the Plotly documentation
# https://plot.ly/python/v3/igraph-networkx-comparison/

# rebuild the networkx graph as a directed igraph graph from its edge list
# NOTE(review): only G.edges() is passed in, so users without any retweet edge
# presumably do not appear in G2 — confirm if isolated users should be drawn
G2 = ig.Graph.TupleList(G.edges(), directed=True)
# vertex names, one per vertex of G2
labels=list(G2.vs['name'])
# number of vertices in G2
N = len(labels)
# every edge as a tuple; the plotting code indexes the layout with its two entries
E = [e.tuple for e in G2.es]
# vertex positions computed with the Kamada-Kawai layout algorithm
layt=G2.layout('kk') #kamada-kawai layout

In [None]:
# Below is code to visualize the graph. Treat as blackbox for now!
import chart_studio.plotly  as py
from plotly.graph_objs import *

# node coordinates: layt[k] holds the (x, y) position of vertex k
Xn = [layt[k][0] for k in range(N)]
Yn = [layt[k][1] for k in range(N)]

# edge coordinates: every edge contributes its two end points followed by None,
# which separates it from the next edge inside the single 'lines' trace
Xe = []
Ye = []
for src, dst in E:
    Xe.extend([layt[src][0], layt[dst][0], None])
    Ye.extend([layt[src][1], layt[dst][1], None])

# trace1 draws the edges as thin grey lines without hover information
trace1 = Scatter(x=Xe,
                 y=Ye,
                 mode='lines',
                 line=dict(color='rgb(210,210,210)', width=1),
                 hoverinfo='none'
                 )
# trace2 draws the users as markers; hovering shows the user's name
trace2 = Scatter(x=Xn,
                 y=Yn,
                 mode='markers',
                 name='ntw',
                 marker=dict(symbol='circle-dot',
                             size=5,
                             color='#6959CD',
                             line=dict(color='rgb(50,50,50)', width=0.5)
                             ),
                 text=labels,
                 hoverinfo='text'
                 )

# shared settings for both axes (defined once; it used to be duplicated)
axis = dict(showline=False,  # hide axis line, grid, ticklabels and  title
            zeroline=False,
            showgrid=False,
            showticklabels=False,
            title=''
            )

width = 800
height = 800
# The axes and margin are given as plain dicts. The earlier form called
# layout.XAxis / layout.YAxis / layout.Margin inside the very statement that
# rebinds the name `layout`, which only worked through the star import above.
layout = Layout(
    title="Twitter Retweet Network",
    font=dict(size=12),
    showlegend=False,
    autosize=False,
    width=width,
    height=height,
    xaxis=axis,
    yaxis=axis,
    margin=dict(
        l=40,
        r=40,
        b=85,
        t=100,
    ),
    hovermode='closest',
    annotations=[
        dict(
            showarrow=False,
            text='',
            xref='paper',
            yref='paper',
            x=0,
            y=-0.1,
            xanchor='left',
            yanchor='bottom',
            font=dict(
                size=14
            )
        )
    ]
)

data = [trace1, trace2]
fig = Figure(data=data, layout=layout)

In [None]:
# display the retweet network figure
fig.show()