In [None]:
import tweepy, csv, time, re, urllib.error, os
import numpy as np
from urllib.request import urlopen
from datetime import date
import pandas as pd
import logging

# SETUP

The notebook was running on a remote machine. We created logs to be able to check the progress.

In [None]:
# Date stamp (YYYY-MM-DD) reused in log lines and in output folder/file names.
today = str(date.today())

# Progress is written to a file so a long remote run can be checked on.
logging.basicConfig(
    filename='progress.log',
    format='%(asctime)s %(message)s',
    datefmt='%Y/%m/%d %I:%M:%S %p',
)

Get your [own keys and secrets](https://apps.twitter.com/)

In [None]:
# Credentials are read from the environment so real secrets never have to be
# saved inside the notebook. The 'xxx' placeholders remain as the fallback, so
# the cell still runs (and still needs editing) when the variables are unset.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'xxx')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'xxx')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'xxx')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'xxx')

In [None]:
# Create the Tweepy API client: build an OAuth handler from the consumer
# key/secret, attach the user access token, and wrap the handler in an API
# object. `api` is the client the later search / status-lookup cells call.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# FUNCTIONS

The Search API allows only a limited number of calls per 15-minute window. Once the limit was reached, we had to pause the script until the window reset.

In [5]:
def handle_errors(cursor):
    """Yield items from a Tweepy item iterator, pausing when the API errors.

    Args:
        cursor: an object with a ``next()`` method that returns the next item
            and raises ``StopIteration`` when exhausted (e.g. the result of
            ``tweepy.Cursor(...).items()``).

    Yields:
        Each item produced by ``cursor.next()``.

    Note:
        Every ``tweepy.TweepError`` (not only rate-limit errors) is logged and
        followed by a 16-minute sleep before the same cursor is tried again.
    """
    while True:
        try:
            item = cursor.next()
        except StopIteration:
            # Under PEP 479 a StopIteration escaping a generator body is
            # re-raised as RuntimeError, so exhaustion must end the generator
            # explicitly instead of propagating.
            return
        except tweepy.TweepError as e:
            logging.warning(e)
            # sleep the script for 16 minutes (just past the 15-minute window)
            time.sleep(16 * 60)
        else:
            yield item

The Twitter API exposes only the number of likes on a post, not the list of users who liked it. We therefore used a scraper [based on this StackOverflow issue](http://stackoverflow.com/questions/28982850/twitter-api-getting-list-of-users-who-favorited-a-status).

In [None]:
def get_user_ids_of_post_likes(post_id):
    """Scrape the screen names of the users who liked a tweet.

    Args:
        post_id: ID of the tweet; it is converted with ``str()`` and appended
            to the ``favorited_popup`` URL.

    Returns:
        A list of unique screen names (in arbitrary order, because duplicates
        are removed through a set), or ``False`` when the request fails with
        an ``urllib.error.HTTPError``.
    """
    url = 'https://twitter.com/i/activity/favorited_popup?id=' + str(post_id)

    try:
        # The context manager closes the HTTP response instead of leaking it.
        with urlopen(url) as response:
            raw = response.read()
    except urllib.error.HTTPError:
        return False

    # str() of a bytes object is its b'...' repr, in which every backslash of
    # the payload shows up doubled. The pattern and prefix below match that
    # repr (two literal backslashes followed by a double quote).
    payload = str(raw)
    prefix = 'data-screen-name=\\\\\"'
    found_names = re.findall(r'data-screen-name=\\\\\"[a-zA-Z0-9_]+', payload)

    return list({match.replace(prefix, "") for match in found_names})

API reference: https://dev.twitter.com/overview/api/tweets

In [None]:
def tweet_details(tweet, df, row):
    """Record one tweet in ``df`` and in the module-level bookkeeping lists.

    Replies are skipped entirely. For any other tweet the function
    appends the author's screen name to ``users``, the tweet ID to
    ``statuses`` and, for re-tweets, the original tweet ID to ``rts``.
    It then writes one value per entry of ``columns`` into ``df`` at
    index label ``row``.

    Args:
        tweet: status object exposing the attributes read below
            (``id``, ``id_str``, ``user``, ``favorite_count``, ...).
        df: frame-like object that is written through ``df.loc[row, column]``.
        row: index label of the row to fill.

    Returns:
        None.
    """
    global users
    global statuses
    global rts
    global columns

    # if tweet is a reply, get outta here
    if tweet.in_reply_to_status_id_str is not None:
        return

    # who favorited? = scraper. The column always holds a ";"-joined string so
    # every cell has the same type (empty string when nobody is known).
    favorite_users = ""
    if tweet.favorite_count > 0:
        liked_by = get_user_ids_of_post_likes(tweet.id)
        # The scraper returns False on an HTTP error; treat that as "unknown".
        if liked_by:
            # Drop the author's own like. Filtering (instead of .remove())
            # does not fail when the author is not among the likers.
            author = tweet.user.screen_name
            favorite_users = ";".join(name for name in liked_by if name != author)

    users.append(tweet.user.screen_name)
    statuses.append(tweet.id_str)

    # Only re-tweets carry a `retweeted_status`; everything else gets blanks.
    retweeted = getattr(tweet, "retweeted_status", None)
    if retweeted is not None:
        original_tweet_id = retweeted.id_str
        original_tweet_user = retweeted.user.screen_name
        # Remember the original status so it can be fetched later. Blank IDs
        # are not recorded, since they could never be looked up.
        rts.append(original_tweet_id)
    else:
        original_tweet_id = ""
        original_tweet_user = ""

    values = [tweet.id_str, original_tweet_id, original_tweet_user, tweet.created_at,
              tweet.user.id_str, tweet.user.screen_name,
              tweet.user.followers_count, tweet.user.friends_count, tweet.user.description,
              tweet.user.statuses_count, tweet.text, tweet.favorite_count, favorite_users,
              tweet.retweet_count, str(tweet.entities)]

    for c, v in zip(columns, values):
        df.loc[row, c] = v
    


## GET all recent tweets on keyword

- Ensure all parameters are properly URL encoded.
- Limit your searches to 10 keywords and operators.
- The Search API is not a complete index of all Tweets, but an index of recent Tweets. The index includes between 6-9 days of Tweets.

[query operators](https://dev.twitter.com/rest/public/search)

In [None]:
# Choose your keywords: one search term per entry.
keywords = [
    "klimaat",
    "co2",
    "climate",
    "IPCC",
    "windmolen",
    "zeespiegel",
    "turbine",
]

## GET all the tweets!

In [None]:
# Bookkeeping lists that tweet_details() appends to while tweets are collected.
users = []
statuses = []
rts = []

columns = ["tweet_id","original_tweet_id","original_tweet_user","date_time","user_id","username","followers",
           "friends","user_description","statuses","text",
           "nr_likes","users_likes","nr_RT","entities"]

df1 = pd.DataFrame(columns=columns)

# One search for all keywords, OR-ed together.
keyword_search = tweepy.Cursor(api.search,
                               q=" OR ".join(keywords),
                               # `count` is the v1.1 page-size parameter (max 100);
                               # the old v1 name `rpp` is not a v1.1 parameter.
                               count=100,
                               result_type="recent",
                               include_entities=True,
                               lang="nl").items()  # you can change the language here

# handle_errors() pauses the run whenever the API raises an error.
for n, tweet in enumerate(handle_errors(keyword_search)):
    tweet_details(tweet, df1, n)

The Search API returns several types of tweets: **tweets, replies** and **re-tweets**.<br>**Replies** that contain the keywords were often off-topic, so we excluded them from the analysis.<br>**Re-tweets** were a *reaction*. We used them not only to draw connections between the users, but also to get the original tweets (if not already in the data set).

In [None]:
# Unique status IDs: the reference list later membership checks run against.
statuses = [*set(statuses)]
# Unique original-tweet IDs gathered from re-tweets; used to collect the
# re-tweet parents.
rts = [*set(rts)]

In [None]:
df2 = pd.DataFrame(columns=columns)

for n, r in enumerate(rts):
    # get all original Tweets to the collected re-tweets; originals that are
    # already in the data set are skipped
    if r in statuses:
        continue

    logging.warning("%s : %s" % (today, n))

    # Retry the same ID after a rate-limit pause instead of silently losing it.
    while True:
        try:
            tweet = api.get_status(r)

        # wait for 16 minutes after reaching the rate limit, then try again
        # (this handler has to come before the more general TweepError one)
        except tweepy.RateLimitError as e:
            logging.warning("%s rate limited (%s), retrying after sleep" % (r, e.api_code))
            time.sleep(16 * 60)
            continue

        # some tweets are protected from retrieving so they were not collected
        except tweepy.TweepError as e:
            logging.warning("%s not processed because of %s" % (r, e.api_code))

        else:
            tweet_details(tweet, df2, n)

        break

The Twitter API does not provide any information about **replies** to a Tweet (their number or the usernames of the repliers).<br> It does however mention if a particular Tweet was a reply to another tweet.<br> We therefore searched for ALL the replies to ALL the users in the data set. Then we filtered them to keep only the ones that reply to Twitter statuses in the data set. This part took the longest (about a day).

In [None]:
r_columns = ["tweet_id","date_time","user_id","username","response_to_user_id","response_to_username","response_to_status_id"]

df3 = pd.DataFrame(columns=r_columns)

count = 0
row = 0

# Each user only needs to be searched once; dict.fromkeys() removes duplicates
# while keeping first-seen order.
unique_users = list(dict.fromkeys(users))

# Fixed-size slices keep every query at no more than 10 "to:" operators and
# also work when there are fewer than 10 users (or none at all).
users_per_query = 10

for start in range(0, len(unique_users), users_per_query):
    # collect replies to Tweets
    us = unique_users[start:start + users_per_query]
    count += len(us)
    logging.warning(count)  # progress: number of users searched so far
    searchterm = " OR ".join(["to:" + x for x in us])

    reply_search = tweepy.Cursor(api.search,
                                 q=searchterm,
                                 # `count` is the v1.1 page-size parameter (max 100);
                                 # the old v1 name `rpp` is not a v1.1 parameter.
                                 count=100,
                                 result_type="recent",
                                 include_entities=True,
                                 lang="nl").items()  # you can change the language here

    for tweet in handle_errors(reply_search):

        # keep only replies to statuses that are part of the data set
        if tweet.in_reply_to_status_id_str in statuses:
            row += 1
            r_values = [tweet.id_str, tweet.created_at, tweet.user.id_str, tweet.user.screen_name,
                        tweet.in_reply_to_user_id_str, tweet.in_reply_to_screen_name,
                        tweet.in_reply_to_status_id_str]

            for c, v in zip(r_columns, r_values):
                df3.loc[row, c] = v

# MERGING data

Merged original re-tweeted tweets with the main data set

In [None]:
# Stack the keyword matches and the fetched re-tweet originals. pd.concat is
# used because DataFrame.append no longer exists in pandas 2.x. As before,
# reset_index() keeps the old row labels in an "index" column.
data = pd.concat([df1, df2]).reset_index()

We created new columns from the *replies* data to add to the main data set: *number of responses* and *users that responded*

In [None]:
# Group the collected replies by the status they respond to.
replies_by_status = df3.groupby("response_to_status_id")

# number of replies per status
nr_responses = replies_by_status["tweet_id"].count()

# ";"-joined usernames of the repliers per status
users_responses = replies_by_status["username"].apply(lambda names: ";".join(names.tolist()))

# put both statistics side by side, one row per replied-to status
calc = pd.concat([nr_responses, users_responses], axis=1)
calc.columns = ["nr_responses", "users_responses"]
calc = calc.reset_index()

In [None]:
# Left-join the reply statistics onto the tweets (tweets without a match keep
# missing values), then drop the join key that duplicates "tweet_id".
dataset = (
    data
    .merge(calc, left_on="tweet_id", right_on="response_to_status_id", how="left")
    .drop("response_to_status_id", axis=1)
)

## BACKUP RAW DATASETS

In [None]:
# exist_ok=True replaces the check-then-create pair: there is no window in
# which the folder can appear between the check and the creation, and
# re-running the cell on the same day is harmless.
os.makedirs(today, exist_ok=True)

# One CSV per raw frame plus the merged data set, all inside today's folder.
backups = [
    ("sleutelwoorden_matches", df1),
    ("retweets", df2),
    ("replies", df3),
    ("klimaat_tweets", dataset),
]

for label, frame in backups:
    frame.to_csv("%s/%s-%s.csv" % (today, today, label))