In [6]:
from twython import Twython
import pandas as pd
import json

# Getting tweets from user timeline
# ---------------------------------


def connect_to_twitter(twitter_auth_path):
    """Build a Twython client from a JSON credentials file.

    Parameters
    ----------
    twitter_auth_path : str
        Path to a JSON file containing the keys 'APP_KEY', 'APP_SECRET',
        'OAUTH_TOKEN' and 'OAUTH_TOKEN_SECRET'.

    Returns
    -------
    Twython
        Client constructed from those four values, in that order.

    Raises
    ------
    KeyError
        If one of the four keys is missing from the file.
    """
    with open(twitter_auth_path, 'r') as auth_file:
        credentials = json.load(auth_file)
    # Fixed lookup order: app key, app secret, oauth token, oauth token secret.
    key_names = ('APP_KEY', 'APP_SECRET', 'OAUTH_TOKEN', 'OAUTH_TOKEN_SECRET')
    return Twython(*[credentials[name] for name in key_names])

# Picking interesting data from the Twitter API response
# ------------------------------------------------------

def get_tweet_data(tweet, tweet_data):
    """Copy the tweet-level fields of one tweet into ``tweet_data``.

    Parameters
    ----------
    tweet : dict
        A single tweet as returned by the Twitter API.
    tweet_data : dict
        Dictionary that is filled in place.

    Returns
    -------
    dict
        The same ``tweet_data`` object, with the keys 'text', 'hashtags',
        'link', 'tweet_id', 'created_at', 'retweet_count', 'favorite_count',
        'in_reply_to_status_id' and 'in_reply_to_screen_name' set.
    """
    tweet_data['text'] = tweet['text']
    tweet_data['hashtags'] = get_hashtags_string(tweet)
    tweet_data['link'] = get_expanded_url_of_link_in_tweet(tweet)
    tweet_data['tweet_id'] = tweet['id']
    # The remaining fields keep the name they have in the API response.
    same_name_fields = (
        'created_at',
        'retweet_count',
        'favorite_count',
        'in_reply_to_status_id',
        'in_reply_to_screen_name',
    )
    for field in same_name_fields:
        tweet_data[field] = tweet[field]
    return tweet_data

def get_tweet_author_data(tweet, tweet_data):
    """Copy the author-level fields of one tweet into ``tweet_data``.

    Parameters
    ----------
    tweet : dict
        A single tweet as returned by the Twitter API; the author is read
        from ``tweet['user']``.
    tweet_data : dict
        Dictionary that is filled in place.

    Returns
    -------
    dict
        The same ``tweet_data`` object with the author columns added.
    """
    author = tweet['user']
    # (output column, field of the user object), in output column order.
    column_to_user_field = (
        ('author', 'screen_name'),
        ('account_created_at', 'created_at'),
        ('author_description', 'description'),
        ('author_id', 'id'),
        ('author_location', 'location'),
        ('author_statuses_count', 'statuses_count'),
        ('author_followers_count', 'followers_count'),
        ('author_friends_count', 'friends_count'),
        ('author_favourites_count', 'favourites_count'),
        ('author_listed_count', 'listed_count'),
    )
    for column, user_field in column_to_user_field:
        tweet_data[column] = author[user_field]
    tweet_data['author_url'] = get_expanded_url_of_link_in_author_account(tweet)
    return tweet_data

# Helpers for parsing the Twitter API response
# --------------------------------------------

def get_hashtags_string(tweet):
    """Return the tweet's hashtags as one string.

    Each hashtag text is followed by a single space (so the result ends with
    a space). When the tweet has no hashtags, 'NO_HASHTAGS' is returned.
    """
    joined = ''.join(tag['text'] + ' ' for tag in tweet['entities']['hashtags'])
    return joined if joined else 'NO_HASHTAGS'

def get_expanded_url_of_link_in_tweet(tweet):
    """Return the expanded URL of the first link in the tweet, or None.

    Only the first entry of ``tweet['entities']['urls']`` is used.
    """
    links = tweet['entities']['urls']
    if not links:
        return None
    return links[0]['expanded_url']
    
def get_expanded_url_of_link_in_author_account(tweet):
    """Return the expanded URL from the author's profile, or None.

    Reads ``tweet['user']['entities']['url']['urls'][0]['expanded_url']``.
    A user object may lack the 'entities' or 'url' level (for example when
    the profile has no website set), so every level below ``tweet['user']``
    is optional here; a missing level yields None instead of a KeyError.

    Parameters
    ----------
    tweet : dict
        A single tweet as returned by the Twitter API; must contain 'user'.

    Returns
    -------
    str or None
        The expanded profile URL, or None when it is not available.
    """
    user_entities = tweet['user'].get('entities') or {}
    profile_urls = (user_entities.get('url') or {}).get('urls')
    if not profile_urls:
        return None
    return profile_urls[0].get('expanded_url')

# Save tweets dataset
# -------------------

def save_tweets_as_CSV(save_path, all_tweets_list, tweet_nr):
    """Write the tweets to ``save_path + '.csv'`` and print a confirmation.

    Parameters
    ----------
    save_path : str
        Output path without extension; '.csv' is appended.
    all_tweets_list : list of dict
        One dictionary per tweet; turned into a DataFrame.
    tweet_nr : int
        Tweet count shown in the printed message.

    Notes
    -----
    The file is tab-separated (``sep='\\t'``), UTF-8 encoded, and written
    without the index column.
    """
    csv_path = save_path + '.csv'
    pd.DataFrame(all_tweets_list).to_csv(csv_path, sep='\t', encoding='utf-8', index=False)
    print('{} tweets saved to {}'.format(tweet_nr, csv_path))

def save_tweets_as_JSON(save_path, all_tweets, tweet_nr):
    """Write the tweets to ``save_path + '.json'`` and print a confirmation.

    Parameters
    ----------
    save_path : str
        Output path without extension; '.json' is appended.
    all_tweets : dict
        JSON-serialisable mapping of tweets.
    tweet_nr : int
        Tweet count shown in the printed message.
    """
    json_path = save_path + '.json'
    with open(json_path, 'w') as json_file:
        json.dump(all_tweets, json_file)
    print('{} tweets saved to {}'.format(tweet_nr, json_path))

# Main
# ----

def get_twitter_user_timeline(twitter, account_name):
    """Return a cursor over the timeline of ``account_name``.

    Parameters
    ----------
    twitter : Twython
        Client whose ``cursor`` and ``get_user_timeline`` are used.
    account_name : str
        Screen name of the account to read.

    Returns
    -------
    Whatever ``twitter.cursor`` returns; it is requested with
    ``return_pages=True`` and ``count=200``.
    """
    cursor_options = {'screen_name': account_name, 'return_pages': True, 'count': 200}
    return twitter.cursor(twitter.get_user_timeline, **cursor_options)

def get_all_tweets_from_user_timelne(search_pages, save_path):
    """Collect all tweets from the paged timeline and save them as JSON and CSV.

    The tweets are always written to disk: after a complete download, and
    also when the download is cut short by an error, in which case whatever
    was collected up to that point is saved.

    Parameters
    ----------
    search_pages : iterable
        Iterable of pages, each page being an iterable of tweet dicts.
    save_path : str
        Output path without extension, passed to ``save_tweets_as_JSON``
        and ``save_tweets_as_CSV``.

    Returns
    -------
    None
    """
    # Initialised outside the try block so the finally clause can always use them.
    all_tweets, all_tweets_list = {}, []
    tweet_nr, page_nr = 0, 0
    try:
        for page in search_pages:
            page_nr += 1
            for tweet in page:
                tweet_data = get_tweet_data(tweet, {})
                tweet_data = get_tweet_author_data(tweet, tweet_data)
                # Count only after extraction succeeded, so tweet_nr always
                # equals the number of tweets that will actually be saved.
                tweet_nr += 1
                all_tweets[str(tweet_nr)] = tweet_data
                all_tweets_list.append(tweet_data)
            print('Pages done: {}, tweets done: {}'.format(page_nr, tweet_nr))
    except Exception as error:
        # Best effort: a failure mid-download must not lose the collected
        # tweets, but the cause is reported rather than silently dropped.
        print('Download stopped early ({!r}); saving the tweets collected so far.'.format(error))
    finally:
        # Runs on normal completion, on a handled error, and on
        # KeyboardInterrupt (which still propagates after saving).
        save_tweets_as_JSON(save_path, all_tweets, tweet_nr)
        save_tweets_as_CSV(save_path, all_tweets_list, tweet_nr)

def get_user_tweets(account_name, save_path, twitter_auth_path):
    """Download the timeline of ``account_name`` and hand it over for saving.

    Parameters
    ----------
    account_name : str
        Screen name of the account to read.
    save_path : str
        Output path without extension, passed to
        ``get_all_tweets_from_user_timelne``.
    twitter_auth_path : str
        Path to the JSON credentials file read by ``connect_to_twitter``.
    """
    timeline_pages = get_twitter_user_timeline(
        connect_to_twitter(twitter_auth_path), account_name
    )
    get_all_tweets_from_user_timelne(timeline_pages, save_path)

In [2]:
# Example: download the Microsoft Research timeline
# -------------------------------------------------

account_name = 'MSFTResearch'
twitter_auth_path = 'twitter_auth.json'
# Output files are named after the account; the extension is added on save.
save_path = account_name + '_timeline_tweets'
get_user_tweets(
    account_name=account_name,
    save_path=save_path,
    twitter_auth_path=twitter_auth_path,
)

Pages done: 1, tweets done: 200
Pages done: 2, tweets done: 400
Pages done: 3, tweets done: 600
Pages done: 4, tweets done: 800
Pages done: 5, tweets done: 999
Pages done: 6, tweets done: 1199
Pages done: 7, tweets done: 1399
Pages done: 8, tweets done: 1598
Pages done: 9, tweets done: 1798
Pages done: 10, tweets done: 1998
Pages done: 11, tweets done: 2198
Pages done: 12, tweets done: 2398
Pages done: 13, tweets done: 2598
Pages done: 14, tweets done: 2798
Pages done: 15, tweets done: 2997
Pages done: 16, tweets done: 3197
Pages done: 17, tweets done: 3205
3205 tweets saved to MSFTResearch_timeline_tweets.json
3205 tweets saved to MSFTResearch_timeline_tweets.csv


In [8]:
# Load the saved tweets back from the tab-separated file
# ------------------------------------------------------

# Widen the column display so the tweet text is not cut off in the table.
pd.options.display.max_colwidth = 300
# Keep only the tweet text and its retweet count.
tweets_df = pd.read_csv(save_path + ".csv", sep='\t').loc[:, ['text', 'retweet_count']]
print("Tweets dataset shape: {}".format(tweets_df.shape))
print("Tweets dataset:")
tweets_df.head(10)

Tweets dataset shape: (3205, 2)
Tweets dataset:


Unnamed: 0,text,retweet_count
0,.@jennwvaughan and @hannawallach will lead a webinar on responsible #AI on January 24. Learn how to make detecting… https://t.co/PQWaLjBwrs,12
1,"As machine learning becomes more prevalent, it’s critical to guard against bias. Next week, Microsoft researchers J… https://t.co/adNgyL5Djq",67
2,"RT @erichorvitz: ""Boffins!""@theregister @cervisiarius\r\n ""Computing boffins strip the fun out of satirical headlines"" https://t.co/G8rUsowsaT…",9
3,Celebrate International Day of Women and Girls in Science at our I Chose STEM event in San Francisco! Speakers will… https://t.co/CdcDqqmP0z,16
4,Push the state-of-the-art in software production as a researcher with @RiSE_MSR. Apply now if you're interested in… https://t.co/QvxL4nI0GU,9
5,University students are invited to identify and solve education pain-points through #AI for a chance to attend an a… https://t.co/5XTOzJ6CBF,15
6,The entertainment industry has long offered us a vision of personal assistants that not only meet our stated needs… https://t.co/BIjNRQeHTb,10
7,"Microsoft researchers, engineers &amp; collaborators are proving theorems about high performance, security critical cod… https://t.co/XphSCfR4nr",25
8,"RT @AndrewDGordon: Grad students, fancy an @MSFTResearchCam internship on PL applied to @msexcel spreadsheets - see https://t.co/LphKBlAu8N…",17
9,"""One of the joys of working at Microsoft Research is the ability to directly influence mainstream software technolo… https://t.co/Y1W4q8Hwuf",56
