In [1]:
from twython import Twython
import pandas as pd
import json

# Getting tweets from twitter search (works with '@' and '#' prefix)
# ------------------------------------------------------------------

def connect_to_twitter(twitter_auth_path):
    """Create a Twython client from credentials stored in a JSON file.

    Parameters
    ----------
    twitter_auth_path : str
        Path to a JSON file containing the keys 'APP_KEY', 'APP_SECRET',
        'OAUTH_TOKEN' and 'OAUTH_TOKEN_SECRET'.

    Returns
    -------
    The object returned by ``Twython(app_key, app_secret, oauth_token,
    oauth_token_secret)``.

    Raises
    ------
    KeyError
        If any of the four credential keys is missing from the file.
    """
    with open(twitter_auth_path, 'r') as auth_file:
        credentials = json.load(auth_file)

    # Order matters: the values are handed to Twython positionally.
    credential_keys = ('APP_KEY', 'APP_SECRET', 'OAUTH_TOKEN', 'OAUTH_TOKEN_SECRET')
    return Twython(*[credentials[key] for key in credential_keys])

def get_tweet_data(tweet, tweet_data):
    """Copy tweet-level fields from a status dict into ``tweet_data``.

    Parameters
    ----------
    tweet : dict
        Status dict providing 'text', 'id', 'created_at', 'retweet_count',
        'favorite_count', 'in_reply_to_status_id' and
        'in_reply_to_screen_name', plus whatever ``get_hashtags_string``
        reads from it.
    tweet_data : dict
        Dict that is filled in place.

    Returns
    -------
    dict
        The same ``tweet_data`` object that was passed in.
    """
    tweet_data['text'] = tweet['text']
    tweet_data['hashtags'] = get_hashtags_string(tweet)

    # (key written to tweet_data, key read from tweet), in output order.
    field_map = (
        ('tweet_id', 'id'),
        ('created_at', 'created_at'),
        ('retweet_count', 'retweet_count'),
        ('favorite_count', 'favorite_count'),
        ('in_reply_to_status_id', 'in_reply_to_status_id'),
        ('in_reply_to_screen_name', 'in_reply_to_screen_name'),
    )
    for target_key, source_key in field_map:
        tweet_data[target_key] = tweet[source_key]
    return tweet_data

def get_tweet_author_data(tweet, tweet_data):
    """Copy author fields from ``tweet['user']`` into ``tweet_data``.

    Parameters
    ----------
    tweet : dict
        Status dict whose 'user' entry provides 'screen_name', 'created_at',
        'description', 'id', 'location', 'statuses_count',
        'followers_count', 'friends_count', 'favourites_count' and
        'listed_count'.
    tweet_data : dict
        Dict that is filled in place.

    Returns
    -------
    dict
        The same ``tweet_data`` object that was passed in.
    """
    # (key written to tweet_data, key read from tweet['user']), in output order.
    author_fields = (
        ('author', 'screen_name'),
        ('account_created_at', 'created_at'),
        ('author_description', 'description'),
        ('author_id', 'id'),
        ('author_location', 'location'),
        ('author_statuses_count', 'statuses_count'),
        ('author_followers_count', 'followers_count'),
        ('author_friends_count', 'friends_count'),
        ('author_favourites_count', 'favourites_count'),
        ('author_listed_count', 'listed_count'),
    )
    for target_key, user_key in author_fields:
        tweet_data[target_key] = tweet['user'][user_key]
    return tweet_data

def get_hashtags_string(tweet):
    """Return the tweet's hashtag texts as one space-terminated string.

    Each entry of ``tweet['entities']['hashtags']`` contributes its 'text'
    value followed by a single space, so a non-empty result ends with a
    trailing space. When that concatenation is empty the placeholder
    'NO_HASHTAGS' is returned instead.
    """
    tag_entries = tweet['entities']['hashtags']
    joined_tags = ''.join(entry['text'] + ' ' for entry in tag_entries)
    return joined_tags or 'NO_HASHTAGS'

def get_tweets_from_search(twitter, max_attempts, max_tweets_to_get, search_phrase):
    """Page through ``twitter.search`` results and collect the raw statuses.

    Parameters
    ----------
    twitter : object
        Client exposing ``search(**kwargs)`` that returns a mapping with a
        'statuses' list and a 'search_metadata' mapping.
    max_attempts : int
        Maximum number of search requests to issue.
    max_tweets_to_get : int
        Upper bound on the number of statuses returned. Paging stops as soon
        as this many have been gathered and the result is trimmed to it.
    search_phrase : str
        Query passed as ``q`` to every request.

    Returns
    -------
    list
        At most ``max_tweets_to_get`` status dicts, in the order received.

    Notes
    -----
    Paging ends early when the response carries no usable
    ``search_metadata['next_results']`` value. Errors raised by
    ``twitter.search`` itself are not caught here.
    """
    tweets_data = []
    next_max_id = None
    for attempt_nr in range(max_attempts):
        # '>=' so that no extra request is made once the limit is reached exactly.
        if len(tweets_data) >= max_tweets_to_get:
            break

        # Same parameters on every request; only follow-up pages carry max_id.
        search_kwargs = {'q': search_phrase, 'include_entities': 'true', 'count': '100'}
        if next_max_id is not None:
            search_kwargs['max_id'] = next_max_id
        search_results = twitter.search(**search_kwargs)

        tweets_data.extend(search_results['statuses'])
        print('Loop: {} finished. Overall athered tweets: {}.'.format(attempt_nr, len(tweets_data)))

        # 'next_results' is a query string such as '?max_id=123&q=...'; when it
        # is missing or has no max_id there is nothing more to fetch.
        try:
            next_results = search_results['search_metadata']['next_results']
        except KeyError:
            break
        if not next_results or 'max_id=' not in next_results:
            break
        next_max_id = next_results.split('max_id=')[1].split('&')[0]

    # The last page may overshoot the limit; never return more than requested.
    return tweets_data[:max_tweets_to_get]

def return_as_df(all_tweets_list):
    """Build a DataFrame from a list of records and show a short preview.

    The first five rows and the frame's shape are passed to ``display``
    (provided by the notebook environment), then the full frame is returned.
    """
    frame = pd.DataFrame(all_tweets_list)
    preview, dimensions = frame.head(5), frame.shape
    display(preview, dimensions)
    return frame

def save_tweets_as_CSV(save_path, tweet_df):
    """Write ``tweet_df`` to ``save_path`` and report how many rows were saved.

    The file is tab-separated, UTF-8 encoded and written without the index
    column. A confirmation line with the row count and path is printed.
    """
    saved_count = len(tweet_df)
    csv_options = {'sep': '\t', 'encoding': 'utf-8', 'index': False}
    tweet_df.to_csv(save_path, **csv_options)
    confirmation = '{} tweets saved to {}'
    print(confirmation.format(saved_count, save_path))

def get_tweets_by_search_phrase(s, max_tweets_to_get, max_attempts, twitter_auth_path):
    """Search Twitter for a phrase, flatten the results and save them to CSV.

    Parameters
    ----------
    s : str
        Search phrase. It is used both as the query and as the prefix of the
        output file name (``s + '_tweets.csv'``).
    max_tweets_to_get : int
        Forwarded to ``get_tweets_from_search``.
    max_attempts : int
        Forwarded to ``get_tweets_from_search``.
    twitter_auth_path : str
        Forwarded to ``connect_to_twitter``.

    Returns
    -------
    The value returned by ``return_as_df`` for the extracted records.
    """
    # Use the argument itself; a notebook-level variable named
    # ``search_phrase`` must not decide what is searched or where it is saved.
    search_phrase = s

    twitter = connect_to_twitter(twitter_auth_path)
    tweets_data = get_tweets_from_search(twitter, max_attempts, max_tweets_to_get, search_phrase)

    extracted_tweets_data = []
    for tweet in tweets_data:
        # One flat record per tweet: tweet fields first, then author fields.
        tweet_data = get_tweet_data(tweet, {})
        tweet_data = get_tweet_author_data(tweet, tweet_data)
        extracted_tweets_data.append(tweet_data)

    tweet_df = return_as_df(extracted_tweets_data)

    save_path = search_phrase + '_tweets.csv'
    save_tweets_as_CSV(save_path, tweet_df)

    return tweet_df

In [6]:
# Usage example: fetch tweets matching a phrase, preview them and save them to CSV.

search_phrase = 'science'  # query text; the output file is named '<phrase>_tweets.csv'
max_tweets_to_get = 50000  # tweet-count limit that ends the paging loop
max_attempts = 50  # maximum number of search requests (each asks for 100 tweets)
twitter_auth_path = 'twitter_auth.json'  # JSON file with APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET
tweet_df = get_tweets_by_search_phrase(search_phrase, max_tweets_to_get, max_attempts, twitter_auth_path)

Loop: 0 finished. Overall athered tweets: 100.
Loop: 1 finished. Overall athered tweets: 200.
Loop: 2 finished. Overall athered tweets: 300.
Loop: 3 finished. Overall athered tweets: 400.
Loop: 4 finished. Overall athered tweets: 500.
Loop: 5 finished. Overall athered tweets: 600.
Loop: 6 finished. Overall athered tweets: 700.
Loop: 7 finished. Overall athered tweets: 800.
Loop: 8 finished. Overall athered tweets: 900.
Loop: 9 finished. Overall athered tweets: 1000.
Loop: 10 finished. Overall athered tweets: 1100.
Loop: 11 finished. Overall athered tweets: 1200.
Loop: 12 finished. Overall athered tweets: 1300.
Loop: 13 finished. Overall athered tweets: 1400.
Loop: 14 finished. Overall athered tweets: 1500.
Loop: 15 finished. Overall athered tweets: 1600.
Loop: 16 finished. Overall athered tweets: 1700.
Loop: 17 finished. Overall athered tweets: 1800.
Loop: 18 finished. Overall athered tweets: 1900.
Loop: 19 finished. Overall athered tweets: 2000.
Loop: 20 finished. Overall athered twee

Unnamed: 0,account_created_at,author,author_description,author_favourites_count,author_followers_count,author_friends_count,author_id,author_listed_count,author_location,author_statuses_count,created_at,favorite_count,hashtags,in_reply_to_screen_name,in_reply_to_status_id,retweet_count,text,tweet_id
0,Tue Aug 28 15:12:17 +0000 2018,epicypher,Cutting edge tools & services for epigenetics ...,167,123,340,1034458657105371137,1,"Durham, NC",277,Wed Jan 16 13:41:15 +0000 2019,0,NO_HASHTAGS,,,0,Comics are a great way to make science relatab...,1085532435348111360
1,Sun Jun 08 22:21:38 +0000 2014,MelissaL____,🇲🇽,63463,549,230,2555587788,4,nj,35026,Wed Jan 16 13:41:14 +0000 2019,0,NO_HASHTAGS,,,1222,RT @calacademy: Romeo—once thought to be the l...,1085532431430635525
2,Wed Feb 08 09:21:05 +0000 2017,SrNikhil7,Research Scholar @ IIT(BHU) Varanasi,1198,22,176,829258741795151872,0,"Varanasi, India",289,Wed Jan 16 13:41:14 +0000 2019,0,hikeresearchfellowship blackday4anusandhan 16j...,,,16,RT @Kali3Prasad: Research Scholars are in Jail...,1085532430801555456
3,Mon Feb 06 06:59:10 +0000 2012,edwardkeithdmd,https://t.co/7M360M0oqA.,19,59,77,484542042,9,"Makati, Philippines",1361,Wed Jan 16 13:41:13 +0000 2019,0,NO_HASHTAGS,,,0,Happy people! M.S. Ortho new year party 🎆😁 @ D...,1085532428129718272
4,Wed Jan 22 04:42:20 +0000 2014,Saigramam,Official Page Saigramam Kerala,0,790,766,2304224569,1,Kerala,1232,Wed Jan 16 13:41:13 +0000 2019,0,NO_HASHTAGS,,,0,Leaders Trainer Sri .Subash .Swamis Student .\...,1085532427932614657


(5000, 18)

5000 tweets saved to science_tweets.csv
