In [2]:
import datetime
from time import sleep
import os
import tweepy as tw
import pandas as pd
import csv

In [None]:
# API Keys

# Twitter API Keys
# Credentials are read from the environment so that real secrets never have to
# be typed into (and committed with) the notebook. The original placeholder
# literals are kept as fallbacks, so behaviour is unchanged when the variables
# are not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'consumer_key')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'consumer_secret')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'access_token')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'access_token_secret')

In [4]:
# Twitter API Setup
# Build the OAuth handler from the consumer credentials, then attach the
# user-level access token pair to it.
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Client used by every request below; both rate-limit flags are switched on.
api = tw.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

In [15]:
# Create Dataframe to store tweets
# Starts empty; one column per field recorded for each collected tweet.
new_tweets_df = pd.DataFrame(
    columns=[
        'UserID',
        'TweetID',
        'Tweet',
        'CreatedAt',
        'Spam',
        'URLs Expanded',
    ]
)

In [5]:
# Select United Kingdom as query constraint
# Ask the API for places matching "United Kingdom" at country granularity.
places = api.geo_search(query="United Kingdom", granularity="country")
# NOTE(review): the third result (index 2) is taken as the United Kingdom entry.
# This presumably came from inspecting the results by hand -- confirm it still
# holds; it raises IndexError if fewer than three places are returned.
place_id = places[2].id

In [17]:
def save_tweet_data(tweets, path='../Data/collected_tweet_data.txt'):
    """Append the raw payload of every new English tweet to a line-based log file.

    Each record is ``str(tweet._json)`` written on its own line. A record is
    only written when an identical line is not already in the file and has not
    already been written earlier in the same call.

    :param tweets: Iterable of objects exposing ``lang`` and ``_json``.
    :param path: File to append to. Defaults to the location the notebook has
                 always used. The parent directory is created when missing.
    :return: None
    """
    # Create the target directory up front so opening the file cannot fail on
    # a fresh checkout.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Load the records already on disk. The file is read line by line (it is
    # not CSV); blank lines are ignored instead of raising.
    existing = set()
    if os.path.exists(path):
        with open(path, 'r', encoding="utf-8") as f:
            for line in f:
                record = line.rstrip('\n')
                if record:
                    existing.add(record)

    # Append mode also creates the file when it does not exist yet.
    with open(path, 'a', encoding="utf-8") as f:
        for tweet in tweets:
            if tweet.lang != 'en':
                continue
            tweet_data = str(tweet._json)
            if tweet_data not in existing:
                f.write(tweet_data + '\n')
                # Remember the record so a repeat within this batch is skipped.
                existing.add(tweet_data)

In [18]:
def get_tweets():
    """Run one search request and buffer the new English tweets it returns.

    Relies on notebook-level names: ``api`` (search client), ``place_id``
    (place used in the query), ``save_tweet_data`` (raw payload archive) and
    ``new_tweets_df`` (in-memory buffer, extended in place).

    Tweets whose ID is already in the buffer, or whose language is not
    English, are skipped; the rest of the batch is still processed.

    :return: None
    """
    # Retrieve tweets
    tweets = api.search(q="place:%s -filter:retweets" % place_id)

    save_tweet_data(tweets)

    # `value in series` tests the index labels rather than the values, so the
    # already-buffered IDs are collected into a set for the membership check.
    seen_ids = set(new_tweets_df['TweetID'].tolist())

    # Analyze tweet metadata and store
    for tweet in tweets:

        # Skip tweet if already logged or not in english
        if tweet.id in seen_ids or tweet.lang != 'en':
            continue

        # Get any URLs from tweet and expand them
        urls = [url['expanded_url'] for url in tweet.entities.get('urls', [])]

        # Record tweet in dataframe (appended at the next integer label)
        new_tweets_df.loc[len(new_tweets_df.index)] = [
            tweet.user.id,
            tweet.id,
            tweet.text,
            tweet.created_at,
            'Unknown',
            urls,
        ]
        seen_ids.add(tweet.id)
    


In [19]:
def test_rate_limit(api, wait=True, buffer=.1):
    """
    Tests whether the rate limit of the last request has been reached.
    :param api: The `tweepy` api instance.
    :param wait: A flag indicating whether to wait for the rate limit reset
                 if the rate limit has been reached.
    :param buffer: A buffer time in seconds that is added on to the waiting
                   time as an extra safety margin.
    :return: True if it is ok to proceed with the next request. False otherwise.
    """
    # Get number of remaining requests
    remaining = int(api.last_response.headers.get('x-rate-limit-remaining'))
    # Check if limit has been reached
    if remaining == 0:
        limit = int(api.last_response.headers.get('x-rate-limit-limit'))
        reset = int(api.last_response.headers.get('x-rate-limit-reset'))
        # Get UTC time
        reset = datetime.datetime.fromtimestamp(reset)
        print("Paused, will resume at", reset)

        if wait:
            # Determine delay and sleep
            delay = (reset - datetime.datetime.now()).total_seconds() + buffer
            print("Sleeping for {}s...".format(delay), 'at:', datetime.datetime.now())
            sleep(delay)
            return True
        else:
            return False
        
    return True

In [20]:
def save_tweets(df, path='../Data/collected_tweets.csv'):
    """Append the buffered tweets to the CSV file and hand back an empty buffer.

    Rows sharing a ``TweetID`` within `df` are collapsed to the first
    occurrence before writing. The header row is written only when the target
    file does not exist yet or is empty, so the file always starts with
    exactly one header.

    :param df: DataFrame with at least a ``TweetID`` column.
    :param path: CSV file to append to. Defaults to the location the notebook
                 has always used. The parent directory is created when missing.
    :return: An empty DataFrame with the same columns as `df`.
    """
    df = df.drop_duplicates(subset=['TweetID'])

    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # A zero-byte file needs a header just like a missing one.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    df.to_csv(path, mode='a', index=False, header=needs_header)

    print('Tweets updated at:', datetime.datetime.now())

    # Zero-row slice: keeps the column layout, drops the rows just written.
    return df.iloc[0:0]

In [21]:
def remove_duplicates(path='../Data/collected_tweets.csv'):
    """Rewrite the collected-tweets CSV so each ``TweetID`` appears only once.

    The file is read, de-duplicated on ``TweetID`` (first occurrence kept) and
    written back over itself together with its header. If the file cannot be
    read, the error is printed and nothing is written.

    :param path: CSV file to clean. Defaults to the location the notebook has
                 always used.
    :return: None
    """
    try:
        collected_tweets_df = pd.read_csv(path)
    except Exception as e:
        # Nothing was loaded, so there is nothing to de-duplicate or write.
        print(e)
        return

    print('Unfiltered tweets:', len(collected_tweets_df))
    unique_tweets_df = collected_tweets_df.drop_duplicates(subset=['TweetID'])
    print('Unique tweets:', len(unique_tweets_df))

    # Overwrite rather than append: appending the unique rows to the same file
    # would add duplicates instead of removing them.
    unique_tweets_df.to_csv(path, mode='w', index=False)

In [None]:
# Run to start collecting tweets automatically within API Rate limits
# The buffer is flushed to disk whenever more than this many seconds have
# passed since the previous flush.
SAVE_INTERVAL_SECONDS = 300

start_time = datetime.datetime.now()
loop_time = start_time
try:
    while test_rate_limit(api):
        get_tweets()
        test_time = datetime.datetime.now()
        if (test_time - loop_time).total_seconds() > SAVE_INTERVAL_SECONDS:
            new_tweets_df = save_tweets(new_tweets_df)
            loop_time = datetime.datetime.now()

    # The loop only ends once test_rate_limit() reports that it is not ok to
    # proceed, so no second check is needed before reporting it.
    print('Limit Reached at', datetime.datetime.now())

except KeyboardInterrupt:
    # Interrupting the kernel is the normal way to stop this cell; it is not
    # an Exception subclass, so it needs its own handler.
    print("Stopped by user")
except Exception as e:
    print("Stopped unexpectedly")
    print(e)
finally:
    # Whatever ended the loop, write out the tweets still held in memory.
    new_tweets_df = save_tweets(new_tweets_df)