In [None]:
"""
Author: Liam Lee
This script is designed to interface with the Twitter API using Tweepy to stream live tweets. It filters tweets based on specified criteria, collects them in real-time, and processes the data into a structured format suitable for analysis. The script handles authentication, defines a stream listener to capture tweets, and converts the JSON output to a Pandas DataFrame, selecting relevant information for further demographic analysis.
"""

In [1]:
import json
import os
import sys

import pandas as pd
import tweepy
from tweepy import API
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy.streaming import StreamListener

In [None]:
# Create authentication credentials for Twitter API access.
# Each credential is read from an environment variable so real secrets never
# have to be saved inside the notebook. The '#' placeholders are used only
# when the corresponding variable is unset.

CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', '######################')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', '######################')
ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN', '######################')
ACCESS_TOKEN_SECRET = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '######################')

# Build the OAuth handler from the application (consumer) key pair, then
# attach the user-level access token pair.
auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# Set up the API object for Tweepy with rate limit parameters to avoid hitting Twitter's rate limits.
api = API(auth, wait_on_rate_limit=True,
          wait_on_rate_limit_notify=True)


class Listener(StreamListener):
    '''
    Stream listener that accumulates original (non-retweet) tweets in memory.

    Attributes:
        output_list: list of the status objects accepted by on_status, in
            arrival order.
    '''
    def __init__(self):
        super(Listener, self).__init__()
        self.output_list = []

    def on_status(self, status):
        '''
        Store an incoming status exactly once, unless it is a retweet.

        Args:
            status: status object delivered by the stream; must expose the
                `retweeted` and `text` attributes.

        A status is treated as a retweet when any of these hold:
        its `retweeted` flag is set, it carries a `retweeted_status`
        attribute, or its text contains the 'RT @' marker.
        '''
        is_retweet = (
            status.retweeted
            or hasattr(status, 'retweeted_status')
            or 'RT @' in status.text
        )
        # Append only here: an additional unconditional append would store
        # retweets and duplicate every original tweet.
        if not is_retweet:
            self.output_list.append(status)

    def on_error(self, status_code):
        '''
        Print the error code reported by the stream and return False.

        Args:
            status_code: error code passed in by the stream.

        Returns:
            False, always. NOTE(review): in Tweepy 3.x a False return is
            expected to stop the stream -- confirm against the Tweepy docs.
        '''
        print(status_code)
        return False

In [None]:
# Run the live collection: tweets accepted by the listener accumulate in
# `l.output_list` until the stream is interrupted.

l = Listener()
stream = Stream(listener=l, auth=api.auth)
try:
    print('Start streaming.')
    stream.filter(
        track=['###'],  # your keywords or hashtags here
        languages=['en'],
    )

except KeyboardInterrupt:
    # A manual interrupt is the expected way to end the collection.
    print("Stopped.")

finally:
    # Always release the connection, whether streaming ended normally,
    # was interrupted, or raised.
    print('Done.')
    stream.disconnect()

In [None]:
# Inspect the keys of the raw `_json` payload attached to the first collected
# tweet, to see which fields are available for extraction.
# Raises IndexError if `l.output_list` is empty (nothing was collected).
l.output_list[0]._json.keys()

In [None]:
# Convert JSON to a pandas DataFrame for easier data manipulation and analysis.
# We choose specific fields that are relevant to our analysis from the tweet JSON objects.

# Layout of Twitter's `created_at` strings, e.g. "Wed Oct 10 20:19:24 +0000 2018".
TWITTER_DATE_FORMAT = '%a %b %d %H:%M:%S %z %Y'

dict_ = {'screen_name': [], 'id': [], 'location': [], 'created_at': [], 'text': [],
         'description': []}

for status in l.output_list:
    tweet = status._json
    user = tweet['user']
    dict_['screen_name'].append(user['screen_name'])
    # Note: 'id' holds the *user* id (as a string), not the tweet id.
    dict_['id'].append(str(user['id_str']))
    dict_['location'].append(user['location'])
    dict_['created_at'].append(tweet['created_at'])
    # NOTE(review): 'text' may be truncated for long tweets; the full text is
    # presumably under tweet['extended_tweet']['full_text'] -- confirm.
    dict_['text'].append(tweet['text'])
    dict_['description'].append(user['description'])

df = pd.DataFrame(dict_)

# Sort newest first. The raw `created_at` strings start with the weekday
# name, so sorting them as text does not give chronological order. Parse them
# to timestamps for the ordering only; the column keeps its original strings.
newest_first = (
    pd.to_datetime(df['created_at'], format=TWITTER_DATE_FORMAT)
    .sort_values(ascending=False)
    .index
)
df = df.loc[newest_first].reset_index(drop=True)
df.head(5)