<a href="https://colab.research.google.com/github/Harsh-Mundra/Mccombs_course/blob/master/4_tweepy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download a file from Google Drive using FILE_ID

# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive tweepy
import os 
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
# The statements below are order-dependent: the user is authenticated first,
# then the application-default credentials are attached to the GoogleAuth
# object, and only then is the Drive client built from it.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the module-level client that download_file() relies on.
drive = GoogleDrive(gauth)

In [None]:
# Download a file based on its file ID.
#
# A file ID looks like: 1rhnzJ8CD9Amdz5OMi8s4kqT5b00jY5k4
# Visit https://help.meiro.io/en/articles/2245027-where-can-i-find-the-file-id-on-google-drive
# for more details on getting the FILE ID.

def download_file(file_id, file_name):
    """
    Fetch a Google Drive file by its ID and save it to a local path.

    Uses the module-level ``drive`` client, so the authentication cell must
    have been run first.

    file_id: Drive ID of the file; it must be accessible from the
             authenticated account, otherwise the download will not run.
    file_name: local name under which the content is saved.
    """
    drive_file = drive.CreateFile({'id': file_id})
    drive_file.GetContentFile(file_name)

    message = 'Downloaded file with ID {} and name {}'.format(file_id, file_name)
    print(message)

In [None]:
# Fetch secrets.txt from Google Drive only when it is not already present as a
# local file, so re-running this cell does not download it again.
if not os.path.isfile('secrets.txt'):
    download_file('1zf_8E7G1-o4ywB4cnjUa-aVl_mMSxlyH', 'secrets.txt')

Downloaded file with ID 1zf_8E7G1-o4ywB4cnjUa-aVl_mMSxlyH and name secrets.txt


In [None]:
# You can store secrets in a file or in the form of environment variables during production.
# NEVER store keys directly in the notebook.

with open('secrets.txt', 'r') as file:
    # splitlines() copes with both LF and CRLF line endings, and strip() removes
    # stray whitespace that would otherwise become part of a credential and
    # make authentication fail in a hard-to-diagnose way.
    data = [line.strip() for line in file.read().splitlines()]

import tweepy
import pandas as pd

# secrets.txt must hold one credential per line, in exactly this order:
# consumer key, consumer secret, access token, access token secret.
if len(data) < 4 or not all(data[:4]):
    raise ValueError(
        "secrets.txt must contain four non-empty lines: consumer key, "
        "consumer secret, access key, access secret"
    )

consumer_key, consumer_secret, access_key, access_secret = data[:4]

In [None]:
def get_user_tweets(screen_name, num=0):
    """
    Download a user's most recent tweets and save them to ``user_<screen_name>.csv``.

    Tweets are fetched page by page (at most 200 per request), using ``max_id``
    so that each page starts just below the oldest tweet already collected.
    Paging stops once ``num`` tweets have been collected or the API returns an
    empty page, whichever comes first. The CSV has the columns ``id``,
    ``created_at`` and ``text``; the first rows are printed at the end.

    screen_name: Twitter handle of the account to download.
    num: number of tweets wanted; values above 3000 are capped at 3000.
         With ``num <= 0`` no request is made and an empty CSV is written.

    Relies on the module-level credentials ``consumer_key``, ``consumer_secret``,
    ``access_key`` and ``access_secret``. Returns None.
    """
    # Twitter only allows access to a user's most recent ~3000 tweets with this method
    num = min(num, 3000)
    max_num_per_call = 200  # largest page size used per user_timeline request

    # authorize twitter, initialize tweepy
    # (named `handler` so it does not shadow the imported google.colab `auth`)
    handler = tweepy.OAuthHandler(consumer_key, consumer_secret)
    handler.set_access_token(access_key, access_secret)
    api = tweepy.API(handler)

    # list holding all the tweepy Tweets collected so far
    alltweets = []
    # id just below the oldest tweet collected; None until the first page arrives
    oldest = None

    # Count what was actually received rather than what was requested: a page
    # may contain fewer tweets than asked for.
    while len(alltweets) < num:
        curr_count = min(max_num_per_call, num - len(alltweets))
        request = {"screen_name": screen_name, "count": curr_count}

        if oldest is not None:
            print(f"Getting tweets before {oldest}")
            # all subsequent requests use the max_id param to prevent duplicates
            request["max_id"] = oldest

        new_tweets = api.user_timeline(**request)

        # an empty page means there is nothing older left to fetch
        if not new_tweets:
            break

        if not alltweets:
            # first page: report the profile location once
            print(f"Location of username {screen_name} is: {new_tweets[0].user.location}\n")

        alltweets.extend(new_tweets)

        # remember the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        print(f"{len(alltweets)} tweets downloaded so far")

    # transform the tweepy tweets into a 2D array that will populate the csv;
    # text stays a str so the CSV holds the tweet itself, not a b'...' repr
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text] for tweet in alltweets]
    df = pd.DataFrame(outtweets, columns=["id", "created_at", "text"])
    df.to_csv(f"user_{screen_name}.csv", index=False)
    print(df.head())

In [None]:
# Lookup table: two-letter abbreviation -> full name, covering the 50 U.S.
# states plus the District of Columbia (51 entries, alphabetical by name).
states = {
    "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas",
    "CA": "California", "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware",
    "DC": "District of Columbia", "FL": "Florida", "GA": "Georgia", "HI": "Hawaii",
    "ID": "Idaho", "IL": "Illinois", "IN": "Indiana", "IA": "Iowa",
    "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine",
    "MD": "Maryland", "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota",
    "MS": "Mississippi", "MO": "Missouri", "MT": "Montana", "NE": "Nebraska",
    "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey", "NM": "New Mexico",
    "NY": "New York", "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio",
    "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island",
    "SC": "South Carolina", "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas",
    "UT": "Utah", "VT": "Vermont", "VA": "Virginia", "WA": "Washington",
    "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming",
}

In [None]:
def get_query_tweets(query, num=0):
    """
    Search for tweets matching ``query`` and save them to ``query_<query>.csv``.

    Results are fetched page by page (at most 100 per request), using ``max_id``
    so that each page starts just below the oldest tweet already collected.
    Paging stops once ``num`` tweets have been collected or the search returns
    an empty page, whichever comes first. The CSV has the columns ``id``,
    ``created_at``, ``text`` and ``location`` (the author's profile location);
    the first rows are printed at the end.

    query: search string passed to the search endpoint; it is also used
           verbatim in the output file name.
    num: number of tweets wanted; values above 3000 are capped at 3000.
         With ``num <= 0`` no request is made and an empty CSV is written.

    Relies on the module-level credentials ``consumer_key``, ``consumer_secret``,
    ``access_key`` and ``access_secret``. Returns None.
    """
    num = min(num, 3000)
    max_num_per_call = 100  # largest page size used per search request

    # authorize twitter, initialize tweepy
    # (named `handler` so it does not shadow the imported google.colab `auth`)
    handler = tweepy.OAuthHandler(consumer_key, consumer_secret)
    handler.set_access_token(access_key, access_secret)
    api = tweepy.API(handler)

    # Tweepy 4 renamed API.search to API.search_tweets; support both so the
    # notebook works with whichever version the unpinned install provides.
    search = api.search_tweets if hasattr(api, "search_tweets") else api.search

    # list holding all the tweepy Tweets collected so far
    alltweets = []
    # id just below the oldest tweet collected; None until the first page arrives
    oldest = None

    # Count what was actually received rather than what was requested: a page
    # may contain fewer tweets than asked for.
    while len(alltweets) < num:
        curr_count = min(max_num_per_call, num - len(alltweets))
        request = {"q": query, "count": curr_count}

        if oldest is not None:
            print(f"Getting tweets before {oldest}")
            # all subsequent requests use the max_id param to prevent duplicates
            request["max_id"] = oldest

        new_tweets = search(**request)

        # an empty page means the search has no older results to offer
        if not new_tweets:
            break

        alltweets.extend(new_tweets)

        # remember the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        print(f"{len(alltweets)} tweets downloaded so far")

    # transform the tweepy tweets into a 2D array that will populate the csv;
    # text stays a str so the CSV holds the tweet itself, not a b'...' repr
    outtweets = [
        [tweet.id_str, tweet.created_at, tweet.text, tweet.user.location]
        for tweet in alltweets
    ]
    df = pd.DataFrame(outtweets, columns=["id", "created_at", "text", "location"])
    df.to_csv(f"query_{query}.csv", index=False)
    print(df.head())

In [None]:
# pass in the username of the account you want to download
get_user_tweets("TwitterMusic", 390)

Location of username TwitterMusic is: Twitter HQ

200 tweets downloaded so far
Getting tweets before 1172258530344210441
390 tweets downloaded so far
                    id  ...                                               text
0  1183423921284157441  ...  b'RT @halsey: stan twitter is crazy cause i me...
1  1183120180446142466  ...                 b'@psih_polunoch \xf0\x9f\x92\x9c'
2  1183118670140182529  ...                          b'@umidiotaamenos agreed'
3  1183118331777343489  ...  b'@owengdriscoll *nods head in approval* this ...
4  1183111570265640961  ...     b"What's your holy trinity? #NationalAlbumDay"

[5 rows x 3 columns]


In [None]:
# pass in the search query
get_query_tweets("Management", 290)

100 tweets downloaded so far
Getting tweets before 1183570976572891136
190 tweets downloaded so far
                    id  ...               location
0  1183571861210316801  ...                       
1  1183571844588175360  ...              Australia
2  1183571839164936193  ...       Calgary, Alberta
3  1183571818671726592  ...                  India
4  1183571816255647744  ...  All Around The World 

[5 rows x 4 columns]
