### Use Tweepy to scrape tweets about New Year's resolutions.

Scrape tweets and put them into a dataframe. 

_(Requires "Elevated" Twitter Developer access.)_

In [1]:
import os
import pickle
import time

import pandas as pd
import tweepy

In [2]:
# Actual keys/tokens come from Twitter developer credentials.
# They are read from environment variables so real secrets are never saved in the
# notebook; the '###' placeholder is used only when a variable is not set.

consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '###')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '###')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '###')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '###')

In [3]:
# Setup (and verify) Tweepy authorization.

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit=True makes Tweepy sleep until the rate limit resets instead of raising.
api = tweepy.API(auth, wait_on_rate_limit=True)

try:
    api.verify_credentials()
    print("Authentication OK")
except Exception as exc:
    # Catch Exception rather than using a bare except so a kernel interrupt still
    # propagates, and show the reason so a bad key can be told apart from a network problem.
    print(f"Error during authentication: {exc!r}")

Authentication OK


### Scrape the tweets (a few different searches were performed)

Unfortunately, the search is limited to tweets from the last 7 days.

In [4]:
# Start with an empty frame; the search cells add their results to it.
df_tweets_query = pd.DataFrame(data=None)

In [28]:
# Search 1.
# NOTE(review): this cell requests only 10 tweets while searches 2-4 request 5000 —
# confirm which limit is intended before re-running.

# Collect one dict per tweet and build the frame once at the end: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
tweet_rows = []

try:
    # Creation of query method using parameters
    tweets = tweepy.Cursor(api.search_tweets,
                           q = 'new years resolution -filter:retweets',
                           lang = "en", # only tweets in english
                           until = "2022-01-08").items(10) # trying to get close to new years day.

    # Pulling the relevant information out of each tweet returned by the cursor.
    for tweet in tweets:
        tweet_rows.append(
            {'Created': tweet.created_at,
             # NOTE(review): tweet.id is the tweet's own ID, not the author's
             # (tweet.user.id); left unchanged so the saved column keeps its values — confirm intent.
             'User_ID': tweet.id,
             'User_Name': tweet.user.name,
             # NOTE(review): without tweet_mode='extended' the API truncates long tweets
             # in .text — confirm that is acceptable for the later analysis.
             'Text': tweet.text,
             'Description': tweet.user.description,
             'Location': tweet.user.location,
             'Followers_Count': tweet.user.followers_count,
             'Friends_Count': tweet.user.friends_count,
             'Favorite_Count': tweet.favorite_count,
             'Retweet_Count': tweet.retweet_count,
            })

except Exception as e:
    # Best effort: report the failure and keep whatever was collected.
    # Exception (not BaseException) lets a kernel interrupt propagate.
    print('failed on_status,',str(e))
    time.sleep(3)

finally:
    # Runs on success, failure, and interrupt, so partial results are never lost.
    if tweet_rows:
        new_rows = pd.DataFrame(tweet_rows)
        if df_tweets_query.empty:
            df_tweets_query = new_rows
        else:
            df_tweets_query = pd.concat([df_tweets_query, new_rows], ignore_index=True)

In [7]:
# (rows, columns) accumulated in df_tweets_query so far.
df_tweets_query.shape

(5000, 10)

In [11]:
# Search 2.

# Collect one dict per tweet and build the frame once at the end: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
tweet_rows = []

try:
    # Creation of query method using parameters
    tweets = tweepy.Cursor(api.search_tweets,
                           q = '2022 resolution',
                           lang = "en", # only tweets in english
                           until = "2022-01-08").items(5000)

    # Pulling the relevant information out of each tweet returned by the cursor.
    for tweet in tweets:
        tweet_rows.append(
            {'Created': tweet.created_at,
             # NOTE(review): tweet.id is the tweet's own ID, not the author's
             # (tweet.user.id); left unchanged so the saved column keeps its values — confirm intent.
             'User_ID': tweet.id,
             'User_Name': tweet.user.name,
             # NOTE(review): without tweet_mode='extended' the API truncates long tweets
             # in .text — confirm that is acceptable for the later analysis.
             'Text': tweet.text,
             'Description': tweet.user.description,
             'Location': tweet.user.location,
             'Followers_Count': tweet.user.followers_count,
             'Friends_Count': tweet.user.friends_count,
             'Favorite_Count': tweet.favorite_count,
             'Retweet_Count': tweet.retweet_count,
            })

except Exception as e:
    # Best effort: report the failure and keep whatever was collected.
    # Exception (not BaseException) lets a kernel interrupt propagate.
    print('failed on_status,',str(e))
    time.sleep(3)

finally:
    # Runs on success, failure, and interrupt, so partial results are never lost.
    if tweet_rows:
        new_rows = pd.DataFrame(tweet_rows)
        if df_tweets_query.empty:
            df_tweets_query = new_rows
        else:
            df_tweets_query = pd.concat([df_tweets_query, new_rows], ignore_index=True)

Rate limit reached. Sleeping for: 621
Rate limit reached. Sleeping for: 831


In [12]:
# (rows, columns) accumulated in df_tweets_query so far.
df_tweets_query.shape

(10000, 10)

In [7]:
# Search 3.
# After searches 1 and 2 (above), decided to go back and search for more tweets.
# This search was done ~1 week after the above searches (#1 and #2).
# Since it was a bit later, there's no date limit.

# Collect one dict per tweet and build the frame once at the end: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
tweet_rows = []

try:
    # Creation of query method using parameters
    tweets = tweepy.Cursor(api.search_tweets,
                           q = '#newyearresolution',
                           lang = "en").items(5000)

    # Pulling the relevant information out of each tweet returned by the cursor.
    for tweet in tweets:
        tweet_rows.append(
            {'Created': tweet.created_at,
             # NOTE(review): tweet.id is the tweet's own ID, not the author's
             # (tweet.user.id); left unchanged so the saved column keeps its values — confirm intent.
             'User_ID': tweet.id,
             'User_Name': tweet.user.name,
             # NOTE(review): without tweet_mode='extended' the API truncates long tweets
             # in .text — confirm that is acceptable for the later analysis.
             'Text': tweet.text,
             'Description': tweet.user.description,
             'Location': tweet.user.location,
             'Followers_Count': tweet.user.followers_count,
             'Friends_Count': tweet.user.friends_count,
             'Favorite_Count': tweet.favorite_count,
             'Retweet_Count': tweet.retweet_count,
            })

except Exception as e:
    # Best effort: report the failure and keep whatever was collected.
    # Exception (not BaseException) lets a kernel interrupt propagate.
    print('failed on_status,',str(e))
    time.sleep(3)

finally:
    # Runs on success, failure, and interrupt, so partial results are never lost.
    if tweet_rows:
        new_rows = pd.DataFrame(tweet_rows)
        if df_tweets_query.empty:
            df_tweets_query = new_rows
        else:
            df_tweets_query = pd.concat([df_tweets_query, new_rows], ignore_index=True)

Rate limit reached. Sleeping for: 691


In [10]:
# Search 4.
# After searches 1 and 2 (above), decided to go back and search for more tweets.
# This search was also done ~1 week after the above searches (#1 and #2).

# Collect one dict per tweet and build the frame once at the end: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
tweet_rows = []

try:
    # Creation of query method using parameters
    tweets = tweepy.Cursor(api.search_tweets,
                           q = '#newyearsresolution',
                           lang = "en").items(5000)

    # Pulling the relevant information out of each tweet returned by the cursor.
    for tweet in tweets:
        tweet_rows.append(
            {'Created': tweet.created_at,
             # NOTE(review): tweet.id is the tweet's own ID, not the author's
             # (tweet.user.id); left unchanged so the saved column keeps its values — confirm intent.
             'User_ID': tweet.id,
             'User_Name': tweet.user.name,
             # NOTE(review): without tweet_mode='extended' the API truncates long tweets
             # in .text — confirm that is acceptable for the later analysis.
             'Text': tweet.text,
             'Description': tweet.user.description,
             'Location': tweet.user.location,
             'Followers_Count': tweet.user.followers_count,
             'Friends_Count': tweet.user.friends_count,
             'Favorite_Count': tweet.favorite_count,
             'Retweet_Count': tweet.retweet_count,
            })

except Exception as e:
    # Best effort: report the failure and keep whatever was collected.
    # Exception (not BaseException) lets a kernel interrupt propagate.
    print('failed on_status,',str(e))
    time.sleep(3)

finally:
    # Runs on success, failure, and interrupt, so partial results are never lost.
    if tweet_rows:
        new_rows = pd.DataFrame(tweet_rows)
        if df_tweets_query.empty:
            df_tweets_query = new_rows
        else:
            df_tweets_query = pd.concat([df_tweets_query, new_rows], ignore_index=True)

Rate limit reached. Sleeping for: 295


In [11]:
# (rows, columns) accumulated in df_tweets_query so far.
df_tweets_query.shape

(6660, 10)

In [12]:
# Save and clean as needed later.

output_path = './data/tweets_n3.pickle'

# Create the target folder first so the write does not fail with FileNotFoundError
# when ./data does not exist yet (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'wb') as to_write:
    pickle.dump(df_tweets_query, to_write)