# TWITTER ANALYSIS - MENTORSHIP PROJECT


#### STEP 1. CONFIGURATION: ESTABLISHING CONNECTION TO THE API
*Using Tweepy*

In [1]:
import tweepy 
import configparser
import requests     # For saving access tokens and for file management when creating and adding to the dataset
import os           # For dealing with json responses we receive from the API
import json         # For displaying the data after
import pandas as pd # For saving the response data in CSV format
import csv          # For parsing the dates received from twitter in readable formats
import datetime
import dateutil.parser
import unicodedata  #To add wait time between requests
import time
import sqlite3

In [2]:
#read configs
config = configparser.ConfigParser()
config.read('config.ini')

api_key             = config['twitter']['api_key']
api_key_secret      = config['twitter']['api_key_secret']

access_token        = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']

bearer_token        = config['twitter']['bearer_token']

In [3]:
#Authenticate our account with the Twitter API
auth    = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)
api     = tweepy.API(auth, wait_on_rate_limit=True)
    
# You can authenticate as your app with just your bearer token
client  = tweepy.Client(bearer_token=bearer_token)

# If the authentication was successful, this should print the
# screen name / username of the account
print(api.verify_credentials().screen_name)

KLuthra_


### STEP 2. DATA EXTRACTION & STORAGE
####  2.1. Defining Data Model Schemas for Tweet & User Data

In [4]:
# Set up SQLite database
conn = sqlite3.connect('twitter_data.db')
c = conn.cursor()

In [5]:
# Drop existing tables
# conn.close()
# c.execute('DROP TABLE IF EXISTS tweets')
# c.execute('DROP TABLE IF EXISTS users')

In [6]:
# # Create table for tweet data
# c.execute('''CREATE TABLE IF NOT EXISTS tweets
#              (tweet_id TEXT PRIMARY KEY,
#               author_id TEXT,
#               created_at TIMESTAMP,
#               text TEXT,
#               tweet_metrics JSON,
#               entities JSON,
#               context JSON,
#               place_id JSON,
#               FOREIGN KEY (author_id) REFERENCES users(author_id),
#               FOREIGN KEY (place_id) REFERENCES users(place_id))''')

<sqlite3.Cursor at 0x21590372f80>

In [11]:
# c.execute("SELECT COUNT(*) FROM tweets")
# row_count = c.fetchone()[0]
# print(f"Number of rows in 'tweets' table: {row_count}")

Number of rows in 'tweets' table: 0


In [7]:
# # Create table for user data
# c.execute('''CREATE TABLE IF NOT EXISTS users
#              (author_id TEXT PRIMARY KEY,
#               username TEXT,
#               verified TEXT,
#               bio TEXT,
#               author_created TIMESTAMP,
#               author_location TEXT,
#               followers_count INTEGER,
#               following_count INTEGER,
#               tweet_count INTEGER,
#               entities JSON,
#               FOREIGN KEY (author_id) REFERENCES tweets(author_id))''')

<sqlite3.Cursor at 0x21590372f80>

####  2.2. Defining a tweet fetching function using Tweepy

**__Pagination:__** Pagination is a feature in Twitter API v2 endpoints that return more results than can be returned in a single response. When that happens, the data is returned in a series of 'pages'. Pagination refers to methods for programatically requesting all of the pages, in order to retrieve the entire result data set. Not all API endpoints support or require pagination, but it is often used when result sets are large.

**Paginator** can be used to paginate for any Client methods that support pagination

In [12]:
def get_tweets(query, max_results):

  expansions    = ['author_id','in_reply_to_user_id','geo.place_id','entities.mentions.username','referenced_tweets.id','referenced_tweets.id.author_id']
  tweet_fields  = ['id','text','author_id','attachments','context_annotations','created_at','entities','lang','geo','public_metrics']
  user_fields   = ['id','name','username','created_at','description','entities','location','public_metrics','verified']
  place_fields  = ['full_name','id','country','country_code','geo','name','place_type']
  try:
    # call twitter api to fetch tweets
    fetched_tweets = tweepy.Paginator(client.search_recent_tweets, query=query,
      expansions        =expansions,
      tweet_fields      =tweet_fields,
      place_fields      =place_fields,
      user_fields       =user_fields,   
      max_results       =max_results
    ).flatten()
    
    return fetched_tweets
    

  except Exception as e:
    print("Error getting tweets", e)

####  2.2. Extracting Domains and Entities from the Twitter API
*Annotations have been added to the Tweet object from all v2 endpoints that return a Tweet object. Tweet annotations offer a way to understand contextual information about the Tweet itself. Though 100% of Tweets are reviewed, due to the contents of Tweet text, only a portion are annotated.*

##### **Tweet annotation types**
**Entities** Entity annotations are programmatically defined entities that are nested within the entities field and are reflected as annotations in the payload. Each annotation has a confidence score and an indication of where in the Tweet text the entities were identified (start and end fields).

The entity annotations can have the following types:

1. Person - Barack Obama, Daniel, or George W. Bush
2. Place - Detroit, Cali, or "San Francisco, California"
3. Product - Mountain Dew, Mozilla Firefox
4. Organization - Chicago White Sox, IBM
5. Other - Diabetes, Super Bowl 50

**Context annotations** are delivered as a context_annotations field in the payload. These annotations are inferred based on semantic analysis (keywords, hashtags, handles, etc) of the Tweet text and result in domain and/or entity labels. Context annotations can yield one or many domains. At present, we’re using a list of 80+ domains reflected in the table below.  
1. ID - 45: Brand Vertical
2. ID - 46: Brand Category
3. ID - 47: Brand
4. ID - 48: Product

##### 2.2.1 DOMAIN-ENTITY QUERY CONSTRUCTION 
The *search_recent_tweets* function within the Twitter API has a query limit of 512 characters. To work around this, I have created a list of strings, less than 512 characters long, which contain the domain_id.entity_id search query broken up into chunks of 512 characters or less each which I will iterate through when making API requests to retrieve tweets

In [13]:
domain_df = pd.read_csv('twitter-context-annotations/files/evergreen-context-entities-20220601.csv')
import itertools
def automate_domain_filter(df, start_id, end_id, chunk_size, domain_chunk_count):
    chunks_list = []
    for i in range(start_id, end_id+1):
        context_list = []
        mask = df['domains'].str.contains('^{}$'.format(i))
        filtered_df = df[mask]
        for index, row in filtered_df.iterrows():
            domain_id = row['domains']
            entity_id = row['entity_id']
            entity_name = row['entity_name']   
            # construct the query string
            context = f'context:{domain_id}.{entity_id}'
            context_list.append(context)
            context_query = ' OR '.join(context_list)
        code = context_query
        chunks = []
        start = 0
        counter = 0
        while start < len(code) and counter < domain_chunk_count[i]:
            end = start + chunk_size
            if end >= len(code):
                end = len(code)
            end = code.rfind(" OR ", start, end)
            if end == -1:
                end = start + chunk_size
            chunk = code[start:end]
            if chunk.startswith(" OR "):
                chunk = chunk[4:]
            chunks.append(chunk)
            start = end
            counter += 1
        chunks_list.append(chunks)
    return list(itertools.chain.from_iterable(chunks_list))

chunk_size = 350
domain_chunk_count = {45: 1, 46: 6, 47: 276, 48: 69}
chunks_list = automate_domain_filter(domain_df, 45, 48, chunk_size, domain_chunk_count)
# print(chunks_list)

In [9]:
# domain_df = pd.read_csv('twitter-context-annotations/files/evergreen-context-entities-20220601.csv')

##### 2.2.2 Defining Pre-Extraction Filtering 
- [ X ] Language restricted to English 
- [ X ] No Retweets or Quote Retweets Allowed
- [ X ] Filtering for tweets within Domains 45 through 48 (*all entities*)
- [ X ] Tweets must have mentions (*indicates presence of brand/sponsor*)
- [ X ] Hashtag List consisting of indications that the tweet is being promoted or sponsored
- [ ] Possible Entity Names which are irrelevant

##### 2.2.3 Defining Post-Extraction Filtering 
  1. Accounts that have a high ratio of followers to following (e.g., following fewer than 100 accounts but having thousands of followers)
  2. Number of Followers
  1. Accounts that use a large number of hashtags in their tweets (e.g., more than 5 hashtags per tweet).
  2. Accounts that use a lot of capital letters or exclamation points in their tweets.
  3. Accounts that have a high percentage of tweets that contain links (e.g., more than 50% of tweets contain links).   
  5. Using the Botometer API to extract a score for each user that indicates the probabibily of the account being a bot. 

In [14]:
def filter_rule(chunk, hash_include=True):
    text_list       = '(#ad OR #sponsored OR #promoted OR "Learn More" OR "Shop Now")'
    lang            = '(lang:en)'
    rt              = '(-is:retweet) (-"RT")' 
    domain            = chunk
    mention         = 'has:mentions'
    if hash_include == True:
        query           = text_list + ' ' + lang + ' ' + rt + ' ' + mention + ' ' + '(' + domain + ')'
    else: 
        query           = lang + ' ' + rt + ' ' + mention + ' ' + '(' + domain + ')'

    return query

In [15]:
import time
import tweepy

processed_tweets = 0
total_tweets = 0

for chunk in chunks_list:
    print(f'Chunk: {chunk}')
    
    query = filter_rule(chunk=chunk)
    paginator = get_tweets(query=query, max_results=100)

    if paginator is None:
        print('Error: Paginator is None. Skipping chunk.')
        continue
    
    for tweet in paginator:
        try:
            c.execute('''INSERT OR REPLACE INTO tweets 
                         (tweet_id, author_id, created_at, text, tweet_metrics, entities, context, place_id) 
                         VALUES (?, ?, ?, ?, ?, ?, ?, ?)''',
                      (tweet.id, tweet.author_id, tweet.created_at,
                       tweet.text, json.dumps(tweet.public_metrics), 
                       json.dumps(tweet.entities), json.dumps(tweet.context_annotations),
                       json.dumps(tweet.geo) if tweet.geo else None))
            
            processed_tweets += 1
            print(f'Progress: {processed_tweets} tweets processed.')
        
        except tweepy.TweepError as e:
            if e.response and e.response.status_code == 429:
                print('Rate limit exceeded. Pausing for 15 minutes.')
                print(f'Progress: {processed_tweets}/{total_tweets} tweets processed.')
                time.sleep(10 * 60)  # Pause execution for 15 minutes (900 seconds)
            else:
                print(f"An error occurred: {e}")
        
        except tweepy.TooManyRequests as e:
            print('Rate limit exceeded. Pausing for 15 minutes.')
            print(f'Progress: {processed_tweets}/{total_tweets} tweets processed.')
            time.sleep(10 * 60)  # Pause execution for 15 minutes (900 seconds)
        
        except Exception as e:
            print(f"An error occurred: {e}")
        
        total_tweets += 1
        # time.sleep(10*60)  # Add a small delay between requests to avoid hitting rate limits
    
    print(f'Finished processing chunk: {chunk}')
    print(f'Progress: {processed_tweets}/{total_tweets} tweets processed.')
    conn.commit()  # Commit the changes to the database
    time.sleep(60)  # Pause for 5 minutes between chunks to avoid hitting rate limits

conn.close()

Chunk: context:45.781972125171060736 OR context:45.781974597226799105 OR context:45.781974596740190208 OR context:45.781974596161376261 OR context:45.781974597474263040 OR context:45.781974597310615553 OR context:45.781974596157251587
Progress: 1 tweets processed.
Progress: 2 tweets processed.
Progress: 3 tweets processed.
Progress: 4 tweets processed.
Progress: 5 tweets processed.
Progress: 6 tweets processed.
Progress: 7 tweets processed.
Progress: 8 tweets processed.
Progress: 9 tweets processed.
Progress: 10 tweets processed.
Progress: 11 tweets processed.
Progress: 12 tweets processed.
Progress: 13 tweets processed.
Progress: 14 tweets processed.
Progress: 15 tweets processed.
Progress: 16 tweets processed.
Progress: 17 tweets processed.
Progress: 18 tweets processed.
Progress: 19 tweets processed.
Progress: 20 tweets processed.
Progress: 21 tweets processed.
Progress: 22 tweets processed.
Progress: 23 tweets processed.
Progress: 24 tweets processed.
Progress: 25 tweets processed.

In [14]:
c.execute("SELECT COUNT(*) FROM tweets")
row_count = c.fetchone()[0]
print(f"Number of rows in 'tweets' table: {row_count}")

Number of rows in 'tweets' table: 10897


In [12]:
# Re-open the connection to the database
conn = sqlite3.connect('twitter_data.db')
c = conn.cursor()
# Get unique author IDs from the tweets table
c.execute("SELECT DISTINCT author_id FROM tweets")
author_ids = [row[0] for row in c.fetchall()]

user_data = []
batch_size = 100
n = 0
# Iterate over batches of author IDs
for i in range(0, len(author_ids), batch_size):      
        # try:
        users = client.get_users(ids=author_ids[i:i+batch_size], ##initially using api.lookup_users
        user_fields=['id','name','username','created_at','description','entities','location','public_metrics','verified'])  
          
            # Insert the user data into the database
        for user in users.data:
                c.execute("DELETE FROM users WHERE author_id=?", (user.id,))
                c.execute('''INSERT INTO users (author_id, username, verified, bio, author_created, author_location, 
                followers_count, following_count, tweet_count, entities)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''',
                (user.id, user.username, user.verified, user.description, user.created_at,
                user.location, user.public_metrics['followers_count'], user.public_metrics['following_count'],
                user.public_metrics['tweet_count'], json.dumps(user.entities)))

                 # Display the author information being stored in the database
                print(f"Stored author: {user.name} (@{user.username}), id={user.id}")
         
        time.sleep(16)

Stored author: Melee Games (@MeleeGames), id=1631751982729035777
Stored author: 👑🇺🇸🇫🇷VƗŇĆЖỮ_ŁЖŘĐ?¿/ĦΔŘĐĆЖŘ€ǤΔΜ€Ř 💙💚❤️ΔOX❑🔥 (@Vanilla_sky5090), id=4642047857
Stored author: Maverick Immobilier (@MaverickImmobi1), id=1116069571688128513
Stored author: Altair (@Altair95103), id=1576835050297266176
Stored author: Molly Ploofkins™ (@Mollyploofkins), id=1274695868793458689
Stored author: Holiday Shopping (@PGSweeps), id=637372967
Stored author: SpottedBargains (@SpottedBargains), id=2763850618
Stored author: Golden Hana (@GoldenHana4), id=1624027153879732232
Stored author: Bob Is Here To Explain (@ExplainThisBob), id=1525739203341918208
Stored author: PistonsGT (@PistonsGT), id=862659189604265984
Stored author: LLP (she/they) (@LaLadyPanda), id=566622893
Stored author: 😼𝗦𝗛𝗥𝗘𝗗𝗭😸 (@Gato_Pub), id=1145291723305816064
Stored author: Clio Aite 📚⚔️ Vershion (@ClioAite), id=1397875413955710979
Stored author: Json's Amazin Stonks 🇨🇦 (@AmazinJson), id=1610763582282600468
Stored author: amazinwarehouse