# Tweepy

In [1]:
# import packages
import requests
import json
import time
import random
import os
import pandas as pd

In [2]:
# authentication
# Read the API bearer token from the environment so the secret is never stored in the notebook.
bearer_token = os.environ.get('X_BEARER_TOKEN')
if not bearer_token:
    # Surface the problem here: otherwise the Authorization header built later
    # becomes "Bearer None" and the mistake only shows up as an HTTP error.
    print("WARNING: environment variable X_BEARER_TOKEN is not set; requests will not be authenticated.")

In [3]:
# API endpoint
# URL of the Twitter API v2 "recent search" route that the request below is sent to.
endpoint_url = "https://api.twitter.com/2/tweets/search/recent"

In [4]:
# number of tweets to ask for per request (sent as the "max_results" query parameter)
max_results = 100

---

## Queries

#### Uncomment a single topic to scrape

In [5]:
# table_name = 'bible'
# label = 2
# query = """

# "bible" OR "biblical" OR "holy" OR "holiness" OR "revelation" OR "scrolls" OR "scripture" OR "bible verse"

# """ 

In [6]:
# table_name = 'current_events'
# label = 3
# query = """

# "current events" OR "breaking news" OR "live events" OR "political" OR "government" OR "world news" OR "president"

# """ 

In [7]:
# table_name = 'fellowship'
# label = 4
# query = """

# "fellowship" OR "communion" OR "exchanges" OR "unity" OR "oneness" OR "harmony"

# """ 

In [8]:
# table_name = 'Jerusalem'
# label = 5
# query = """

# "Jerusalem" OR "Holy City" OR "Zion" OR "The City of David" OR "The Promised Land" OR "The Holy Land" OR "The City of Peace" OR "The Eternal City" OR "The City of Gold"

# """ 

In [9]:
# table_name = 'Jesus'
# label = 6
# query = """

# "Jesus" OR "Holy Spirit" OR "God" OR "Spirit of God" OR "Lion" OR "Lord" OR "Jehovah"

# """ 

In [10]:
# table_name = 'Kingdom'
# label = 7
# query = """

# "Heaven" OR "Kingdom" OR "Paradise" OR "Kingdom of God" OR "The Promised Land" OR "Eden"

# """ 

In [11]:
# table_name = 'prayer'
# label = 1
# query = """

# "prayer" OR "intercession" OR "depth" OR "holiness" OR "intimacy" OR "praying" OR "prayer request"

# """ 

In [12]:
# table_name = 'prophesy'
# label = 8
# query = """

# "prophesy" OR "encouragement" OR "testimony" OR "prophetic word" OR "interpretation"

# """ 

In [13]:
# table_name = 'salvation'
# label = 9
# query = """

# "evangelism" OR "trumpet" OR "salvation" OR "touching lives" OR "reaching the nations" OR "harvest" OR "sowing" OR "repentance" OR "change of heart" OR "turn away from sin"

# """ 

In [14]:
# table_name = 'worship'
# label = 10
# query = """

# "worship" OR "praise" OR "adoration" OR "devotion" OR "reverence" OR "worship music" OR "worship leader"

# """ 

---

## Save Data

In [15]:
# additional params
# Query-string parameters sent with the search request.
# `query` (and `label` / `table_name`, used further down) must already be
# defined by uncommenting exactly one of the topic cells above.
query_parameters = {
    # The topic's OR-terms are wrapped in parentheses and followed by the
    # `lang:en` and `-is:retweet` filters -- presumably so the filters apply
    # to the whole group rather than only the last term (confirm against the
    # search-operator rules).
    "query": f'({query}) lang:en -is:retweet',
    # Tweet attributes requested for each result.
    "tweet.fields": "id,text,author_id,created_at",
    "max_results": max_results,
}

In [16]:
# headers
def request_headers(bearer_token: str) -> dict:
    """
    Build the HTTP headers used to authenticate a request.

    The token is placed in a single ``Authorization`` header using the
    ``Bearer <token>`` scheme, and that one-entry dictionary is returned.
    """
    authorization_value = f"Bearer {bearer_token}"
    auth_headers = {"Authorization": authorization_value}
    return auth_headers

# Headers reused by the request below; built once from the token read from the environment.
headers = request_headers(bearer_token)

In [17]:
# connect to endpoint

def connect_to_endpoint(
    endpoint_url: str, headers: dict, parameters: dict, timeout: float = 30.0
) -> dict:
    """
    Connects to the endpoint and requests data.

    Returns the decoded JSON body once a 200 status code is yielded.
    A 4xx status is treated as a problem with the request itself and raises
    immediately. Any other non-200 status is treated as temporary: the
    function sleeps for a random 5-60 seconds and then tries again, for as
    many attempts as it takes.

    Args:
        endpoint_url: URL the GET request is sent to.
        headers: HTTP headers for the request (including authentication).
        parameters: Query-string parameters for the request.
        timeout: Seconds to wait on the server during a single attempt.

    Returns:
        The response body as parsed by ``response.json()``.

    Raises:
        Exception: If the server answers with a 4xx status code.
        requests.exceptions.Timeout: If a single attempt exceeds ``timeout``.
    """
    # Retry in a loop rather than by recursion, so a long outage cannot
    # exhaust the interpreter's recursion limit.
    while True:
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.request(
            "GET",
            url=endpoint_url,
            headers=headers,
            params=parameters,
            timeout=timeout,
        )
        status_code = response.status_code
        if status_code == 200:
            return response.json()

        if 400 <= status_code < 500:
            raise Exception(
                f"Cannot get data, the program will stop!\nHTTP {status_code}: {response.text}"
            )

        # Randomised pause so repeated attempts are not evenly spaced.
        sleep_seconds = random.randint(5, 60)
        print(
            f"Cannot get data, your program will sleep for {sleep_seconds} seconds...\nHTTP {status_code}: {response.text}"
        )
        time.sleep(sleep_seconds)

In [18]:
# capture data
# Send the search request; the parsed JSON body is kept for the cells below.
json_response = connect_to_endpoint(endpoint_url, headers, query_parameters)

In [19]:
# # get type
# type(json_response)

In [20]:
# # get keys
# json_response.keys()

In [21]:
# # view metadata
# json_response["meta"]

In [22]:
# # length of data
# len(json_response["data"])

In [23]:
# # print tweet content
# for i in json_response["data"]:
#     print('\n')
#     print(i['text'])

In [24]:
# # Regular expressions for matching @user mentions and URLs
# user_mention_regex = r'@([A-Za-z0-9_]+)'
# url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# # Loop through each tweet in the JSON response
# for tweet in json_response["data"]:
#     # Replace @user mentions with a placeholder string
#     text = re.sub(user_mention_regex, '{{USERNAME}}', tweet['text'])
    
#     # Replace URLs with a placeholder string
#     text = re.sub(url_regex, '{{URL}}', text)
    
#     # Print the masked tweet text
#     print('\n')
#     print(text)

In [26]:
# print(f"Text: {tweet['text']}")
# print(f"Date: {tweet['created_at']}") # need to format

# print(f"Label: PLACEHOLDER")
# print(f"Label Name: PLACEHOLDER")
# print(f"ID: PLACEHOLDER")

In [27]:
# Create a DataFrame from the dictionary
# The response may carry no "data" key (e.g. when no tweet matched), so fall
# back to an empty list instead of raising KeyError.
tweets = json_response.get("data", [])

# Select only the fields that are kept. Choosing columns at construction
# replaces the later drop(), which raised KeyError whenever one of the
# dropped columns was missing from the response.
df = pd.DataFrame(tweets, columns=['id', 'text', 'created_at'])

df = df.rename(columns={'created_at': 'date'})

# Tag every row with the topic chosen in the query cells above.
df['label'] = label
df['label_name'] = table_name

# Parse the timestamp string, keep only the calendar date, and store it
# back as a plain string.
df['date'] = pd.to_datetime(df['date']).dt.date.astype(str)

# Final column order used by the saved CSV files.
df = df[['text', 'date', 'label', 'label_name', 'id']]

# Display the DataFrame
df.head(10)

Unnamed: 0,text,date,label,label_name,id
0,All religions must be subservient to the will ...,2024-04-29,4,fellowship,1785029802770174386
1,Countdown to Bitcoin Halving: A Momentous Even...,2024-04-29,4,fellowship,1785029796205854862
2,Day 15 of building every day in public until I...,2024-04-29,4,fellowship,1785029773078749351
3,On Campus Action and the Unity of the Fields (...,2024-04-29,4,fellowship,1785029762760716712
4,"Arise. Unity self-expression, we want our chil...",2024-04-29,4,fellowship,1785029727801012682
5,@Sakowitz___ it's made in Unity!,2024-04-29,4,fellowship,1785029660075810973
6,"""Harmony of Hope: A Patriotic Song for a Unite...",2024-04-29,4,fellowship,1785029610414985724
7,"""Harmony of Hope: A Patriotic Song for a Unite...",2024-04-29,4,fellowship,1785029599153242273
8,@backdoc47 @Tomas_Menos @Burgess7281975 Based ...,2024-04-29,4,fellowship,1785029596007567434
9,@JonesAlber66112 Harmony,2024-04-29,4,fellowship,1785029578819506407


In [28]:
# Print the column dtypes and non-null counts of the freshly built table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        99 non-null     object
 1   date        99 non-null     object
 2   label       99 non-null     int64 
 3   label_name  99 non-null     object
 4   id          99 non-null     object
dtypes: int64(1), object(4)
memory usage: 4.0+ KB


In [29]:
import re

# Single pattern with one capturing group that matches either an @user
# mention or an http/https URL.
mention_url_regex = r'(@[A-Za-z0-9_]+|http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)'


def replace_mentions_urls(tweet):
    """
    Mask user mentions and links in a tweet and flatten it onto one line.

    Every @user mention becomes ``{{USERNAME}}``, every URL becomes
    ``{{URL}}``, and each newline character is turned into a tab.
    Returns the resulting string.
    """
    def _placeholder_for(found):
        # Group 1 is the whole match; a leading "@" identifies a mention,
        # anything else the pattern matched is a URL.
        if found.group(1).startswith('@'):
            return '{{USERNAME}}'
        return '{{URL}}'

    masked = re.sub(mention_url_regex, _placeholder_for, tweet)
    return masked.replace('\n', '\t')

# Apply the function to each row in the 'text' column
df['text'] = df['text'].apply(replace_mentions_urls)
# Remove rows that are identical in every column. The comparison includes
# `id`, so tweets with the same text but different ids are both kept.
df = df.drop_duplicates()

In [30]:
# df.head(10)

In [31]:
# df.text[1]

In [32]:
# Preview the first ten rows after masking mentions and URLs.
df.head(10)

Unnamed: 0,text,date,label,label_name,id
0,All religions must be subservient to the will ...,2024-04-29,4,fellowship,1785029802770174386
1,Countdown to Bitcoin Halving: A Momentous Even...,2024-04-29,4,fellowship,1785029796205854862
2,Day 15 of building every day in public until I...,2024-04-29,4,fellowship,1785029773078749351
3,On Campus Action and the Unity of the Fields (...,2024-04-29,4,fellowship,1785029762760716712
4,"Arise. Unity self-expression, we want our chil...",2024-04-29,4,fellowship,1785029727801012682
5,{{USERNAME}} it's made in Unity!,2024-04-29,4,fellowship,1785029660075810973
6,"""Harmony of Hope: A Patriotic Song for a Unite...",2024-04-29,4,fellowship,1785029610414985724
7,"""Harmony of Hope: A Patriotic Song for a Unite...",2024-04-29,4,fellowship,1785029599153242273
8,{{USERNAME}} {{USERNAME}} {{USERNAME}} Based o...,2024-04-29,4,fellowship,1785029596007567434
9,{{USERNAME}} Harmony,2024-04-29,4,fellowship,1785029578819506407


In [33]:
# for i in df.text:
#     print(i)

In [34]:
def add_dataframe_to_csv(dataframe, csv_file):
    """
    Append the rows of ``dataframe`` to the CSV at ``csv_file``.

    If the file does not exist yet, or exists but is completely empty, it is
    written from ``dataframe`` alone. Otherwise the existing rows are loaded,
    the new rows are added after them, and the whole file is rewritten
    without an index column. A missing parent directory is created.

    Args:
        dataframe: pandas DataFrame holding the rows to add.
        csv_file: Path of the CSV file to create or extend.
    """
    # Load existing CSV file into DataFrame if it exists. A zero-byte file
    # makes read_csv raise EmptyDataError; treat it like a missing file.
    try:
        existing_df = pd.read_csv(csv_file)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        existing_df = None

    if existing_df is None:
        # Nothing to merge with; skip the pointless concat with an empty frame.
        updated_df = dataframe
    else:
        updated_df = pd.concat([existing_df, dataframe], ignore_index=True)

    # to_csv does not create directories, so make sure the target folder
    # exists. Only done for path-like targets, not for file objects.
    if isinstance(csv_file, (str, os.PathLike)):
        parent_dir = os.path.dirname(csv_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Save updated DataFrame to CSV file
    updated_df.to_csv(csv_file, index=False)


In [35]:
# Append this run's tweets to the topic's CSV. The path uses the lowercase
# `data/` directory that the de-duplication cells below read and rewrite;
# `./Data/` is a different directory on a case-sensitive filesystem.
add_dataframe_to_csv(df, f'data/{table_name}.csv')

# Drop Duplicates

In [36]:
# Reload the accumulated CSV for the current topic and show its (rows, columns) shape.
df = pd.read_csv(f'data/{table_name}.csv')
df.shape

(1499, 5)

In [37]:
# Drop rows that are identical in every column, then show the shape again
# so it can be compared with the shape printed before de-duplication.
df = df.drop_duplicates()
df.shape

(1499, 5)

In [38]:
# Overwrite the topic's CSV with the de-duplicated rows (no index column).
df.to_csv(f'data/{table_name}.csv', index=False)