In [3]:

####################
#Author: brandon chiazza
#version: 1.0
#purpose: to call a twitter api and return results
#documentation: https://developer.twitter.com/en/docs
#####################

import pandas as pd
import requests
import json
import base64
#!pip install s3fs
import s3fs # documentation: https://s3fs.readthedocs.io/en/latest/
import time
import twitter_keys #this is a custom reference module to a package containing twitter keys

%config IPCompleter.greedy=True

# Build the HTTP Basic credential for the token request:
# base64("<client_key>:<client_secret>"), kept as an ASCII string for the header.
key_secret = f'{twitter_keys.client_key}:{twitter_keys.client_secret}'.encode('ascii')
b64_encoded_key = base64.b64encode(key_secret).decode('ascii')

#identify base url and oauth token path
base_url = 'https://api.twitter.com/' #base url for authentication
auth_url = f'{base_url}oauth2/token'

#share header information -- encoding is ascii
auth_headers = {
    'Authorization': f'Basic {b64_encoded_key}',
    'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8'
}

#pass client credentials grant type
auth_data = {
    'grant_type': 'client_credentials'
}

#send authentication using requests - POST request
#requests applies no timeout by default, so without one a stalled connection
#would block this cell indefinitely
auth_resp = requests.post(auth_url, headers=auth_headers, data=auth_data, timeout=30)

#check response status. 200 = OK
auth_resp.status_code




200

In [4]:

#Keys in the token response are token_type (bearer) and access_token (your access token)
auth_payload = auth_resp.json() #parse the body once and reuse it
print(auth_payload.keys())

#if authentication was rejected, raise a clear HTTP error here instead of an
#opaque KeyError on 'access_token' below
auth_resp.raise_for_status()
access_token = auth_payload['access_token']


search_headers = {
    'Authorization': 'Bearer {}'.format(access_token)
}

#enter search parameters for coronavirus example. This looks for "covid-19" in the 100 most recent tweets
query_params = {
    'q': 'covid-19',
    'result_type': 'recent',
    'count': 100, #tweets returned per request; the standard search API returns at most 100
    'lang': 'en' #filters by english language only
}


#identify search url path and save
search_url = '{}1.1/search/tweets.json'.format(base_url)


#run search using get request
#requests applies no timeout by default; set one so a stalled connection cannot hang the cell
search_resp = requests.get(search_url, headers=search_headers, params=query_params, timeout=30)

#check status code of GET request
search_resp.status_code


dict_keys(['token_type', 'access_token'])


200

In [5]:
#decode the search response and print each tweet's text to verify the result
twitter_data = search_resp.json()

statuses = twitter_data['statuses']
for status in statuses:
    tweet_text = status['text']
    print(tweet_text + '\n')
    #break #uncomment to stop after the first tweet instead of printing every returned tweet

RT @soompi: #NCT_DOJAEJUNG Cancels This Week’s Promotions After #Jaehyun Experiences COVID-19 Symptoms
https://t.co/eVKT4JtqBX https://t.co…

RT @KanekoaTheGreat: Former DNI John Ratcliffe says the CDC Director, the Secretary of State, and the Director of National Intelligence agr…

RT @sanemiliomps: @sanemiliomps Prevention on Covid-19 https://t.co/9NdduKkvKo

RT @laurieallee: From 2020:
"1/3of patients with COVID-19 ― both active and cleared cases ― showed at least some measure of myocarditis." h…

RT @MakisMD: Vista, CA - 40 year old Pastor Jay Foulk died in his sleep on April 13, 2023.

He was negative for COVID-19, went to bed feeli…

RT @sanemiliomps: @sanemiliomps COVID-19 SAFETY TIPS https://t.co/v7FG6Civ4P

RT @stkirsch: Despite the fact that it will increase your chance of getting Covid, FDA authorizes another shot for older adults and people…

RT @laralogan: These people are either literally insane or pure evil or both?

It’s one thing for ordinary citizens to still be blind

In [6]:
# load the list of tweet objects into a data frame (one row per tweet)
tweet_records = twitter_data['statuses']
df = pd.DataFrame(tweet_records)

# preview the first five rows to verify the import
df.head()

Unnamed: 0,created_at,id,id_str,text,truncated,entities,metadata,source,in_reply_to_status_id,in_reply_to_status_id_str,...,retweet_count,favorite_count,favorited,retweeted,possibly_sensitive,lang,extended_entities,quoted_status_id,quoted_status_id_str,quoted_status
0,Thu Apr 20 00:56:50 +0000 2023,1648853192979955715,1648853192979955715,RT @soompi: #NCT_DOJAEJUNG Cancels This Week’s...,False,"{'hashtags': [{'text': 'NCT_DOJAEJUNG', 'indic...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,,...,203,0,False,False,False,en,,,,
1,Thu Apr 20 00:56:49 +0000 2023,1648853192489459712,1648853192489459712,RT @KanekoaTheGreat: Former DNI John Ratcliffe...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,,...,17045,0,False,False,,en,,,,
2,Thu Apr 20 00:56:47 +0000 2023,1648853181324218368,1648853181324218368,RT @sanemiliomps: @sanemiliomps Prevention on ...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,,...,2,0,False,False,False,en,"{'media': [{'id': 1648176605196603394, 'id_str...",,,
3,Thu Apr 20 00:56:47 +0000 2023,1648853180778954754,1648853180778954754,"RT @laurieallee: From 2020:\n""1/3of patients w...",False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,,...,300,0,False,False,,en,,,,
4,Thu Apr 20 00:56:46 +0000 2023,1648853178018930691,1648853178018930691,"RT @MakisMD: Vista, CA - 40 year old Pastor Ja...",False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,...,697,0,False,False,,en,,,,


In [8]:
# summarise the frame: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 30 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   created_at                 100 non-null    object 
 1   id                         100 non-null    int64  
 2   id_str                     100 non-null    object 
 3   text                       100 non-null    object 
 4   truncated                  100 non-null    bool   
 5   entities                   100 non-null    object 
 6   metadata                   100 non-null    object 
 7   source                     100 non-null    object 
 8   in_reply_to_status_id      7 non-null      float64
 9   in_reply_to_status_id_str  7 non-null      object 
 10  in_reply_to_user_id        7 non-null      float64
 11  in_reply_to_user_id_str    7 non-null      object 
 12  in_reply_to_screen_name    7 non-null      object 
 13  user                       100 non-null    object 


In [30]:
import boto3
from io import StringIO
bucket_name = 'lab-03'

# SECURITY: never embed AWS keys in a notebook -- they leak through version
# control, shared copies and rendered exports. With no explicit keys, boto3
# resolves credentials from its default chain: the AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY environment variables, the shared ~/.aws/credentials
# file, or an attached IAM role.
# NOTE(review): any key pair that was ever pasted into this notebook should be
# treated as compromised and rotated in IAM.
aws_s3_client = boto3.client('s3')

In [33]:
def upload_s3(df,i):
    """Serialise ``df`` to CSV text and store it in S3 under the object key ``i``.

    The CSV contains a header row and omits the DataFrame index. The target
    bucket and the S3 client are read from the module-level names
    ``bucket_name`` and ``aws_s3_client``. Returns ``None``.
    """
    # With no target buffer, to_csv returns the rendered CSV as a string.
    csv_text = df.to_csv(header=True, index=False)
    aws_s3_client.put_object(Bucket=bucket_name, Key=i, Body=csv_text)

# Optional pre-flight check (currently disabled): confirm the bucket exists first.
# response = aws_s3_client.list_buckets()
# buckets = [bucket['Name'] for bucket in response['Buckets']]
# if bucket_name not in buckets:
#     print(f"{bucket_name} bucket does not exist.")

# Upload the DataFrame to S3 as a CSV under a fixed object key.
object_key = 'lab-03_Group7_04202023.csv'
upload_s3(df, object_key)

In [34]:
# Get the objects in the S3 bucket
# A single list_objects_v2 call returns at most 1000 keys, so walk every page
# with the paginator; otherwise a larger bucket would be silently truncated.
try:
    paginator = aws_s3_client.get_paginator('list_objects_v2')
    object_count = 0
    for page in paginator.paginate(Bucket=bucket_name):
        # A page has no 'Contents' key when it holds no objects
        for obj in page.get('Contents', []):
            # Print the name and size of each object in the bucket
            print(f"{obj['Key']} - {obj['Size']} bytes")
            object_count += 1
    if object_count == 0:
        print(f"The bucket {bucket_name} is empty.")
except Exception as e:
    # Deliberately broad: report any listing failure (credentials, network,
    # permissions) in the output instead of aborting the notebook run.
    print(f"Error listing objects in bucket {bucket_name}.")
    print(e)

2022-spring-Group5_20220406220036.csv - 481896 bytes
Group4-Lab320220405195641.csv - 372403 bytes
Group_1_20220331201922.csv - 448076 bytes
Group_1_20220405195545.csv - 462455 bytes
Group_1_20220405195721.csv - 496262 bytes
Group_1_20220406191330.csv - 475513 bytes
Group_1_20230419213518.csv - 40611 bytes
Group_1_S20220327174953.csv - 458645 bytes
Group_3_20220406191456.csv - 487525 bytes
Group_Xiaolan_20210724.csv - 542269 bytes
Group_Y20220403101253.csv - 492660 bytes
Group_teli_20220402.csv - 499374 bytes
lab-03_Group7_04202023.csv - 488960 bytes
lab-3Group7.csv - 488960 bytes
