In [104]:
import requests 
import json 
import tempfile
import pathlib 
from datetime import datetime as dt
from uuid import uuid4
from requests_oauthlib import OAuth1Session
import pandas as pd
import os
import urllib
from os import path

# imports the twitter_secrets python file in which we store the twitter API keys
from twitter_secrets import twitter_secrets as ts



# the write path in which the data will be stored. If it does not yet exist, it will be created
out_path = "/twitter/output/"
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

# establish the connection by providing the twitter API keys from the twitter file
twitter = OAuth1Session(
    client_key=ts.CONSUMER_KEY,
    client_secret=ts.CONSUMER_SECRET,
    resource_owner_key=ts.ACCESS_TOKEN,
    resource_owner_secret=ts.ACCESS_SECRET)

# 01 Retrieving Tweets by Searchtag

In [169]:
# the max number of tweets that will be returned
max_results = 10

# the hashtag or phrase to fetch the tweets for
searchtag = "2021"

# define the query data
query_data = {
    "track": f"{searchtag}".replace(" ", "").lower(),
    "language": "en", # the language to use
    "date_since": "2019-12-01" # retrieve only tweets after this date
}

# the twitter API url (version 1.1)
url = "https://stream.twitter.com/1.1/statuses/filter.json"

# adds the query data to the url
query_url = f"{url}?{'&'.join([f'{k}={v}' for k, v in query_data.items()])}"
print(query_url)

data = []
print(f"Retrieving max {max_results} Tweets:")
with twitter.get(query_url, stream=True) as response:
    for i, raw_tweet in enumerate(response.iter_lines()):
        if i == max_results:
            break
        if raw_tweet != "b''":     
            try:
                tweet = json.loads(raw_tweet)
                userid = tweet['id']
                user = tweet['user']
                username = user['screen_name']
                userlocation = user['location']
                created_at = tweet['created_at']
                text = tweet['text']
                data.append([userid, username, userlocation, created_at, text])
                #print(f"{i+1}/{max_results}: {user}\n @ {created_at }\n: {text}\n")

            except (json.JSONDecodeError, KeyError) as err:
                # In case the JSON fails to decode, we skip this tweet
                print(f"{i+1}/{max_results}: ERROR: encountered a problem with a line of data... \n")
                continue

            # write the json file to disk
            with pathlib.Path(out_path) / f"{dt.now().timestamp()}_{uuid4()}.json" as F:
                F.write_bytes(raw_tweet)
            
df = pd.DataFrame (data, columns = ['userid', 'username', 'userlocation', 'created_at','text'])
df

Retrieving max 5 Tweets:


Unnamed: 0,userid,username,userlocation,created_at,text
0,1345066146647699461,adevour,,Fri Jan 01 17:55:34 +0000 2021,RT @zikora_o: didn't show my body enough this ...
1,1345066146270126081,amoreshawmila,"IND, +62",Fri Jan 01 17:55:34 +0000 2021,not blank 😭 https://t.co/oOaRyC41kM
2,1345066146735640582,taimoor_3,Australia,Fri Jan 01 17:55:34 +0000 2021,RT @Nintendeal: 2021 is a big year for Nintend...
3,1345066146693840896,hallelujahbea,@feelslikehosie,Fri Jan 01 17:55:34 +0000 2021,"august, champagne problems, and cardigan"
4,1345066146706370560,ChrifAlidrissi,fes city .morocco,Fri Jan 01 17:55:34 +0000 2021,"@BBCNews Your Honor, the President of Argentin..."


# 02 Retrieving Tweets from a specific user

In [152]:
# the max number of tweets that will be returned
max_results = 10

# define the query data
screen_name = "Barack Obama"

query_data = {
    "screen_name": f"{screen_name}".replace(" ", "").lower(),
    "date_since": "2019-12-01" # retrieve only tweets after this date
}

# the twitter API url (version 1.1)
url = "https://api.twitter.com/1.1/statuses/user_timeline.json"

# adds the query data to the url
query_url = f"{url}?{'&'.join([f'{k}={v}' for k, v in query_data.items()])}"
print(query_url)

data = []
print(f"Retrieving max {max_results} Tweets:")
with twitter.get(query_url, stream=True) as response:
    for i, raw_tweet in enumerate(response.iter_lines()):
        if i == max_results:
            break
        if raw_tweet != "b''":     
            try:
                tweets = json.loads(raw_tweet)
                for tweet in tweets: 
                    #print(str(tweet))
                    created_at = tweet['created_at']
                    text = tweet['text']
                    retweet_count = tweet['retweet_count']
                    favorite_count = tweet['favorite_count']
                    user = tweet['user']
                    username = user['screen_name']
                    data.append([screen_name, created_at, retweet_count, favorite_count, text])

            except (json.JSONDecodeError, KeyError) as err:
                # In case the JSON fails to decode, we skip this tweet
                print(f"{i+1}/{max_results}: ERROR: encountered a problem with a line of data... \n")
                continue

            # write the json file to disk
            with pathlib.Path(out_path) / f"{dt.now().timestamp()}_{uuid4()}.json" as F:
                F.write_bytes(raw_tweet)
                
df = pd.DataFrame (data, columns = ['screen_name', 'created_at', 'retweet_count', 'favorite_count', 'text'])
df

Retrieving max 10 Tweets:


TypeError: string indices must be integers

# 03 Retrieving Images from Twitter

In [161]:
# the max number of tweets that will be returned
max_results = 20

# the hashtag or phrase to fetch the tweets for
searchtag = "chart"

# define the query data
query_data = {
    "track": f"{searchtag}".replace(" ", "").lower(),
    "language": "en", # the language to use" 
    "has": "media"
}
# the twitter API url (version 1.1)
url = "https://stream.twitter.com/1.1/statuses/filter.json"

# adds the query data to the url
query_url = f"{url}?{'&'.join([f'{k}={v}' for k, v in query_data.items()])}"
print(query_url)

def defineFolderName(searchtag):
    # ddmmYY-HMS
    now = dt.now()
    dt_string = now.strftime("%d%m%Y-%H%M%S")
    
    if len(searchtag) > 20:
        return dt_string
    else:
        return dt_string + "-" + searchtag

def createDir(savepath):
    try:
        os.makedirs(savepath)
    except OSError:
        print ("Creation of the directory %s failed" % savepath)
        if path.exists(savepath):
            print("path exists")
    else:
        print ("Successfully created the directory %s " % savepath)

def save_images(savepath, tweet, i):
    z = 1
    print(tweet)
    entities = tweet['entities']
    tweet_url = tweet['id']
    created_at = tweet['created_at']
    tweet_media = entities['media']
    for media in tweet_media: # a tweet can have multiple images/videos
        media_url = str(media['media_url_https'])
        file_name = media['id']
        try:
            pic = urllib.request.urlopen(media_url)
            file_path = savepath + "/" + searchtag + str(i) + "-" + str(z) + ".jpg"
            with open(file_path, 'wb') as localFile:
                localFile.write(pic.read())
            z += 1
            #save image origin info
            data.append([tweet_url, created_at, file_name, media_url])
        except Exception as e:
            print('exception at counter ' + str(counter))
    
# detect the current working directory and print it
base_path = os.getcwd()
print ("The current working directory is %s" % base_path)
img_dir = '/twitter/downloaded_media/'
data = []

# the write path in which the data will be stored. If it does not yet exist, it will be created
file_path = base_path + img_dir

#createDir(file_path)
unique_folder_path = file_path + defineFolderName(searchtag)
createDir(unique_folder_path) 

print(f"Retrieving a total max of {max_results} Tweets:")
with twitter.get(query_url, stream=True) as response:
    for i, raw_tweet in enumerate(response.iter_lines()):
        if i == max_results:
            break
        try:
            tweet = json.loads(raw_tweet)
            save_images(unique_folder_path, tweet, i)
            
        except (json.JSONDecodeError, KeyError) as err:
            # In case the JSON fails to decode, we skip this tweet
            print(f"{i+1}/{max_results}: ERROR: encountered a problem with a line of data... ")
            continue

image_df = pd.DataFrame (data, columns = ['created_at', 'tweet_id', 'media_id', 'media_url'])
image_df

https://stream.twitter.com/1.1/statuses/filter.json?track=chart&language=en&has=media
The current working directory is C:\Users\Flo\relataly-public-python-tutorials
Successfully created the directory C:\Users\Flo\relataly-public-python-tutorials/twitter/downloaded_media/02012021-010959-chart 
Retrieving a total max of 20 Tweets:
{'created_at': 'Sat Jan 02 00:09:57 +0000 2021', 'id': 1345160363873079296, 'id_str': '1345160363873079296', 'text': 'RT @LilySimpson1312: The Cancel Culture alignment chart https://t.co/euiRD6JxXz', 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1053020449666404352, 'id_str': '1053020449666404352', 'name': 'Belfegordo', 'screen_name': 'belfegordo', 'location': 'Mi sofá. Infierno.', 'url': None, 'description': 'Demonio

{'created_at': 'Sat Jan 02 00:10:00 +0000 2021', 'id': 1345160376514703360, 'id_str': '1345160376514703360', 'text': '@palmerwilliamj @sadie_shatterly @StewieG24665333 @vegasbarbie7777 Ohhhh that is Salty Dog! I did not know that. I… https://t.co/qU3PovV24I', 'display_text_range': [67, 140], 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>', 'truncated': True, 'in_reply_to_status_id': 1345159915577495560, 'in_reply_to_status_id_str': '1345159915577495560', 'in_reply_to_user_id': 2178859262, 'in_reply_to_user_id_str': '2178859262', 'in_reply_to_screen_name': 'palmerwilliamj', 'user': {'id': 1298982719133634562, 'id_str': '1298982719133634562', 'name': 'Diesel Cat', 'screen_name': '_D724964697', 'location': None, 'url': None, 'description': "Married, mom of 2 furry kids. My dm's are open, and they stay private. Be kind and always set boundaries.", 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 321, 'friends_count': 4

{'created_at': 'Sat Jan 02 00:10:01 +0000 2021', 'id': 1345160383493976071, 'id_str': '1345160383493976071', 'text': 'RT @Covid19DataUK: Bed occupancy is clearly below average for this time of year - and has been since the start of the pandemic - so why the…', 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 880836775920500736, 'id_str': '880836775920500736', 'name': 'sofia fachadas', 'screen_name': 'sofia_fachadas', 'location': 'Cardiff, Wales', 'url': None, 'description': None, 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 8, 'friends_count': 103, 'listed_count': 0, 'favourites_count': 401, 'statuses_count': 51, 'created_at': 'Fri Jun 30 17:13:47 +0000 2017', 'utc_offset': None, 'time_zone': None, 'geo_en

{'created_at': 'Sat Jan 02 00:10:03 +0000 2021', 'id': 1345160389936480256, 'id_str': '1345160389936480256', 'text': '@Vanillatime619 @Safety_Koos @Dan_Hope Day has him as RB1 on the chart', 'display_text_range': [39, 70], 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'truncated': False, 'in_reply_to_status_id': 1345150599008931840, 'in_reply_to_status_id_str': '1345150599008931840', 'in_reply_to_user_id': 4751956196, 'in_reply_to_user_id_str': '4751956196', 'in_reply_to_screen_name': 'Vanillatime619', 'user': {'id': 1060599967650320384, 'id_str': '1060599967650320384', 'name': 'Jack McMultry', 'screen_name': 'Patriot775', 'location': 'Monticello or The Boat', 'url': None, 'description': '7Society 2A Forgiven & Free. Know a thing or 2, because I’ve seen a thing or 2. I’m Jack, a Fanatic for Buckeye Football', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 136, 'friends_count': 615, 'listed_count': 0

{'created_at': 'Sat Jan 02 00:10:05 +0000 2021', 'id': 1345160396420866048, 'id_str': '1345160396420866048', 'text': 'RT @BLACKPINKSTATS5: Best Selling Female Group Albums Per Year on Hanteo Year-end chart:\n\n2020: THE ALBUM @BLACKPINK \n2019: Feel Special\n20…', 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1065028681633067008, 'id_str': '1065028681633067008', 'name': 'Keny Diu Toscano', 'screen_name': 'DiuKeny', 'location': None, 'url': None, 'description': None, 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 53, 'friends_count': 289, 'listed_count': 0, 'favourites_count': 61460, 'statuses_count': 57134, 'created_at': 'Tue Nov 20 23:46:39 +0000 2018', 'utc_offset': None, 'time_zone': None, 'geo_enabled

{'created_at': 'Sat Jan 02 00:10:10 +0000 2021', 'id': 1345160418377961472, 'id_str': '1345160418377961472', 'text': 'RT @Jungkook_Nepal: The most popular Korean celebrities in China this month\nAccording to the Monthly AiMan Index Chart, these are the top 1…', 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1032578459573178369, 'id_str': '1032578459573178369', 'name': 'KTH1 JJK1 ≮ 💜', 'screen_name': 'btstaekook0405', 'location': 'TaeKook World', 'url': None, 'description': 'we felt incomplete without you (refering to Taehyung) - Jungkook', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 498, 'friends_count': 1265, 'listed_count': 5, 'favourites_count': 283890, 'statuses_count': 259995, 'created_at': 'Thu Au

{'created_at': 'Sat Jan 02 00:10:13 +0000 2021', 'id': 1345160431015440386, 'id_str': '1345160431015440386', 'text': 'RT @ChartsGOT7: On the 22nd Breath Of Love will be shipped to US ahgases. All the sales will count towards Billboard and as far as we under…', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1146601070279430144, 'id_str': '1146601070279430144', 'name': 'Mark and Jacks One out of a million | ♡༄⁷ ᶠᵒʳ ⁷', 'screen_name': 'gotmarksoned', 'location': 'Bi,cc and carrd under pinned ', 'url': 'https://gotmarksoned.carrd.co/', 'description': '★(I’m ND)| Marky/Jacky / hyung line biased .7 or nothing ,7 or never She/Her, Native/Indig - Shawnee ( got7 n shinee mostly )', 'translator_type': 'none', 'protected': False, 'verified': False, 'follow

{'created_at': 'Sat Jan 02 00:10:15 +0000 2021', 'id': 1345160441400442880, 'id_str': '1345160441400442880', 'text': 'RT @TSwiftFTC: 📈 Taylor Swift\'s "Lover" (2019) is currently the 7th Highest charting female album entering Top 35 on US iTunes Albums chart…', 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 1117807081384321024, 'id_str': '1117807081384321024', 'name': 'ambrossjorda', 'screen_name': 'ambrossjorda', 'location': 'Angono Rizal, Rizal Region', 'url': None, 'description': "I come back stronger than a 90's trend\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nhttp://facebook.com/ambrossjorda             \n\nhttp://Instagram.com/ambrossjorda", 'translator_type': 'none', 'protected': False, 'verified'

{'created_at': 'Sat Jan 02 00:10:16 +0000 2021', 'id': 1345160445682921474, 'id_str': '1345160445682921474', 'text': 'RT @Schuldensuehner: Biggest monetary experiment in economic history: Combined balance sheet of G10 CenBanks has topped 50% of G10 GDP due…', 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 9933702, 'id_str': '9933702', 'name': '☀️💧Leon-Gerard Vandenberg 🇨🇦🇦🇺', 'screen_name': 'Leon_Vandenberg', 'location': 'Toronto-Australia-Netherlands ', 'url': 'http://linkedin.com/in/leongvandenberg', 'description': 'CEO Systems Design Engineer+DAD #Blockchain #Wireless @SunifiedEnergy #eSIM #unbanked #SolarPunk #BioGenomics🧬@GMDxCo @Fuzo #BigData #IoT #ML #AI🌞https://t.co/67fdoni0fj', 'translator_type': 'none', 'protected': False, 'verified': 

Unnamed: 0,created_at,tweet_id,media_id,media_url
0,1345160363873079296,Sat Jan 02 00:09:57 +0000 2021,1344538675955126272,https://pbs.twimg.com/media/EqjCzVcUUAAhcNg.jpg
1,1345160378615926784,Sat Jan 02 00:10:00 +0000 2021,1345159268337659904,https://pbs.twimg.com/media/Eqr3OkOXMAAilXr.jpg
2,1345160383775072256,Sat Jan 02 00:10:02 +0000 2021,1345000632911069189,https://pbs.twimg.com/media/Eqpm8xNXAAUDNop.jpg
