In [2]:
import requests 
import json 
import tempfile
import pathlib 
from datetime import datetime as dt
from uuid import uuid4
from requests_oauthlib import OAuth1Session
import pandas as pd
import os
import urllib
from os import path

# imports the twitter_secrets python file in which we store the twitter API keys
from twitter_secrets import twitter_secrets as ts



# the write path in which the data will be stored. If it does not yet exist, it will be created
out_path = "/twitter/output/"
pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)

# establish the connection by providing the twitter API keys from the twitter file
twitter = OAuth1Session(
    client_key=ts.CONSUMER_KEY,
    client_secret=ts.CONSUMER_SECRET,
    resource_owner_key=ts.ACCESS_TOKEN,
    resource_owner_secret=ts.ACCESS_SECRET)

# 01 Retrieving Tweets by Searchtag

In [3]:
# the max number of tweets that will be returned
max_results = 10

# the hashtag or phrase to fetch the tweets for
searchtag = "2021"

# define the query data
query_data = {
    "track": f"{searchtag}".replace(" ", "").lower(),
    "language": "en", # the language to use
    "date_since": "2019-12-01" # retrieve only tweets after this date
}

# the twitter API url (version 1.1)
url = "https://stream.twitter.com/1.1/statuses/filter.json"

# adds the query data to the url
query_url = f"{url}?{'&'.join([f'{k}={v}' for k, v in query_data.items()])}"
print(query_url)

data = []
print(f"Retrieving max {max_results} Tweets:")
with twitter.get(query_url, stream=True) as response:
    for i, raw_tweet in enumerate(response.iter_lines()):
        if i == max_results:
            break
        if raw_tweet != "b''":     
            try:
                tweet = json.loads(raw_tweet)
                userid = tweet['id']
                user = tweet['user']
                username = user['screen_name']
                userlocation = user['location']
                created_at = tweet['created_at']
                text = tweet['text']
                data.append([userid, username, userlocation, created_at, text])
                #print(f"{i+1}/{max_results}: {user}\n @ {created_at }\n: {text}\n")

            except (json.JSONDecodeError, KeyError) as err:
                # In case the JSON fails to decode, we skip this tweet
                print(f"{i+1}/{max_results}: ERROR: encountered a problem with a line of data... \n")
                continue

            # write the json file to disk
            with pathlib.Path(out_path) / f"{dt.now().timestamp()}_{uuid4()}.json" as F:
                F.write_bytes(raw_tweet)
            
df = pd.DataFrame (data, columns = ['userid', 'username', 'userlocation', 'created_at','text'])
df

https://stream.twitter.com/1.1/statuses/filter.json?track=2021&language=en&date_since=2019-12-01
Retrieving max 10 Tweets:


Unnamed: 0,userid,username,userlocation,created_at,text
0,1345400537156616194,classicwhiskey,Yavin IV,Sat Jan 02 16:04:19 +0000 2021,"ask yoni\n\nIdk what that means, but okay XD"
1,1345400537164857344,ChucklingBears,,Sat Jan 02 16:04:19 +0000 2021,Finally! The word is out. So much has been hap...
2,1345400537320095745,Lastone020501,,Sat Jan 02 16:04:19 +0000 2021,RT @Raze0013: ☀️🐶#MusicBNK48 ✨✨\nHappy New Yea...
3,1345400537248899072,dasGielchen,Germany,Sat Jan 02 16:04:19 +0000 2021,RT @Lindatiny21: ATINY'S VOTE FOR TODAY! Give...
4,1345400537311825923,sherijr,Baltimore MD Citizen 2012,Sat Jan 02 16:04:19 +0000 2021,RT @BoycottUtah: Fellow citizens of Georgia. Y...
5,1345400537345265665,kvetchup,"London, UK to Los Angeles, CA",Sat Jan 02 16:04:19 +0000 2021,@qorquiq @nicholestrano Jacket over game pajam...
6,1345400537328414720,fxyxwxe,🍷•🧀•🍞•🦪•🍋•🧂•🍕•🍟•🍦•🥝,Sat Jan 02 16:04:19 +0000 2021,"RT @asdfghjunyeol: It's 2021, but my heart sti..."
7,1345400537395691523,tarungarg87,,Sat Jan 02 16:04:19 +0000 2021,RT @FaheemYounus: Should I continue my Twitter...
8,1345400537626402816,BurauFred,Alabama,Sat Jan 02 16:04:19 +0000 2021,"RT @RudyGiuliani: Sunday January 3, 2021, Dr. ..."
9,1345400537584431104,TheBenduPodcast,"Austin, TX",Sat Jan 02 16:04:19 +0000 2021,RT @sw_holocron: REVIEW: Star Wars The High Re...


# 02 Retrieving Tweets from a specific user

In [4]:
# the max number of tweets that will be returned
max_results = 10

# define the query data
screen_name = "Barack Obama"

query_data = {
    "screen_name": f"{screen_name}".replace(" ", "").lower(),
    "date_since": "2019-12-01" # retrieve only tweets after this date
}

# the twitter API url (version 1.1)
url = "https://api.twitter.com/1.1/statuses/user_timeline.json"

# adds the query data to the url
query_url = f"{url}?{'&'.join([f'{k}={v}' for k, v in query_data.items()])}"
print(query_url)

data = []
print(f"Retrieving max {max_results} Tweets:")
with twitter.get(query_url, stream=True) as response:
    for i, raw_tweet in enumerate(response.iter_lines()):
        if i == max_results:
            break
        if raw_tweet != "b''":     
            try:
                tweets = json.loads(raw_tweet)
                for tweet in tweets: 
                    #print(str(tweet))
                    created_at = tweet['created_at']
                    text = tweet['text']
                    retweet_count = tweet['retweet_count']
                    favorite_count = tweet['favorite_count']
                    user = tweet['user']
                    username = user['screen_name']
                    data.append([screen_name, created_at, retweet_count, favorite_count, text])

            except (json.JSONDecodeError, KeyError) as err:
                # In case the JSON fails to decode, we skip this tweet
                print(f"{i+1}/{max_results}: ERROR: encountered a problem with a line of data... \n")
                continue

            # write the json file to disk
            with pathlib.Path(out_path) / f"{dt.now().timestamp()}_{uuid4()}.json" as F:
                F.write_bytes(raw_tweet)
                
df = pd.DataFrame (data, columns = ['screen_name', 'created_at', 'retweet_count', 'favorite_count', 'text'])
df

https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=barackobama&date_since=2019-12-01
Retrieving max 10 Tweets:


Unnamed: 0,screen_name,created_at,retweet_count,favorite_count,text
0,Barack Obama,Fri Jan 01 18:25:46 +0000 2021,2244,21721,And here’s a story that reminds us of the powe...
1,Barack Obama,Fri Jan 01 18:25:45 +0000 2021,26121,290527,After a year that has tested us in unimaginabl...
2,Barack Obama,Tue Dec 29 19:11:57 +0000 2020,8443,43484,We’re just one week away from the U.S. Senate ...
3,Barack Obama,Tue Dec 29 17:10:10 +0000 2020,2217,11630,The redistricting process in 2021 will be a sn...
4,Barack Obama,Fri Dec 25 00:01:40 +0000 2020,11809,171442,This Christmas looks different for all of us. ...
5,Barack Obama,Tue Dec 22 18:32:11 +0000 2020,4872,30478,It’s unconscionable that there are families wo...
6,Barack Obama,Mon Dec 21 22:30:31 +0000 2020,23703,230213,"With COVID cases surging worse than ever, gett..."
7,Barack Obama,Sat Dec 19 20:01:04 +0000 2020,33242,286359,Here are some of my favorite songs of the year...
8,Barack Obama,Fri Dec 18 14:01:54 +0000 2020,10629,127740,"Like everyone else, we were stuck inside a lot..."
9,Barack Obama,Thu Dec 17 23:01:50 +0000 2020,2220,17275,"From Willie Mays to Mamie Johnson, the players..."


# 03 Retrieving Images from Twitter

In [6]:
# the max number of tweets that will be returned
max_results = 20

# the hashtag or phrase to fetch the tweets for
searchtag = "chart"

# define the query data
query_data = {
    "track": f"{searchtag}".replace(" ", "").lower(),
    "language": "en", # the language to use" 
    "has": "media"
}
# the twitter API url (version 1.1)
url = "https://stream.twitter.com/1.1/statuses/filter.json"

# adds the query data to the url
query_url = f"{url}?{'&'.join([f'{k}={v}' for k, v in query_data.items()])}"
print(query_url)

def defineFolderName(searchtag):
    # ddmmYY-HMS
    now = dt.now()
    dt_string = now.strftime("%d%m%Y-%H%M%S")
    
    if len(searchtag) > 20:
        return dt_string
    else:
        return dt_string + "-" + searchtag

def createDir(savepath):
    try:
        os.makedirs(savepath)
    except OSError:
        print ("Creation of the directory %s failed" % savepath)
        if path.exists(savepath):
            print("path exists")
    else:
        print ("Successfully created the directory %s " % savepath)

def save_images(savepath, tweet, i):
    z = 1
    #print(tweet)
    entities = tweet['entities']
    tweet_url = tweet['id']
    created_at = tweet['created_at']
    tweet_media = entities['media']
    for media in tweet_media: # a tweet can have multiple images/videos
        media_url = str(media['media_url_https'])
        file_name = media['id']
        try:
            pic = urllib.request.urlopen(media_url)
            file_path = savepath + "/" + searchtag + str(i) + "-" + str(z) + ".jpg"
            with open(file_path, 'wb') as localFile:
                localFile.write(pic.read())
            z += 1
            #save image origin info
            data.append([tweet_url, created_at, file_name, media_url])
        except Exception as e:
            print('exception at counter ' + str(counter))
    
# detect the current working directory and print it
base_path = os.getcwd()
print ("The current working directory is %s" % base_path)
img_dir = '/twitter/downloaded_media/'
data = []

# the write path in which the data will be stored. If it does not yet exist, it will be created
file_path = base_path + img_dir

#createDir(file_path)
unique_folder_path = file_path + defineFolderName(searchtag)
createDir(unique_folder_path) 

print(f"Retrieving a total max of {max_results} Tweets:")
with twitter.get(query_url, stream=True) as response:
    for i, raw_tweet in enumerate(response.iter_lines()):
        if i == max_results:
            break
        try:
            tweet = json.loads(raw_tweet)
            save_images(unique_folder_path, tweet, i)
            
        except (json.JSONDecodeError, KeyError) as err:
            # In case the JSON fails to decode, we skip this tweet
            print(f"{i+1}/{max_results}: ERROR: encountered a problem with a line of data... ")
            continue

image_df = pd.DataFrame (data, columns = ['created_at', 'tweet_id', 'media_id', 'media_url'])
image_df

https://stream.twitter.com/1.1/statuses/filter.json?track=chart&language=en&has=media
The current working directory is C:\Users\Flo\relataly-public-python-tutorials
Successfully created the directory C:\Users\Flo\relataly-public-python-tutorials/twitter/downloaded_media/02012021-170503-chart 
Retrieving a total max of 20 Tweets:
1/20: ERROR: encountered a problem with a line of data... 
2/20: ERROR: encountered a problem with a line of data... 
3/20: ERROR: encountered a problem with a line of data... 
4/20: ERROR: encountered a problem with a line of data... 
5/20: ERROR: encountered a problem with a line of data... 
6/20: ERROR: encountered a problem with a line of data... 
7/20: ERROR: encountered a problem with a line of data... 
8/20: ERROR: encountered a problem with a line of data... 
9/20: ERROR: encountered a problem with a line of data... 
10/20: ERROR: encountered a problem with a line of data... 
11/20: ERROR: encountered a problem with a line of data... 
12/20: ERROR: enco

Unnamed: 0,created_at,tweet_id,media_id,media_url
