In [None]:
import tweepy
import pandas as pd
import json
import csv
import time
import requests
import os
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
import textwrap
from tqdm import tqdm

pd.set_option('display.max_colwidth', None)  # None disables the column-width limit, so long text cells are displayed in full

In [None]:
# Read the API credential from the environment instead of hardcoding it in the
# notebook, so the token is never saved or committed together with the code.
# Set it before starting Jupyter, e.g. `export TWITTER_BEARER_TOKEN=...`.
BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN", "")
if not BEARER_TOKEN:
    print("Warning: TWITTER_BEARER_TOKEN is not set; API requests will be rejected.")

# wait_on_rate_limit=True makes tweepy sleep until the rate-limit window resets
# instead of raising, which keeps the multi-page collection loop from aborting.
CLIENT = tweepy.Client(BEARER_TOKEN, wait_on_rate_limit=True)

In [None]:
# Hashtags to match; they are OR-ed together inside a single parenthesised group.
_bc_ab_hashtags = ['#BCwildfire', '#BCfire', '#ABWildfire', '#albertawildfire', '#ABFire']
# Operators appended to the hashtag group: images only (no videos), English,
# and no retweets, quotes or replies.
_query_filters = '-has:videos has:images lang:en -is:retweet -is:quote -is:reply'
bc_ab_query = f"({' OR '.join(_bc_ab_hashtags)}) {_query_filters}"
# Alternative query that also includes the Jasper hashtags (currently disabled):
#bc_ab_jasper_query = '(#JasperStrong OR #JasperWildfire OR #JasperAB OR #BCwildfire OR #BCfire OR #ABWildfire OR #albertawildfire OR #ABFire) -has:videos has:images lang:en -is:retweet -is:quote -is:reply'

# Collection window passed to the search as start_time / end_time.
wildfire_start = '2022-05-01T00:00:00Z'  # May 1st
wildfire_end = '2022-10-01T00:00:00Z'    # Oct 1st

In [None]:
# Basic-tier $100/mo
def scrapeTweets(query, next_token=None):
    """Request one page of results via ``CLIENT.search_recent_tweets``.

    NOTE: this notebook defines ``scrapeTweets`` in two cells (Basic tier and
    Pro tier); whichever cell was executed last is the one in effect. The
    collection loop calls it with four positional arguments, which matches the
    Pro-tier signature, not this one.

    Args:
        query: Search query string.
        next_token: Pagination token from a previous page's ``meta``, or
            ``None`` to request the first page.

    Returns:
        The response object returned by tweepy for this page.
    """
    # Fields requested for each object type returned by the search.
    requested_fields = {
        'media_fields': ['media_key', 'type', 'preview_image_url', 'url', 'public_metrics', 'duration_ms', 'width'],
        'place_fields': ['country', 'country_code', 'full_name', 'geo', 'id', 'name'],
        'tweet_fields': ['created_at', 'geo', 'public_metrics', 'text', 'id', 'entities', 'lang', 'attachments'],
        'user_fields': ['username', 'name', 'public_metrics', 'description', 'location', 'created_at', 'entities',
                        'id', 'pinned_tweet_id', 'profile_image_url', 'protected', 'url', 'verified', 'withheld'],
    }
    # Expansions pull the referenced media, author and place objects into `includes`.
    expanded_objects = ['attachments.media_keys', 'author_id', 'geo.place_id']

    return CLIENT.search_recent_tweets(
        query=query,
        max_results=100,
        next_token=next_token,
        expansions=expanded_objects,
        **requested_fields,
    )

In [None]:
# Pro-tier $5000/mo
def scrapeTweets(query, start_time, end_time, next_token=None):
    """Request one page of results via ``CLIENT.search_all_tweets``.

    Args:
        query: Search query string.
        start_time: Start of the search window, forwarded as ``start_time``.
        end_time: End of the search window, forwarded as ``end_time``.
        next_token: Pagination token from a previous page's ``meta``, or
            ``None`` to request the first page.

    Returns:
        The response object returned by tweepy for this page (up to 500 tweets
        are requested per page).
    """
    # Fields requested for each object type returned by the search.
    requested_fields = {
        'media_fields': ['media_key', 'type', 'preview_image_url', 'url', 'public_metrics', 'duration_ms', 'width'],
        'place_fields': ['country', 'country_code', 'full_name', 'geo', 'id', 'name'],
        'tweet_fields': ['created_at', 'geo', 'public_metrics', 'text', 'id', 'entities', 'lang', 'attachments'],
        'user_fields': ['username', 'name', 'public_metrics', 'description', 'location', 'created_at', 'entities',
                        'id', 'pinned_tweet_id', 'profile_image_url', 'protected', 'url', 'verified', 'withheld'],
    }
    # Expansions pull the referenced media, author and place objects into `includes`.
    expanded_objects = ['attachments.media_keys', 'author_id', 'geo.place_id']

    return CLIENT.search_all_tweets(
        query=query,
        max_results=500,
        start_time=start_time,
        end_time=end_time,
        next_token=next_token,
        expansions=expanded_objects,
        **requested_fields,
    )

In [None]:
# Collect up to MAX_PAGES pages of results, following the pagination token
# returned in each page's `meta` until the API stops providing one.
MAX_PAGES = 200
PAGE_DELAY_S = 3  # pause between requests

responses = []
nxt_tkn = None
for _ in range(MAX_PAGES):
    response = scrapeTweets(bc_ab_query, wildfire_start, wildfire_end, nxt_tkn)
    response = response._asdict()
    responses.append(response)
    # A missing `next_token` means this was the last page. Look it up explicitly
    # rather than using a bare `except:` as control flow, which would also hide
    # unrelated errors.
    nxt_tkn = (response.get('meta') or {}).get('next_token')
    if nxt_tkn is None:
        break
    print('Collected', len(responses), 'pages of tweets...')
    time.sleep(PAGE_DELAY_S)

In [None]:
# Report how many pages were collected and how many tweets each page holds.
print('Collected', len(responses), 'pages of tweets...')
for page_no, page in enumerate(responses):
    page_tweets = page.get('data')
    # A page without results has a falsy `data` entry; count it as 0.
    print('Page', page_no, 'length:', len(page_tweets) if page_tweets else 0)

In [None]:
# Totals across all collected pages.
num_tweets = 0
num_photos = 0
num_users = 0

for page in responses:
    # A page with no results has no tweets and may lack the 'media' / 'users'
    # keys under `includes`; treat every missing piece as empty instead of
    # raising KeyError.
    includes = page.get('includes') or {}
    num_tweets += len(page.get('data') or [])
    num_photos += len(includes.get('media') or [])
    # Users are counted per page, so an author appearing on several pages is
    # counted once per page.
    num_users += len(includes.get('users') or [])

print('Total num. tweets:', num_tweets)
print('Total num. media:', num_photos)
print('Total num. users:', num_users)

In [None]:
# build dataframe from responses - for scraping with images
# One output row is produced per (tweet, media key) pair found in the tweet's
# `entities.urls` entries.
_DATASET_COLUMNS = ['tweet_id', 'img_id', 'posted_at', 'author_id', 'author_loc', 'author_name',
                    'author_usrname', 'text', 'media_keys', 'urls', 'path']


def _img_id_and_path(tweet_id, url):
    """Derive (img_id, local file name) from a media URL; (None, None) if there is no URL."""
    if url is None:
        return None, None
    # The last URL segment is the file name, e.g. ".../media/<img_id>.jpg".
    file_name = url.rsplit('/', 1)[-1]
    # splitext keeps the leading dot in `ext` and tolerates a missing extension.
    img_id, ext = os.path.splitext(file_name)
    return img_id, f"{tweet_id}_{img_id}{ext}"


# Collect plain dict records and build the frame once at the end; appending to a
# DataFrame row by row re-allocates on every insert and is quadratic overall.
_records = []
for page in responses:
    includes = page.get('includes') or {}
    # media_key -> direct image URL (None when the media item exposes no `url`)
    media_dict = {media['media_key']: media.get('url') for media in includes.get('media') or []}
    # author id -> user object, so each tweet's author is a dict lookup instead
    # of a scan over all users; setdefault keeps the first entry for an id.
    users_by_id = {}
    for user in includes.get('users') or []:
        users_by_id.setdefault(user['id'], user)

    # `data` is empty/None for a page without results.
    for tweet in page.get('data') or []:
        tweet_id = tweet['id']
        created_at = tweet['created_at']
        author_id = tweet['author_id']
        text = tweet['text']

        # Media keys are read from the tweet's URL entities; tolerate tweets
        # with no entities or no `urls` list.
        entities = tweet.get('entities') or {}
        media_keys = [d['media_key'] for d in entities.get('urls') or [] if 'media_key' in d]

        # Find the matching user
        user = users_by_id.get(author_id)
        if user:
            author_loc = user.get('location', 'n/a')
            author_name = user.get('name', 'n/a')
            author_usrname = user.get('username', 'n/a')
        else:
            author_loc = 'n/a'
            author_name = 'n/a'
            author_usrname = 'n/a'

        for media_key in media_keys:
            # Keys with no matching media object map to None and produce a row
            # with urls=None.
            url = media_dict.get(media_key)
            img_id, path = _img_id_and_path(tweet_id, url)
            _records.append({
                "tweet_id": tweet_id,
                "img_id": img_id,
                "posted_at": created_at,
                "author_id": author_id,
                "author_loc": author_loc,
                "author_name": author_name,
                "author_usrname": author_usrname,
                "text": text,
                "media_keys": media_key,
                "urls": url,
                "path": path,
            })

# `columns=` fixes the column order and yields an empty, correctly-shaped frame
# when nothing was collected.
new_df = pd.DataFrame(_records, columns=_DATASET_COLUMNS)

In [None]:
# drop rows with videos instead of images (rows whose `urls` value is missing)
# reset_index returns a new frame, so its result must be assigned; the chained
# form also avoids in-place mutation and is safe to re-run.
new_df = new_df.dropna(subset=['urls']).reset_index(drop=True)
new_df

In [None]:
# Display the assembled dataset as this cell's output.
new_df

In [None]:
# Persist the dataset to dataset.csv; index=False leaves the row index out of the file.
new_df.to_csv('dataset.csv', index=False)

In [None]:
# View some samples
# Show up to 9 randomly chosen rows in a 3x3 grid, each image titled with its tweet text.
# Capping at len(new_df) avoids the ValueError that sample(n=9) raises on a smaller frame.
sampled_df = new_df.sample(n=min(9, len(new_df)))
# Create a plot
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
axes = axes.flatten()
for ax, (_, row) in zip(axes, sampled_df.iterrows()):
    # Wrap the text to fit the plot
    wrapped_text = "\n".join(textwrap.wrap(row['text'], width=40))
    try:
        # Fetch the image from the URL; the timeout keeps the cell from hanging
        # on a stalled connection, and raise_for_status rejects error pages.
        img_response = requests.get(row['urls'], timeout=30)
        img_response.raise_for_status()
        img = Image.open(BytesIO(img_response.content))
    except (requests.RequestException, OSError) as err:
        # One unavailable or undecodable image should not abort the whole figure.
        ax.text(0.5, 0.5, f"Image unavailable\n({type(err).__name__})",
                ha='center', va='center', transform=ax.transAxes)
    else:
        ax.imshow(img)
    # Plot the image with the wrapped text as the title
    ax.set_title(wrapped_text, fontsize=12)
# Hide the axis frame on every panel, including any left unused.
for ax in axes:
    ax.axis('off')
plt.subplots_adjust(hspace=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Download images from dataset to a local folder
def download_imgs(df, folder_pth, timeout=30, log_path='log.txt'):
    """Download every image listed in ``df`` into ``folder_pth``.

    Files that already exist in the folder are skipped, so the function can be
    re-run to resume an interrupted download.

    Args:
        df: DataFrame with a ``urls`` column (image URL) and a ``path`` column
            (file name to save under).
        folder_pth: Destination folder; created if it does not exist.
        timeout: Seconds to wait for each HTTP request before giving up.
        log_path: Text file that per-image log messages are appended to.

    Returns:
        None. Outcomes are recorded in ``log_path``.
    """
    # open() does not create missing directories, so make sure the target exists.
    os.makedirs(folder_pth, exist_ok=True)

    failed_list = []
    log_messages = []
    last_idx = len(df) - 1

    try:
        for i, row in tqdm(df.iterrows(), total=len(df), desc="Downloading images"):
            url = row['urls']
            path = row['path']
            image_path = os.path.join(folder_pth, path)

            if os.path.exists(image_path):
                log_messages.append(f"File already exists, skipping: {i}/{last_idx} {path}")
                continue

            try:
                response = requests.get(url, timeout=timeout)
            except requests.RequestException as err:
                # Connection errors and timeouts count as a failed image
                # instead of aborting the whole run.
                log_messages.append(f"Failed to download: {i}/{last_idx} {url} ({err})")
                failed_list.append((i, path))
                continue

            if response.status_code == 200:
                with open(image_path, 'wb') as file:
                    file.write(response.content)
                log_messages.append(f"Downloaded and saved: {i}/{last_idx} {path}")
            else:
                log_messages.append(f"Failed to download: {i}/{last_idx} {url}")
                failed_list.append((i, path))
    finally:
        # Flush the log even if the loop is interrupted or raises, so the
        # record of what was already processed is not lost.
        if failed_list:
            log_messages.append(f'The following number of images failed to download: {len(failed_list)}')
            log_messages.append(f"{failed_list}")

        with open(log_path, 'a') as log_file:
            log_file.writelines(message + '\n' for message in log_messages)

# Load the dataset from dataset.csv rather than reusing the in-memory frame.
df = pd.read_csv('dataset.csv')
# Download each row's image into dataset_image_folder; outcomes are appended to log.txt.
download_imgs(df, 'dataset_image_folder')