In [56]:
!pip install -q selenium undetected-chromedriver pandas tqdm
!wget -q -O chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!apt-get update -y > /dev/null
!apt-get install -y libvulkan1 libnss3 libxss1 libgconf-2-4 libasound2 > /dev/null
!dpkg -i chrome.deb
!apt-get -f install -y > /dev/null

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
(Reading database ... 126633 files and directories currently installed.)
Preparing to unpack chrome.deb ...
Unpacking google-chrome-stable (135.0.7049.95-1) over (135.0.7049.95-1) ...
Setting up google-chrome-stable (135.0.7049.95-1) ...
Processing triggers for mailcap (3.70+nmu1ubuntu1) ...
Processing triggers for man-db (2.10.2-1) ...


In [57]:
!google-chrome --version

Google Chrome 135.0.7049.95 


In [63]:
import os
import re
import time
import pandas as pd
import requests
from tqdm import tqdm
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

import shutil
import uuid
from google.colab import files

import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### Upload "twigema_release.csv“ (Expand Files at left side) ###

In [59]:
INPUT_CSV_PATH = "/content/twigma_release.csv"   # Upload the twigma_release.csv

### Set the Start and End for partial process ###

In [60]:
START_ROW = 0
END_ROW = 100  # Set your own range

OUTPUT_IMAGE_DIR = f"/content/images_{START_ROW}_{END_ROW}"
os.makedirs(OUTPUT_IMAGE_DIR, exist_ok=True)

# Load and filter single-image tweets
df_all = pd.read_csv(INPUT_CSV_PATH)
df_all["tweet_id"] = df_all["id"].astype(str)
df_chunk = df_all.iloc[START_ROW:END_ROW].copy()

tweet_counts = df_chunk["tweet_id"].value_counts()
single_tweet_ids = tweet_counts[tweet_counts == 1].index
df = df_chunk[df_chunk["tweet_id"].isin(single_tweet_ids)].copy()
print(f"✅ Processing {len(df)} single-image tweets only")

# Launch headless Chrome browser
options = uc.ChromeOptions()
options.binary_location = "/usr/bin/google-chrome"
options.add_argument("--headless=new")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = uc.Chrome(options=options)

✅ Processing 61 single-image tweets only


### Start Crawl ###

In [61]:
cdn_names = []
tweet_texts = []
image_success = []
text_success = []
row_indices = []

for i, row in tqdm(df.iterrows(), total=len(df), desc="Processing tweets"):
    tweet_id = row["tweet_id"]
    tweet_url = f"https://twitter.com/i/web/status/{tweet_id}"
    row_index = row.name  # Original row number in the TWIGMA dataset

    media_url = ""
    success = False
    tweet_text = ""
    text_ok = False

    try:
        driver.get(tweet_url)
        # time.sleep(0.5)

        # ✅ Extract tweet text
        try:
            tweet_elem = WebDriverWait(driver, 1.5).until(
                EC.presence_of_element_located((By.XPATH, '//article//div[@data-testid="tweetText"]'))
            )
            tweet_text = tweet_elem.text.strip()
            text_ok = True
        except:
            print(f"[WARN] ❌ Failed to extract tweet text: {tweet_id}")

        # ✅ Extract image URL
        try:
            img_elem = WebDriverWait(driver, 0.5).until(
                EC.presence_of_element_located((By.XPATH, '//article//img[contains(@src, "pbs.twimg.com/media")]'))
            )
            media_url = img_elem.get_attribute("src")
        except:
            print(f"[WARN] ❌ Failed to locate image element: {tweet_id}")

        # ✅ Download image (record URL even if download fails)
        if media_url:
            try:
                parsed = urlparse(media_url)
                cdn_filename = os.path.basename(parsed.path)
                if not re.search(r"\.(jpg|jpeg|png|webp)$", cdn_filename):
                    cdn_filename += ".jpg"

                full_filename = f"{tweet_id}_{cdn_filename}"
                full_path = os.path.join(OUTPUT_IMAGE_DIR, full_filename)

                img_data = requests.get(media_url, timeout=5).content
                with open(full_path, "wb") as f:
                    f.write(img_data)

                cdn_names.append(cdn_filename)
                success = True  # ← download successful

            except Exception as e:
                print(f"[WARN] 🌐 Image download failed: {tweet_id} → {media_url}")
                cdn_names.append(media_url)  # Save image URL for retry

        else:
            cdn_names.append("")

    except Exception as e:
        print(f"[ERROR] 🚨 Tweet {tweet_id} failed: {e}")
        cdn_names.append("")
        tweet_text = ""
        text_ok = False

    # ✅ Record result for each tweet
    tweet_texts.append(tweet_text)
    text_success.append(text_ok)
    image_success.append(success)
    row_indices.append(row_index)


Processing tweets:   0%|          | 0/61 [00:00<?, ?it/s]

[WARN] ❌ Failed to extract tweet text: 1608973782324252673


Processing tweets:   2%|▏         | 1/61 [00:12<12:52, 12.88s/it]

[WARN] ❌ Failed to locate image element: 1608973782324252673


Processing tweets:   8%|▊         | 5/61 [00:26<03:44,  4.00s/it]

[WARN] ❌ Failed to extract tweet text: 1608970676441960448


Processing tweets:  10%|▉         | 6/61 [00:29<03:21,  3.66s/it]

[WARN] ❌ Failed to locate image element: 1608970676441960448


Processing tweets:  16%|█▋        | 10/61 [00:37<02:00,  2.36s/it]

[WARN] ❌ Failed to extract tweet text: 1608969867918544896


Processing tweets:  18%|█▊        | 11/61 [00:40<02:00,  2.41s/it]

[WARN] ❌ Failed to locate image element: 1608969867918544896
[WARN] ❌ Failed to extract tweet text: 1608969819122171904


Processing tweets:  20%|█▉        | 12/61 [00:42<01:59,  2.43s/it]

[WARN] ❌ Failed to locate image element: 1608969819122171904
[WARN] ❌ Failed to extract tweet text: 1608969674544353281


Processing tweets:  21%|██▏       | 13/61 [00:46<02:12,  2.76s/it]

[WARN] ❌ Failed to extract tweet text: 1608969353957085184


Processing tweets:  23%|██▎       | 14/61 [00:49<02:16,  2.91s/it]

[WARN] ❌ Failed to locate image element: 1608969353957085184
[WARN] ❌ Failed to extract tweet text: 1608969014767910913


Processing tweets:  25%|██▍       | 15/61 [00:52<02:16,  2.98s/it]

[WARN] ❌ Failed to locate image element: 1608969014767910913
[WARN] ❌ Failed to extract tweet text: 1608968887411920898


Processing tweets:  26%|██▌       | 16/61 [00:58<03:02,  4.06s/it]

[WARN] ❌ Failed to locate image element: 1608968887411920898
[WARN] ❌ Failed to extract tweet text: 1608968449048580097


Processing tweets:  28%|██▊       | 17/61 [01:01<02:43,  3.72s/it]

[WARN] ❌ Failed to locate image element: 1608968449048580097


Processing tweets:  30%|██▉       | 18/61 [01:04<02:19,  3.24s/it]

[WARN] ❌ Failed to extract tweet text: 1608967399885398016


Processing tweets:  31%|███       | 19/61 [01:06<02:06,  3.02s/it]

[WARN] ❌ Failed to locate image element: 1608967399885398016
[WARN] ❌ Failed to extract tweet text: 1608967188429537282


Processing tweets:  33%|███▎      | 20/61 [01:09<02:05,  3.05s/it]

[WARN] ❌ Failed to locate image element: 1608967188429537282
[WARN] ❌ Failed to extract tweet text: 1608966914440675331


Processing tweets:  34%|███▍      | 21/61 [01:12<02:02,  3.05s/it]

[WARN] ❌ Failed to locate image element: 1608966914440675331
[WARN] ❌ Failed to extract tweet text: 1608966285404037120


Processing tweets:  36%|███▌      | 22/61 [01:15<01:58,  3.04s/it]

[WARN] ❌ Failed to locate image element: 1608966285404037120
[WARN] ❌ Failed to extract tweet text: 1608965933867106306


Processing tweets:  38%|███▊      | 23/61 [01:18<01:57,  3.09s/it]

[WARN] ❌ Failed to locate image element: 1608965933867106306
[WARN] ❌ Failed to extract tweet text: 1608965361000648704


Processing tweets:  39%|███▉      | 24/61 [01:22<01:55,  3.11s/it]

[WARN] ❌ Failed to locate image element: 1608965361000648704


Processing tweets:  43%|████▎     | 26/61 [01:26<01:30,  2.59s/it]

[WARN] ❌ Failed to extract tweet text: 1608961862196166659


Processing tweets:  44%|████▍     | 27/61 [01:28<01:27,  2.56s/it]

[WARN] ❌ Failed to locate image element: 1608961862196166659
[WARN] ❌ Failed to extract tweet text: 1608961433047371778


Processing tweets:  46%|████▌     | 28/61 [01:31<01:24,  2.56s/it]

[WARN] ❌ Failed to locate image element: 1608961433047371778
[WARN] ❌ Failed to extract tweet text: 1608961139307876352


Processing tweets:  48%|████▊     | 29/61 [01:34<01:27,  2.73s/it]

[WARN] ❌ Failed to locate image element: 1608961139307876352
[WARN] ❌ Failed to extract tweet text: 1608961126275989509


Processing tweets:  49%|████▉     | 30/61 [01:37<01:25,  2.76s/it]

[WARN] ❌ Failed to locate image element: 1608961126275989509


Processing tweets:  51%|█████     | 31/61 [01:40<01:24,  2.82s/it]

[WARN] ❌ Failed to extract tweet text: 1608957225669738501


Processing tweets:  52%|█████▏    | 32/61 [01:43<01:23,  2.88s/it]

[WARN] ❌ Failed to locate image element: 1608957225669738501
[WARN] ❌ Failed to extract tweet text: 1608957219361656832


Processing tweets:  54%|█████▍    | 33/61 [01:45<01:16,  2.74s/it]

[WARN] ❌ Failed to locate image element: 1608957219361656832
[WARN] ❌ Failed to extract tweet text: 1608954834962587649


Processing tweets:  56%|█████▌    | 34/61 [01:48<01:11,  2.67s/it]

[WARN] ❌ Failed to locate image element: 1608954834962587649
[WARN] ❌ Failed to extract tweet text: 1608954255104446464


Processing tweets:  57%|█████▋    | 35/61 [01:50<01:08,  2.63s/it]

[WARN] ❌ Failed to locate image element: 1608954255104446464


Processing tweets:  59%|█████▉    | 36/61 [01:53<01:07,  2.70s/it]

[WARN] ❌ Failed to extract tweet text: 1608951747564961794


Processing tweets:  61%|██████    | 37/61 [01:57<01:15,  3.13s/it]

[WARN] ❌ Failed to locate image element: 1608951747564961794
[WARN] ❌ Failed to extract tweet text: 1608950257005608965


Processing tweets:  62%|██████▏   | 38/61 [02:00<01:12,  3.13s/it]

[WARN] ❌ Failed to locate image element: 1608950257005608965
[WARN] ❌ Failed to extract tweet text: 1608949661783699457


Processing tweets:  64%|██████▍   | 39/61 [02:03<01:08,  3.10s/it]

[WARN] ❌ Failed to locate image element: 1608949661783699457


Processing tweets:  67%|██████▋   | 41/61 [02:09<00:55,  2.77s/it]

[WARN] ❌ Failed to extract tweet text: 1608946862069014529


Processing tweets:  69%|██████▉   | 42/61 [02:11<00:51,  2.69s/it]

[WARN] ❌ Failed to locate image element: 1608946862069014529
[WARN] ❌ Failed to extract tweet text: 1608946224622882816


Processing tweets:  70%|███████   | 43/61 [02:14<00:47,  2.63s/it]

[WARN] ❌ Failed to locate image element: 1608946224622882816
[WARN] ❌ Failed to extract tweet text: 1608946155378933762


Processing tweets:  72%|███████▏  | 44/61 [02:16<00:44,  2.60s/it]

[WARN] ❌ Failed to locate image element: 1608946155378933762
[WARN] ❌ Failed to extract tweet text: 1608946037150089217


Processing tweets:  74%|███████▍  | 45/61 [02:19<00:42,  2.67s/it]

[WARN] ❌ Failed to locate image element: 1608946037150089217
[WARN] ❌ Failed to extract tweet text: 1608945586199461889


Processing tweets:  75%|███████▌  | 46/61 [02:22<00:41,  2.77s/it]

[WARN] ❌ Failed to locate image element: 1608945586199461889
[WARN] ❌ Failed to extract tweet text: 1608944551808618496


Processing tweets:  77%|███████▋  | 47/61 [02:25<00:39,  2.80s/it]

[WARN] ❌ Failed to locate image element: 1608944551808618496
[WARN] ❌ Failed to extract tweet text: 1608944095363485696


Processing tweets:  79%|███████▊  | 48/61 [02:28<00:36,  2.83s/it]

[WARN] ❌ Failed to locate image element: 1608944095363485696


Processing tweets:  80%|████████  | 49/61 [02:30<00:31,  2.65s/it]

[WARN] ❌ Failed to extract tweet text: 1608943178283102211


Processing tweets:  82%|████████▏ | 50/61 [02:33<00:31,  2.82s/it]

[WARN] ❌ Failed to locate image element: 1608943178283102211
[WARN] ❌ Failed to extract tweet text: 1608942805308829697


Processing tweets:  84%|████████▎ | 51/61 [02:36<00:27,  2.73s/it]

[WARN] ❌ Failed to locate image element: 1608942805308829697
[WARN] ❌ Failed to extract tweet text: 1608942775957065728


Processing tweets:  85%|████████▌ | 52/61 [02:38<00:23,  2.65s/it]

[WARN] ❌ Failed to locate image element: 1608942775957065728
[WARN] ❌ Failed to extract tweet text: 1608942771725029378


Processing tweets:  87%|████████▋ | 53/61 [02:41<00:22,  2.77s/it]

[WARN] ❌ Failed to locate image element: 1608942771725029378
[WARN] ❌ Failed to extract tweet text: 1608942591311241219


Processing tweets:  89%|████████▊ | 54/61 [02:44<00:20,  2.89s/it]

[WARN] ❌ Failed to locate image element: 1608942591311241219
[WARN] ❌ Failed to extract tweet text: 1608941701065998337


Processing tweets:  90%|█████████ | 55/61 [02:47<00:17,  2.94s/it]

[WARN] ❌ Failed to locate image element: 1608941701065998337
[WARN] ❌ Failed to extract tweet text: 1608941383695630336


Processing tweets:  92%|█████████▏| 56/61 [02:50<00:14,  2.93s/it]

[WARN] ❌ Failed to locate image element: 1608941383695630336
[WARN] ❌ Failed to extract tweet text: 1608941182511624194


Processing tweets:  93%|█████████▎| 57/61 [02:53<00:11,  2.82s/it]

[WARN] ❌ Failed to locate image element: 1608941182511624194
[WARN] ❌ Failed to extract tweet text: 1608939693957353473


Processing tweets:  95%|█████████▌| 58/61 [02:55<00:08,  2.72s/it]

[WARN] ❌ Failed to locate image element: 1608939693957353473
[WARN] ❌ Failed to extract tweet text: 1608939608330620930


Processing tweets:  97%|█████████▋| 59/61 [02:58<00:05,  2.78s/it]

[WARN] ❌ Failed to locate image element: 1608939608330620930
[WARN] ❌ Failed to extract tweet text: 1608939155337379840


Processing tweets:  98%|█████████▊| 60/61 [03:01<00:02,  2.86s/it]

[WARN] ❌ Failed to locate image element: 1608939155337379840
[WARN] ❌ Failed to extract tweet text: 1608938899845382145


Processing tweets: 100%|██████████| 61/61 [03:04<00:00,  3.03s/it]

[WARN] ❌ Failed to locate image element: 1608938899845382145





### Saving CSV ###

In [62]:
# ✅ Save results to CSV
df["tweet_text"] = tweet_texts
df["cdn_image_name"] = cdn_names
df["text_success"] = text_success
df["image_success"] = image_success
df["twigma_row"] = row_indices

OUTPUT_CSV_PATH = f"twigma_scrape_rows_{START_ROW}_{END_ROW}.csv"
df.to_csv(OUTPUT_CSV_PATH, index=False)

print(f"\n✅ Results saved to: {OUTPUT_CSV_PATH}")

# === ✅ Summary statistics ===
total = len(df)
text_ok = df["text_success"].sum()
image_ok = df["image_success"].sum()
both_ok = ((df["text_success"]) & (df["image_success"])).sum()

def pct(x): return f"{100 * x / total:.2f}%"

print("\n📊 Summary:")
print(f"📝 Tweets with text successfully extracted: {text_ok}/{total} ({pct(text_ok)})")
print(f"🖼️  Tweets with image successfully downloaded: {image_ok}/{total} ({pct(image_ok)})")
print(f"✅ Tweets with both text and image: {both_ok}/{total} ({pct(both_ok)})")


✅ Results saved to: twigma_scrape_rows_0_100.csv

📊 Summary:
📝 Tweets with text successfully extracted: 16/61 (26.23%)
🖼️  Tweets with image successfully downloaded: 17/61 (27.87%)
✅ Tweets with both text and image: 16/61 (26.23%)


### Downloading Files ###

In [64]:
# ✅ Zip image directory
zip_name = OUTPUT_IMAGE_DIR + ".zip"
shutil.make_archive(OUTPUT_IMAGE_DIR, 'zip', OUTPUT_IMAGE_DIR)

# ✅ Download files
files.download(zip_name)
files.download(OUTPUT_CSV_PATH)

print("📥 Download links generated for CSV and image archive.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📥 Download links generated for CSV and image archive.


### Retry (Not recommend for Efficiency) ###

In [66]:
RE_START_ROW = 0
RE_END_ROW = 100  # Set your own range

# === CONFIG ===
RETRY_CSV_PATH = f"twigma_scrape_rows_{RE_START_ROW}_{RE_END_ROW}.csv"
OUTPUT_IMAGE_DIR = f"images_{RE_START_ROW}_{RE_END_ROW}"
os.makedirs(OUTPUT_IMAGE_DIR, exist_ok=True)

options = uc.ChromeOptions()
options.binary_location = "/usr/bin/google-chrome"
options.add_argument("--headless=new")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = uc.Chrome(options=options)

# === LOAD DATA ===
df = pd.read_csv(RETRY_CSV_PATH)
df_retry = df[(df["text_success"] == False) | (df["image_success"] == False)].copy()
print(f"🔁 Total to retry: {len(df_retry)}")

# === RECORD NEW RESULTS ===
new_text_success = 0
new_image_success = 0

for idx, row in tqdm(df_retry.iterrows(), total=len(df_retry), desc="🔁 Retrying failed entries"):
    tweet_id = str(row["tweet_id"])
    tweet_url = f"https://twitter.com/i/web/status/{tweet_id}"
    cdn_image_old = row["cdn_image_name"]

    # Retry flag
    got_text = False
    got_image = False

    try:
        driver.get(tweet_url)
        # time.sleep(0.5)

        # Retry text if it failed
        if row["text_success"] == False:
            try:
                tweet_elem = WebDriverWait(driver, 3).until(
                    EC.presence_of_element_located((By.XPATH, '//article//div[@data-testid="tweetText"]'))
                )
                tweet_text = tweet_elem.text.strip()
                df.loc[idx, "tweet_text"] = tweet_text
                df.loc[idx, "text_success"] = True
                new_text_success += 1
                got_text = True
            except:
                pass

        # Retry image if it failed
        if row["image_success"] == False:
            try:
                img_elem = WebDriverWait(driver, 1).until(
                    EC.presence_of_element_located((By.XPATH, '//article//img[contains(@src, "pbs.twimg.com/media")]'))
                )
                media_url = img_elem.get_attribute("src")

                parsed = urlparse(media_url)
                cdn_filename = os.path.basename(parsed.path)
                if not re.search(r"\.(jpg|jpeg|png|webp)$", cdn_filename):
                    cdn_filename += ".jpg"

                full_filename = f"{tweet_id}_{cdn_filename}"
                full_path = os.path.join(OUTPUT_IMAGE_DIR, full_filename)

                img_data = requests.get(media_url, timeout=5).content
                with open(full_path, "wb") as f:
                    f.write(img_data)

                df.loc[idx, "cdn_image_name"] = cdn_filename
                df.loc[idx, "image_success"] = True
                new_image_success += 1
                got_image = True

            except:
                pass

    except Exception as e:
        print(f"[FAIL] Tweet {tweet_id}: {e}")

driver.quit()

# === SAVE NEW MERGED RESULT ===
REPAIRED_CSV_PATH = RETRY_CSV_PATH.replace(".csv", "_repaired.csv")
df.to_csv(REPAIRED_CSV_PATH, index=False)

# === STATS ===
total_retry = len(df_retry)
final_total = len(df)
final_text = df["text_success"].sum()
final_img = df["image_success"].sum()
final_both = ((df["text_success"]) & (df["image_success"])).sum()

def pct(x, total): return f"{100 * x / total:.2f}%"

print("\n📊 Retry Summary:")
print(f"📝 Newly recovered texts:  {new_text_success}/{total_retry} ({pct(new_text_success, total_retry)})")
print(f"🖼️  Newly recovered images: {new_image_success}/{total_retry} ({pct(new_image_success, total_retry)})")

print("\n📦 Overall Total After Merge:")
print(f"📝 Total successful texts:  {final_text}/{final_total} ({pct(final_text, final_total)})")
print(f"🖼️  Total successful images: {final_img}/{final_total} ({pct(final_img, final_total)})")
print(f"✅ Fully successful entries: {final_both}/{final_total} ({pct(final_both, final_total)})")

print(f"\n✅ Updated CSV saved to: {REPAIRED_CSV_PATH}")

🔁 Total to retry: 45


🔁 Retrying failed entries: 100%|██████████| 45/45 [03:29<00:00,  4.66s/it]



📊 Retry Summary:
📝 Newly recovered texts:  5/45 (11.11%)
🖼️  Newly recovered images: 5/45 (11.11%)

📦 Overall Total After Merge:
📝 Total successful texts:  21/61 (34.43%)
🖼️  Total successful images: 22/61 (36.07%)
✅ Fully successful entries: 21/61 (34.43%)

✅ Updated CSV saved to: twigma_scrape_rows_0_100_repaired.csv


In [69]:
# ✅ Zip image directory
zip_name = OUTPUT_IMAGE_DIR + ".zip"
shutil.make_archive(OUTPUT_IMAGE_DIR, 'zip', OUTPUT_IMAGE_DIR)

# ✅ Download files
files.download(zip_name)
files.download(REPAIRED_CSV_PATH)

print("📥 Download links generated for Repaired-CSV and image archive.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📥 Download links generated for Repaired-CSV and image archive.
