In [1]:
!pip install -q selenium undetected-chromedriver pandas tqdm
!wget -q -O chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!apt-get update -y > /dev/null
!apt-get install -y libvulkan1 libnss3 libxss1 libgconf-2-4 libasound2 > /dev/null
!dpkg -i chrome.deb
!apt-get -f install -y > /dev/null

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/65.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.4/65.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for undetected-chromedriver (setup.py) ... [?25l[?25hdone
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package google-chrome-stable.
(Reading database ... 126516 files and directories currently installed.)
Preparing to unpack chrome.deb ...
Unpacking google-chrome-sta

In [2]:
!google-chrome --version

Google Chrome 135.0.7049.95 


In [3]:
import os
import re
import time
import pandas as pd
import requests
from tqdm import tqdm
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

import shutil
import uuid
from google.colab import files

import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### Upload "twigema_release.csv“ (Expand Files at left side) ###

In [10]:
INPUT_CSV_PATH = "/content/twigma_release.csv"   # Upload the twigma_release.csv

### Set the Start and End for partial process ###

In [23]:
from google.colab import drive
drive.mount('/content/drive')
drive_base = "/content/drive/MyDrive/twigma_batches"
os.makedirs(drive_base, exist_ok=True)

# Launch headless Chrome browser
options = uc.ChromeOptions()
options.binary_location = "/usr/bin/google-chrome"
options.add_argument("--headless=new")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = uc.Chrome(options=options)

# === BATCH MAIN FUNCTIONS ===
def load_and_filter_data(current_start, current_end):
    df_all = pd.read_csv(INPUT_CSV_PATH)
    df_all["tweet_id"] = df_all["id"].astype(str)
    df_chunk = df_all.iloc[current_start:current_end].copy()

    tweet_counts = df_chunk["tweet_id"].value_counts()
    single_tweet_ids = tweet_counts[tweet_counts == 1].index
    df_filtered = df_chunk[df_chunk["tweet_id"].isin(single_tweet_ids)].copy()

    print(f"✅ Processing {len(df_filtered)} single-image tweets only")

    return df_filtered


def restart_driver():
    global driver
    try:
        driver.quit()
    except:
        pass

    options = uc.ChromeOptions()
    options.binary_location = "/usr/bin/google-chrome"
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = uc.Chrome(options=options)
    print("♻️ Driver restarted.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Start Crawl ###

In [16]:
def process_batch(df, current_start, current_end):
    global driver
    OUTPUT_IMAGE_DIR = f"/content/images_{current_start}_{current_end}"
    os.makedirs(OUTPUT_IMAGE_DIR, exist_ok=True)

    cdn_names = []
    tweet_texts = []
    image_success = []
    text_success = []
    row_indices = []

    actual_requests = 0
    sequent_requests = 0
    consecutive_failures = 0
    ALL_consecutive_failures = 0
    recent_texts = []
    recent_images = []

    start_time = time.time()

    for i, row in tqdm(df.iterrows(), total=len(df), desc="Processing tweets"):
        tweet_id = row["tweet_id"]
        tweet_url = f"https://twitter.com/i/web/status/{tweet_id}"
        row_index = row.name
        media_url = ""
        success = False
        tweet_text = ""
        text_ok = False

        try:
            driver.get(tweet_url)
            actual_requests += 1
            sequent_requests += 1

            try:
                tweet_elem = WebDriverWait(driver, 2.5).until(
                    EC.presence_of_element_located((By.XPATH, '//article//div[@data-testid="tweetText"]'))
                )
                tweet_text = tweet_elem.text.strip()
                text_ok = True
            except:
                print(f"\n[WARN] ❌ Failed to extract tweet text: {tweet_id}")

            try:
                img_elem = WebDriverWait(driver, 0.5).until(
                    EC.presence_of_element_located((By.XPATH, '//article//img[contains(@src, "pbs.twimg.com/media")]'))
                )
                media_url = img_elem.get_attribute("src")
            except:
                print(f"\n[WARN] ❌ Failed to locate image element: {tweet_id}")

            if media_url:
                try:
                    parsed = urlparse(media_url)
                    cdn_filename = os.path.basename(parsed.path)
                    if not re.search(r"\.(jpg|jpeg|png|webp)$", cdn_filename):
                        cdn_filename += ".jpg"

                    full_filename = f"{tweet_id}_{cdn_filename}"
                    full_path = os.path.join(OUTPUT_IMAGE_DIR, full_filename)

                    img_data = requests.get(media_url, timeout=5).content
                    with open(full_path, "wb") as f:
                        f.write(img_data)

                    cdn_names.append(cdn_filename)
                    success = True

                except Exception as e:
                    print(f"[WARN] 🌐 Image download failed: {tweet_id} → {media_url}")
                    cdn_names.append(media_url)
            else:
                cdn_names.append("")

        except Exception as e:
            print(f"[ERROR] 🚨 Tweet {tweet_id} failed: {e}")
            cdn_names.append("")
            tweet_text = ""
            text_ok = False

        recent_texts.append(text_ok)
        recent_images.append(success)

        tweet_texts.append(tweet_text)
        text_success.append(text_ok)
        image_success.append(success)
        row_indices.append(row_index)

        if actual_requests % 10 == 0:
            text_count = sum(recent_texts)
            image_count = sum(recent_images)
            both_count = sum(t and i for t, i in zip(recent_texts, recent_images))

            print(f"\n📊 Summary of the last 10 requests:")
            print(f"📝 Text success:  {text_count}/10")
            print(f"🖼️  Image success: {image_count}/10")
            print(f"✅ Both success:  {both_count}/10\n")
            recent_texts = []
            recent_images = []

        if not text_ok and not success:
            consecutive_failures += 1
            ALL_consecutive_failures += 1
        else:
            consecutive_failures = 0
            ALL_consecutive_failures = 0

        if sequent_requests >= 10:
            print(f"♻️ Restarting driver after {sequent_requests} requests...")
            restart_driver()
            time.sleep(10)
            consecutive_failures = 0
            sequent_requests = 0

        elif consecutive_failures >= 5:
            print(f"♻️ Restarting driver after {consecutive_failures} consecutive failures...")
            restart_driver()
            time.sleep(10)
            consecutive_failures = 0
            sequent_requests = 0
            if ALL_consecutive_failures >= 10:
                restart_driver()
                time.sleep(20)
                consecutive_failures = 0

    end_time = time.time()
    duration = end_time - start_time

    return {
        "df": df,
        "tweet_texts": tweet_texts,
        "cdn_image_name": cdn_names,
        "text_success": text_success,
        "image_success": image_success,
        "twigma_row": row_indices,
        "duration": duration,
        "output_image_dir": OUTPUT_IMAGE_DIR
    }

### Saving CSV & Summary & File Download ###

In [27]:
def save_results(result_dict, current_start, current_end):
    df = result_dict["df"].copy()
    df["tweet_text"] = result_dict["tweet_texts"]
    df["cdn_image_name"] = result_dict["cdn_image_name"]
    df["text_success"] = result_dict["text_success"]
    df["image_success"] = result_dict["image_success"]
    df["twigma_row"] = result_dict["twigma_row"]

    output_csv = f"twigma_scrape_rows_{current_start}_{current_end}.csv"
    df.to_csv(output_csv, index=False)

    total = len(df)
    text_ok = df["text_success"].sum()
    image_ok = df["image_success"].sum()
    both_ok = ((df["text_success"]) & (df["image_success"])).sum()
    duration = result_dict["duration"]

    def pct(x): return f"{100 * x / total:.2f}%"
    def format_duration(seconds):
        mins, secs = divmod(int(seconds), 60)
        hrs, mins = divmod(mins, 60)
        return f"{hrs}h {mins}m {secs}s"

    summary_txt = output_csv.replace(".csv", "_summary.txt")
    with open(summary_txt, "w") as f:
        f.write(f"{summary_txt}\n")
        f.write("📊 Summary:\n")
        f.write(f"📝 Text: {text_ok}/{total} ({pct(text_ok)})\n")
        f.write(f"🖼️  Image: {image_ok}/{total} ({pct(image_ok)})\n")
        f.write(f"✅ Both: {both_ok}/{total} ({pct(both_ok)})\n")
        f.write(f"⏱️ Duration: {format_duration(duration)} ({int(duration)}s)\n")
        if both_ok > 0:
            f.write(f"⏱️ Avg time per success: {duration / both_ok:.2f} seconds\n")

    shutil.make_archive(result_dict["output_image_dir"], 'zip', result_dict["output_image_dir"])

    # files.download(f"{result_dict['output_image_dir']}.zip")
    # files.download(output_csv)
    # files.download(summary_txt)
    # print("📦 Downloads ready for:", current_start, "to", current_end)

    drive_base = "/content/drive/MyDrive/twigma_batches"
    os.makedirs(drive_base, exist_ok=True)

    shutil.copy(f"{result_dict['output_image_dir']}.zip", drive_base)
    shutil.copy(output_csv, drive_base)
    shutil.copy(summary_txt, drive_base)

    print(f"📂 Files saved to Google Drive folder: {drive_base}")


### Main LOOP ###

In [28]:
START_ROW = 30000
END_ROW = 35000  # Set your own range
BATCH_SIZE = 1000
INPUT_CSV_PATH = "twigma_release.csv"

# === MAIN EXECUTION LOOP ===
for current_start in range(START_ROW, END_ROW, BATCH_SIZE):
    current_end = min(current_start + BATCH_SIZE, END_ROW)
    print(f"\n🚀 Batch {current_start} to {current_end}")
    restart_driver()
    df_filtered = load_and_filter_data(current_start, current_end)
    result_dict = process_batch(df_filtered, current_start, current_end)
    save_results(result_dict, current_start, current_end)


🚀 Batch 30000 to 30004
♻️ Driver restarted.
✅ Processing 4 single-image tweets only


Processing tweets:   0%|          | 0/4 [00:00<?, ?it/s]


[WARN] ❌ Failed to extract tweet text: 1599871731006550016


Processing tweets:  25%|██▌       | 1/4 [00:04<00:14,  4.96s/it]


[WARN] ❌ Failed to locate image element: 1599871731006550016

[WARN] ❌ Failed to extract tweet text: 1599871237047533568


Processing tweets:  50%|█████     | 2/4 [00:09<00:09,  4.93s/it]


[WARN] ❌ Failed to locate image element: 1599871237047533568


Processing tweets:  75%|███████▌  | 3/4 [00:13<00:04,  4.22s/it]


[WARN] ❌ Failed to extract tweet text: 1599871105619009537


Processing tweets: 100%|██████████| 4/4 [00:17<00:00,  4.36s/it]


[WARN] ❌ Failed to locate image element: 1599871105619009537
📂 Files saved to Google Drive folder: /content/drive/MyDrive/twigma_batches

🚀 Batch 30004 to 30008





♻️ Driver restarted.
✅ Processing 0 single-image tweets only


Processing tweets: 0it [00:00, ?it/s]

📂 Files saved to Google Drive folder: /content/drive/MyDrive/twigma_batches

🚀 Batch 30008 to 30010



  def pct(x): return f"{100 * x / total:.2f}%"


♻️ Driver restarted.
✅ Processing 2 single-image tweets only


Processing tweets:   0%|          | 0/2 [00:00<?, ?it/s]


[WARN] ❌ Failed to extract tweet text: 1599867359199911936


Processing tweets:  50%|█████     | 1/2 [00:08<00:08,  8.60s/it]


[WARN] ❌ Failed to locate image element: 1599867359199911936

[WARN] ❌ Failed to extract tweet text: 1599867231252279297


Processing tweets: 100%|██████████| 2/2 [00:14<00:00,  7.49s/it]


[WARN] ❌ Failed to locate image element: 1599867231252279297
📂 Files saved to Google Drive folder: /content/drive/MyDrive/twigma_batches





### Retry (Not recommend for Efficiency) ###

In [19]:
# RE_START_ROW = 0
# RE_END_ROW = 100  # Set your own range

# # === CONFIG ===
# RETRY_CSV_PATH = f"twigma_scrape_rows_{RE_START_ROW}_{RE_END_ROW}.csv"
# OUTPUT_IMAGE_DIR = f"images_{RE_START_ROW}_{RE_END_ROW}"
# os.makedirs(OUTPUT_IMAGE_DIR, exist_ok=True)

# options = uc.ChromeOptions()
# options.binary_location = "/usr/bin/google-chrome"
# options.add_argument("--headless=new")
# options.add_argument("--no-sandbox")
# options.add_argument("--disable-dev-shm-usage")
# driver = uc.Chrome(options=options)

# # === LOAD DATA ===
# df = pd.read_csv(RETRY_CSV_PATH)
# df_retry = df[(df["text_success"] == False) | (df["image_success"] == False)].copy()
# print(f"🔁 Total to retry: {len(df_retry)}")

# # === RECORD NEW RESULTS ===
# new_text_success = 0
# new_image_success = 0

# for idx, row in tqdm(df_retry.iterrows(), total=len(df_retry), desc="🔁 Retrying failed entries"):
#     tweet_id = str(row["tweet_id"])
#     tweet_url = f"https://twitter.com/i/web/status/{tweet_id}"
#     cdn_image_old = row["cdn_image_name"]

#     # Retry flag
#     got_text = False
#     got_image = False

#     try:
#         driver.get(tweet_url)
#         # time.sleep(0.5)

#         # Retry text if it failed
#         if row["text_success"] == False:
#             try:
#                 tweet_elem = WebDriverWait(driver, 3).until(
#                     EC.presence_of_element_located((By.XPATH, '//article//div[@data-testid="tweetText"]'))
#                 )
#                 tweet_text = tweet_elem.text.strip()
#                 df.loc[idx, "tweet_text"] = tweet_text
#                 df.loc[idx, "text_success"] = True
#                 new_text_success += 1
#                 got_text = True
#             except:
#                 pass

#         # Retry image if it failed
#         if row["image_success"] == False:
#             try:
#                 img_elem = WebDriverWait(driver, 1).until(
#                     EC.presence_of_element_located((By.XPATH, '//article//img[contains(@src, "pbs.twimg.com/media")]'))
#                 )
#                 media_url = img_elem.get_attribute("src")

#                 parsed = urlparse(media_url)
#                 cdn_filename = os.path.basename(parsed.path)
#                 if not re.search(r"\.(jpg|jpeg|png|webp)$", cdn_filename):
#                     cdn_filename += ".jpg"

#                 full_filename = f"{tweet_id}_{cdn_filename}"
#                 full_path = os.path.join(OUTPUT_IMAGE_DIR, full_filename)

#                 img_data = requests.get(media_url, timeout=5).content
#                 with open(full_path, "wb") as f:
#                     f.write(img_data)

#                 df.loc[idx, "cdn_image_name"] = cdn_filename
#                 df.loc[idx, "image_success"] = True
#                 new_image_success += 1
#                 got_image = True

#             except:
#                 pass

#     except Exception as e:
#         print(f"[FAIL] Tweet {tweet_id}: {e}")

# driver.quit()

# # === SAVE NEW MERGED RESULT ===
# REPAIRED_CSV_PATH = RETRY_CSV_PATH.replace(".csv", "_repaired.csv")
# df.to_csv(REPAIRED_CSV_PATH, index=False)

# # === STATS ===
# total_retry = len(df_retry)
# final_total = len(df)
# final_text = df["text_success"].sum()
# final_img = df["image_success"].sum()
# final_both = ((df["text_success"]) & (df["image_success"])).sum()

# def pct(x, total): return f"{100 * x / total:.2f}%"

# print("\n📊 Retry Summary:")
# print(f"📝 Newly recovered texts:  {new_text_success}/{total_retry} ({pct(new_text_success, total_retry)})")
# print(f"🖼️  Newly recovered images: {new_image_success}/{total_retry} ({pct(new_image_success, total_retry)})")

# print("\n📦 Overall Total After Merge:")
# print(f"📝 Total successful texts:  {final_text}/{final_total} ({pct(final_text, final_total)})")
# print(f"🖼️  Total successful images: {final_img}/{final_total} ({pct(final_img, final_total)})")
# print(f"✅ Fully successful entries: {final_both}/{final_total} ({pct(final_both, final_total)})")

# print(f"\n✅ Updated CSV saved to: {REPAIRED_CSV_PATH}")

In [20]:
# # ✅ Zip image directory
# zip_name = OUTPUT_IMAGE_DIR + ".zip"
# shutil.make_archive(OUTPUT_IMAGE_DIR, 'zip', OUTPUT_IMAGE_DIR)

# # ✅ Download files
# files.download(zip_name)
# files.download(REPAIRED_CSV_PATH)

# print("📥 Download links generated for Repaired-CSV and image archive.")