In [None]:
def download_post_by_shortcode(shortcode):
    """Download a single Instagram post identified by its shortcode.

    A fresh ``instaloader.Instaloader`` is created for the call, the post is
    looked up from the shortcode, and it is downloaded with the post owner's
    username as the target. Any exception raised while loading or downloading
    is caught and reported on stdout rather than propagated.

    Args:
        shortcode: The post's shortcode.
    """
    import instaloader

    loader = instaloader.Instaloader()

    try:
        # Resolve the shortcode to a Post object before downloading it.
        post = instaloader.Post.from_shortcode(loader.context, shortcode)
        print(f"Downloading post {shortcode}...")
        loader.download_post(post, target=post.owner_username)
        print(f"Post {shortcode} has been downloaded.")
    except Exception as err:
        print(f"An error occurred: {err}")

In [None]:
def get_urls(usernames):
    """Log in to Instagram with Selenium and collect post URLs per profile.

    Credentials are read from the ``IG_USER`` and ``IG_PASSWORD`` environment
    variables after ``load_dotenv()`` has been called. For each username the
    profile page is scrolled to the bottom repeatedly until the page height
    stops growing, and the ``href`` of every ``<a>`` element containing
    ``"/p/"`` is collected along the way.

    Args:
        usernames: Iterable of Instagram account names to visit.

    Returns:
        A list of the collected hrefs, de-duplicated within each profile and
        concatenated in the order the usernames were given.

    Raises:
        RuntimeError: If ``IG_USER`` or ``IG_PASSWORD`` is not set.
    """
    import os
    import time

    from dotenv import load_dotenv
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys

    # Fixed waits, in seconds, used instead of explicit readiness checks.
    PAGE_LOAD_PAUSE = 5
    LOGIN_PAUSE = 8
    SCROLL_PAUSE = 4

    load_dotenv()

    IG_USER = os.getenv("IG_USER")
    IG_PASSWORD = os.getenv("IG_PASSWORD")
    if not IG_USER or not IG_PASSWORD:
        # Fail before a browser is launched; otherwise the failure only
        # surfaces later while typing into the login form.
        raise RuntimeError(
            "IG_USER and IG_PASSWORD must be set in the environment or .env file."
        )

    driver = webdriver.Chrome()

    def collect_post_links(found):
        """Add the href of every anchor currently in the page that contains '/p/'."""
        for link in driver.find_elements(By.TAG_NAME, "a"):
            href = link.get_attribute("href")
            if href and "/p/" in href:
                found.add(href)

    all_urls = []
    try:
        # Log in to Instagram.
        driver.get("https://www.instagram.com/accounts/login/")
        time.sleep(PAGE_LOAD_PAUSE)

        driver.find_element(By.NAME, "username").send_keys(IG_USER)
        driver.find_element(By.NAME, "password").send_keys(IG_PASSWORD)
        driver.find_element(By.NAME, "password").send_keys(Keys.RETURN)
        time.sleep(LOGIN_PAUSE)

        for username in usernames:
            driver.get(f"https://www.instagram.com/{username}/")
            time.sleep(PAGE_LOAD_PAUSE)

            last_height = driver.execute_script("return document.body.scrollHeight")
            post_urls = set()

            while True:
                # Scroll to the bottom and give the page time to load more.
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(SCROLL_PAUSE)

                # Collect after every scroll, not only at the end.
                collect_post_links(post_urls)

                # An unchanged height means the scroll loaded nothing new.
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    print("🔽 Reached end of page.")
                    break
                last_height = new_height

            # Final pass over the fully scrolled page.
            collect_post_links(post_urls)
            all_urls.extend(post_urls)
    finally:
        # Always close the browser, even if login or scraping raised.
        driver.quit()

    return all_urls


In [None]:
def get_shortcode_from_url(url):
    """Extract the post shortcode from an Instagram post URL.

    The shortcode is the path segment immediately following ``/p/``. Taking
    that segment (rather than the second-to-last ``/``-separated piece) keeps
    the result correct for URLs without a trailing slash, with a query string
    or fragment, or with extra path segments after the shortcode.

    Args:
        url: The URL to inspect.

    Returns:
        The shortcode string, or ``None`` if ``url`` is empty or contains no
        ``/p/<shortcode>`` segment.
    """
    import re

    if not url:
        return None
    found = re.search(r"/p/([^/?#]+)", url)
    return found.group(1) if found else None

In [None]:
def get_all_user():
    """Return the names of the user directories in the current working directory.

    A user directory is any sub-directory of the working directory whose name
    does not start with a dot and is not one of the known non-user folders
    listed below.

    Returns:
        A list of directory names (not full paths), in ``os.listdir`` order.
    """
    import os

    excluded = ['.git', '.vscode', 'venv', '__pycache__', 'raw_data', 'final_data']
    base = os.getcwd()

    found = []
    for entry in os.listdir(base):
        # Hidden entries and known non-user folders are never user directories.
        if entry.startswith('.') or entry in excluded:
            continue
        if os.path.isdir(os.path.join(base, entry)):
            found.append(entry)
    return found

In [None]:
def organize_posts_by_timestamp(user_dir):
    """Group a user's downloaded files into one sub-directory per post.

    Every regular file directly inside ``user_dir`` whose name starts with a
    timestamp prefix such as ``2018-02-17_09-05-15_UTC`` is moved into
    ``<user_dir>/<user_name>_<prefix>/``, which is created on demand.
    Sub-directories and files without such a prefix are left where they are.

    Args:
        user_dir: Path of the user's directory. Its final path component is
            used as the user name; a trailing path separator is tolerated.
    """
    import os
    import re
    import shutil

    # Captures the leading timestamp, e.g. 2018-02-17_09-05-15_UTC.
    prefix_pattern = re.compile(r'^(\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}_UTC)')

    # abspath drops any trailing separator, which would otherwise make
    # basename() return an empty user name.
    user_name = os.path.basename(os.path.abspath(user_dir))

    for file in os.listdir(user_dir):
        file_path = os.path.join(user_dir, file)

        # Only loose files are organized; existing folders are skipped.
        if os.path.isdir(file_path):
            continue

        match = prefix_pattern.match(file)
        if not match:
            continue

        post_dir = os.path.join(user_dir, f'{user_name}_{match.group(1)}')
        os.makedirs(post_dir, exist_ok=True)

        shutil.move(file_path, os.path.join(post_dir, file))
        print(f"📂 Moved {file} → {post_dir}/")

    print("✅ Organization complete.")

In [None]:
def process_post_directory(post_dir):
    """Build the dataset record for one post directory.

    Reads the post's ``.txt`` file, extracts its hashtags, copies the post's
    first image into ``final_data/images`` (relative to the current working
    directory) and returns a dictionary describing the post.

    Args:
        post_dir: Path of the post directory. The name of its parent
            directory is used as the author; a trailing path separator is
            tolerated.

    Returns:
        A dict with the keys ``"text"``, ``"image_path"`` (relative to
        ``final_data``, using forward slashes), ``"author"`` and
        ``"hashtags"``; or ``None`` when the directory has no ``.txt`` file
        or no ``.jpg`` file.
    """
    import os
    import re
    import shutil

    # List once, sorted, so the file choices below do not depend on the
    # order in which the OS happens to return directory entries.
    entries = sorted(os.listdir(post_dir))

    # --- Step 1: Find the .txt file ---
    txt_file = next((name for name in entries if name.endswith('.txt')), None)
    if not txt_file:
        print(f"⚠️ No .txt file found in {post_dir}")
        return None

    with open(os.path.join(post_dir, txt_file), 'r', encoding='utf-8') as f:
        text = f.read().strip()

    # --- Step 2: Extract hashtags ---
    hashtags = re.findall(r"#\w+", text)

    # --- Step 3: Find the image file ---
    # Prefer the first image of a multi-image post (..._1.jpg); otherwise
    # fall back to the first .jpg in sorted order (e.g. timestamp.jpg).
    image_file = next((name for name in entries if name.endswith("_1.jpg")), None)
    if not image_file:
        image_file = next((name for name in entries if name.endswith(".jpg")), None)

    if not image_file:
        print(f"⚠️ No image file found in {post_dir}")
        return None

    # NOTE: all posts share this one folder, so an image with the same file
    # name as an earlier one overwrites it.
    image_dst_dir = os.path.join("final_data", "images")
    os.makedirs(image_dst_dir, exist_ok=True)
    image_dst_path = os.path.join(image_dst_dir, image_file)
    shutil.copy(os.path.join(post_dir, image_file), image_dst_path)

    # --- Step 4: Build relative image path ---
    image_path = os.path.relpath(image_dst_path, "final_data")

    # --- Step 5: Get author from parent directory ---
    # abspath removes a trailing separator and guarantees a parent component,
    # so dirname() really yields the parent of the post directory.
    author = os.path.basename(os.path.dirname(os.path.abspath(post_dir)))

    # --- Step 6: Construct post dictionary ---
    return {
        "text": text,
        "image_path": image_path.replace("\\", "/"),  # for Windows compatibility
        "author": author,
        "hashtags": hashtags
    }


# Main program

In [None]:
# Load the account list: read account_list.csv into a DataFrame and collect
# the values of its 'ig 帳號' column into a plain Python list of usernames.
import pandas as pd
df = pd.read_csv('account_list.csv')
original_accounts = df['ig 帳號'].tolist()

In [None]:
# Run metadata: the number of accounts in the list, and a counter of post
# directories that is incremented when posts.json is built.
original_accounts_count = len(original_accounts)
post_count = 0

In [None]:
# NOTE(review): this cell replaces the account list with a single hard-coded
# account and resets the post counter — presumably a shortcut for trial runs;
# skip it to process every account read from account_list.csv.
original_accounts = ["omega_3iana"]
post_count = 0

In [None]:
# Scrape the post URLs of every account in the list.
urls = get_urls(original_accounts)
# Convert each URL to its shortcode, dropping URLs that yield none.
shortcodes = [code for code in map(get_shortcode_from_url, urls) if code]

In [None]:
# Download every collected post, one shortcode at a time. Failures are
# reported by download_post_by_shortcode itself and do not stop the loop.
for shortcode in shortcodes:
    download_post_by_shortcode(shortcode)

In [None]:
# Find the user directories in the working directory and relocate each of
# them into raw_data/.
import os
users = get_all_user()

for user_dir in users:
    # raw_data/ is created lazily, the first time there is something to move.
    if not os.path.exists('raw_data'):
        os.makedirs('raw_data')
    src = os.path.join(os.getcwd(), user_dir)
    dst = os.path.join(os.getcwd(), 'raw_data', user_dir)
    if not os.path.exists(dst):
        print(f"Moving {src} to {dst}...")
        os.rename(src, dst)
        continue
    # A directory of the same name is already in raw_data/: leave both alone.
    print(f"Directory {dst} already exists, skipping.")

In [None]:
# Inside each user's raw_data folder, group the downloaded files into one
# sub-directory per post.
for user_dir in users:
    user_path = os.path.join('raw_data', user_dir)
    if os.path.exists(user_path):
        organize_posts_by_timestamp(user_path)
    else:
        print(f"User directory {user_path} does not exist, skipping.")

In [None]:
# Turn every post directory into one record and write them all to
# final_data/data/posts.json.
import json
import os

all_posts = []
# Iterate through each user directory in raw_data.
for user_dir in users:
    user_path = os.path.join('raw_data', user_dir)
    if not os.path.exists(user_path):
        print(f"User directory {user_path} does not exist, skipping.")
        continue

    # Iterate through each post directory of this user.
    for post_dir in os.listdir(user_path):
        post_path = os.path.join(user_path, post_dir)
        if not os.path.isdir(post_path):
            continue

        post_data = process_post_directory(post_path)
        # Counts every post directory visited, including those that
        # produced no record.
        post_count += 1
        if post_data:
            all_posts.append(post_data)
            print(f"Processed post: {post_data['image_path']} by {post_data['author']}")

# The output folder is not created anywhere else, so make sure it exists
# before opening the file for writing.
os.makedirs(os.path.join('final_data', 'data'), exist_ok=True)
with open('final_data/data/posts.json', 'w', encoding='utf-8') as f:
    json.dump(all_posts, f, ensure_ascii=False, indent=4)
        