In [None]:
# %pip install selenium webdriver-manager ollama requests


In [20]:
# -------------------------------
# Helper Functions
# -------------------------------

def human_delay(a=2, b=4):
    """Pause for a random number of seconds drawn uniformly between ``a`` and ``b``."""
    pause_seconds = random.uniform(a, b)
    time.sleep(pause_seconds)

def download_image(image_url, timeout=10):
    """Download an image to a temporary ``.jpg`` file and return that file's path.

    Args:
        image_url: URL of the image to fetch. A falsy value returns None immediately.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the whole run.

    Returns:
        Path of the temporary file (the caller is responsible for deleting it), or
        None when the URL is empty, the request fails, the status is not 200, or
        the file cannot be written.
    """
    if not image_url:
        return None

    try:
        response = requests.get(image_url, timeout=timeout)
    except Exception as e:
        print("Error downloading image:", e)
        return None

    if response.status_code != 200:
        print(f"Failed to download image. Status code: {response.status_code}")
        return None

    temp_path = None
    try:
        # delete=False keeps the file on disk after the handle is closed so the
        # caller can pass the path on; the `with` guarantees the handle is closed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as temp_file:
            temp_path = temp_file.name
            temp_file.write(response.content)
        return temp_path
    except Exception as e:
        print("Error downloading image:", e)
        # Do not leave a partially written file behind.
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
        return None

# -------------------------------
# Analysis Function using Ollama with English Prompt
# -------------------------------

def _parse_model_json(raw_output):
    """Parse the model's reply text into a dict; return {} when no JSON object can be recovered.

    Models often wrap JSON in markdown code fences or add a sentence around it even when
    told not to, so after trying the text as-is this falls back to the span between the
    first "{" and the last "}".
    """
    candidates = [raw_output]
    start, end = raw_output.find("{"), raw_output.rfind("}")
    if start != -1 and end > start:
        embedded = raw_output[start:end + 1]
        if embedded != raw_output:
            candidates.append(embedded)

    first_error = None
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError as je:
            if first_error is None:
                first_error = je
            continue
        if isinstance(parsed, dict):
            return parsed

    if first_error is not None:
        print("JSON decoding error:", first_error)
    else:
        print("JSON decoding error: model output is not a JSON object")
    print("Model output was:")
    print(raw_output)
    return {}


def analyze_post(text, image_url, post_time):
    """
    Use Ollama's llama3.2-vision model to extract gas-station facts from one post.

    The (English) prompt asks the model for a JSON object with these fields:
      - station: brand or name of the gas station from the post or image; else null.
      - intersection: the road, address or intersection mentioned; else null.
      - gps: "lat,lon" if it can be inferred from the intersection; else null.
      - line_flag: true if the post indicates there is a queue/line; otherwise false.
      - oil_truck_flag: true if the post mentions an oil truck is present; otherwise false.
      - status: "ended" or "unknown", as defined in the prompt text below.
      - gas_price: per-unit price seen in the image or post; "unknown" if not available.

    Args:
        text: Raw text of the post (including any comments scraped with it).
        image_url: URL of an image to attach, or None for a text-only analysis.
        post_time: Post time string, inserted verbatim into the prompt.

    Returns:
        The parsed JSON object as a dict, or {} when the model call fails, returns
        nothing, or returns text from which no JSON object can be parsed.
    """
    prompt = f"""You are an expert tasked with extracting gas station information from social media posts.
You must respond with a JSON object ONLY, with no additional commentary or markdown formatting.

the general post structure is as follows the first post is the original post and the second post and later posts are  comment on the original post for extra information or updates:
``匿名成員 = name of the poster
12小時
 
 · 
Petro Bovaird and Mississauga Rd. Was good at 9pm = text of the post
所有心情：
3
3
1 個回應  = number of likes and comments
讚好
回應
傳送
Rai Quan = name of the commenter (if any) 
Not good = comment (update)
11小時 
讚好
回覆
``
Extract the following fields:
- "station": The brand or name of the gas station mentioned in the post or from the image (if any); otherwise null.
- "intersection": The road or intersection mentioned (e.g. an address or intersection); if not mentioned, null.
- "gps": If GPS coordinates (formatted as "lat,lon") can be inferred from the intersection, return them; otherwise null.
- "line_flag": true if the post indicates there is a queue/line; otherwise false.
- "oil_truck_flag": true if the post mentions an oil truck is present; otherwise false.
- "status": If the text indicates that the event has just begun (e.g. "the staff just put on the sticker") or is updated by comments showing a change (e.g. "back to normal" or "out of gas" or "no gas available" or "ended" or "not good" etc.), return "ended"; otherwise "unknown".
- "gas_price": The per-unit gas price as seen in the image or from the post. If not directly visible or inferable, return "unknown".

Input details:
- Post time: "{post_time}"
- Post text: "{text}"
- Image: Provided below (if available).

Please return only a valid JSON object with the fields described above.
"""
    # Print out the input prompt for debugging.
    print("\n=== Input Prompt to Ollama ===")
    print(prompt)

    # Download the image locally if an image URL is provided; the path is attached to the message.
    local_image_path = download_image(image_url) if image_url else None

    result = {}
    try:
        message = {"role": "user", "content": prompt}
        if local_image_path:
            message["images"] = [local_image_path]

        # temperature 0 keeps the extraction as repeatable as the model allows.
        response = ollama.chat(model="llama3.2-vision", messages=[message], options={"temperature": 0})

        # Debug: print the raw response from Ollama.
        print("\n=== Raw Ollama Response ===")
        print(response)

        # The JSON text is expected under response["message"]["content"].
        raw_output = ""
        if "message" in response and "content" in response["message"]:
            raw_output = response["message"]["content"].strip()

        if not raw_output:
            print("Empty output from Ollama model.")
        else:
            result = _parse_model_json(raw_output)
    except Exception as e:
        # Best effort: any failure in the call or in reading the response yields {}.
        print("Error calling or parsing response from Ollama model:", e)
        result = {}
    finally:
        # Always remove the temporary image, whether or not the call succeeded.
        if local_image_path and os.path.exists(local_image_path):
            os.remove(local_image_path)
    return result


In [None]:
import ollama
import json
import random
import time
import pickle
import os
import requests
import tempfile
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

# Facebook credentials and group URL (update with your details)
# Credentials are read from the FB_EMAIL / FB_PASSWORD environment variables so they never
# live in the notebook. A value already defined in the kernel is kept as a fallback. They are
# only used by the manual-login path that runs when no saved cookie file exists.
FB_EMAIL = os.environ.get("FB_EMAIL") or globals().get("FB_EMAIL", "")
FB_PASSWORD = os.environ.get("FB_PASSWORD") or globals().get("FB_PASSWORD", "")

GROUP_URL = "https://www.facebook.com/groups/1982935245273808/?sorting_setting=CHRONOLOGICAL"
COOKIES_FILE = "fb_cookies.pkl"

# A queue for image URLs (for later processing)
image_queue = []


# -------------------------------
# Main Scraping Code: Collect All Posts Then Batch Process
# -------------------------------

options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
# Reduce the automation fingerprints Chrome exposes to the page.
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
# Run headless only when a saved session exists: the manual-login fallback asks the user to
# complete 2FA in the browser, which needs a visible window.
if os.path.exists(COOKIES_FILE):
    options.add_argument("--headless")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
actions = ActionChains(driver)

# List to store all posts' data
posts_data = []

# Scrape the group feed, collect text and image URLs per post, then run each post through
# analyze_post. The browser is always closed at the end, even when a step raises.
try:
    # 1. Navigate to Facebook to set cookie domain.
    driver.get("https://www.facebook.com/")
    human_delay(2, 3)

    # 2. Load cookies if available.
    if os.path.exists(COOKIES_FILE):
        print("Loading cookies...")
        # SECURITY: pickle.load can execute arbitrary code. Only load a cookie file that this
        # notebook wrote itself; never one obtained from elsewhere.
        with open(COOKIES_FILE, "rb") as f:
            cookies = pickle.load(f)
        for cookie in cookies:
            # NOTE(review): presumably works around add_cookie rejecting sameSite='None' — confirm.
            if cookie.get('sameSite') == 'None':
                cookie['sameSite'] = 'Strict'
            try:
                driver.add_cookie(cookie)
            except Exception as e:
                print("Error adding cookie:", e)
        driver.refresh()
        human_delay(3, 5)
    else:
        print("No cookies found. Logging in manually...")
        email_input = driver.find_element(By.ID, "email")
        password_input = driver.find_element(By.ID, "pass")
        email_input.send_keys(FB_EMAIL)
        human_delay(1, 2)
        password_input.send_keys(FB_PASSWORD)
        human_delay(1, 2)
        password_input.send_keys(Keys.RETURN)
        human_delay(5, 7)
        input("Complete any 2FA in the browser, then press Enter to continue...")
        cookies = driver.get_cookies()
        with open(COOKIES_FILE, "wb") as f:
            pickle.dump(cookies, f)
        print("Cookies saved for future sessions.")

    # 3. Navigate to the target Facebook group.
    driver.get(GROUP_URL)
    human_delay(5, 7)

    # 4. Slowly scroll to load posts: three passes, each stepping down the page in random
    #    increments and then jumping to the bottom so more posts get loaded.
    for _ in range(3):
        current_height = driver.execute_script("return document.body.scrollHeight")
        scroll_increment = random.randint(300, 800)
        for pos in range(0, current_height, scroll_increment):
            driver.execute_script("window.scrollTo(0, arguments[0]);", pos)
            human_delay(0.2, 0.5)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        human_delay(3, 5)

    # 5. Locate post containers.
    posts = driver.find_elements(By.XPATH, "//div[@role='article']")
    print(f"Found {len(posts)} posts.")

    # For development, collect only the first MAX_POSTS posts.
    MAX_POSTS = 1
    posts = posts[:MAX_POSTS]
    print(f"Collecting only the first {len(posts)} posts for development.")

    # 6. Extract data from each post and store in posts_data.
    post_count = len(posts)
    for i in range(post_count):
        attempts = 0
        post_info = {"text": "", "image_urls": [], "post_time": ""}
        while attempts < 3:
            try:
                # Re-fetch posts to avoid stale element issues.
                posts = driver.find_elements(By.XPATH, "//div[@role='article']")
                if i >= len(posts):
                    # The page now has fewer articles than before; retry instead of letting
                    # posts[i] raise an uncaught IndexError that aborts the whole run.
                    print(f"Post {i+1} is no longer present on the page. Retrying...")
                    human_delay(1, 2)
                    attempts += 1
                    continue
                post = posts[i]
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", post)
                human_delay(1, 2)

                # Expand "See More" if available.
                try:
                    more_links = post.find_elements(By.XPATH, ".//a[contains(text(), 'See More')]")
                    if more_links:
                        actions.move_to_element(more_links[0]).perform()
                        human_delay(0.5, 1)
                        driver.execute_script("arguments[0].click();", more_links[0])
                        human_delay(2, 3)
                except Exception as e:
                    print(f"Error clicking 'See More' in post {i+1}: {e}")

                # Get post text (guard against a null innerText).
                post_text = driver.execute_script("return arguments[0].innerText;", post)
                post_info["text"] = (post_text or "").strip()

                # For demonstration, we use a placeholder for post time.
                post_info["post_time"] = "10 hours ago"  # Replace with actual extraction logic as needed.

                # Extract image URLs not already queued by an earlier post. They are collected
                # locally and added to image_queue only once this attempt succeeds; otherwise
                # a retry would see them as "already queued" and leave the post with no images.
                try:
                    image_urls = []
                    for img in post.find_elements(By.XPATH, ".//img"):
                        src = img.get_attribute("src")
                        if src and src not in image_queue and src not in image_urls:
                            image_urls.append(src)
                    post_info["image_urls"] = image_urls
                except Exception as e:
                    print(f"Error extracting images in post {i+1}: {e}")

                # Debug: Print snippet of post text and inner HTML.
                inner_html = post.get_attribute("innerHTML")
                print(f"\n=== Post {i+1} ===")
                print("Post text:")
                print(post_info["text"] if post_info["text"] else "(No text found)")
                print("Inner HTML snippet:")
                print(inner_html[:300] + "..." if len(inner_html) > 300 else inner_html)

                # Attempt succeeded: publish this post's images to the shared queue.
                image_queue.extend(post_info["image_urls"])
                break  # Successfully extracted post info.
            except StaleElementReferenceException:
                print(f"StaleElementReferenceException encountered in post {i+1}. Retrying...")
                human_delay(1, 2)
                attempts += 1
        else:
            # Reached only when all three attempts were used up without a break.
            print(f"Failed to process post {i+1} after several retries.")

        # Appended even on failure, so posts_data keeps one entry per post index.
        posts_data.append(post_info)

    print("\n=== Finished collecting post data ===")
    print(f"Total posts collected: {len(posts_data)}")

    # -------------------------------
    # Batch Process Posts using Ollama
    # -------------------------------
    print("\n=== Batch processing posts (using Ollama) ===")
    analysis_results = []
    for idx, post in enumerate(posts_data, start=1):
        # For image analysis, take the first image URL if available.
        image_url = post["image_urls"][0] if post["image_urls"] else None
        analysis = analyze_post(post["text"], image_url, post["post_time"])
        analysis_results.append(analysis)
        print(f"\n=== Analysis result for post {idx} ===")
        print(json.dumps(analysis, indent=2, ensure_ascii=False))
        human_delay(2, 4)

    # Optionally, display the full image queue.
    print("\n=== Image Queue (for later processing) ===")
    for idx, url in enumerate(image_queue, start=1):
        print(f"{idx}. {url}")

finally:
    driver.quit()


Loading cookies...
Found 56 posts.
Collecting only the first 1 posts for development.

=== Post 1 ===
Post text:
匿名成員
13小時
 
 · 
Petro Bovaird and Mississauga Rd. Was good at 9pm
所有心情：
3
3
1 個回應
讚好
回應
傳送
Rai Quan
Not good
11小時
讚好
回覆




以 Kevin Lam 的身分回應
Inner HTML snippet:
<div class="x78zum5 xdt5ytf" data-virtualized="false" style=""><div><div class="html-div xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x78zum5 x1n2onr6 xh8yej3"><div class="x1n2onr6 x1ja2u2z x1jx94hy x1qpq9i9 xdney7k xu5ydu1 xt3gfkd x9f619 xh8yej3 x6ikm8r x10wlt62 xquyuld" style=...

=== Finished collecting post data ===
Total posts collected: 1

=== Batch processing posts (using Ollama) ===

=== Input Prompt to Ollama ===
You are an expert tasked with extracting gas station information from social media posts.
You must respond with a JSON object ONLY, with no additional commentary or markdown formatting.

Extract the following fields:
- "station": The brand or name of the gas station mentioned i

In [36]:
# Sample Facebook CDN image URL kept for manual experiments; the analyze_post call in this
# notebook passes None instead of it. NOTE(review): the oh=/oe= query parameters look like
# signed-URL fields, so this link has presumably expired — confirm before reuse.
image_link  = "https://scontent-yyz1-1.xx.fbcdn.net/v/t39.30808-6/481478506_10233214677856295_4449808216704385674_n.jpg?stp=cp6_dst-jpegr_p526x296_tt6&_nc_cat=110&ccb=1-7&_nc_sid=aa7b47&_nc_ohc=KczbmWSByakQ7kNvgHFlwSF&_nc_oc=AdiynByPy9TZM0pCpN6pJ3C_zhM73O5sRut_oGXIs9wqZMDhDqIBe2hA8J06-e53JzI&_nc_zt=23&se=-1&_nc_ht=scontent-yyz1-1.xx&_nc_gid=AttlxX2sc-S58_aDQQsj5Wo&oh=00_AYCaeB-ajBhDu7bssX98KZ3Y-GBtF5wAScnIv66azJs9jg&oe=67C3FC47"

In [17]:
# Fixture: the innerText of one scraped post (poster line, age, post body, reaction counts,
# one comment), used to call analyze_post without launching the browser.
post_text = """匿名成員
12小時
 
 · 
Petro Bovaird and Mississauga Rd. Was good at 9pm
所有心情：
3
3
1 個回應
讚好
回應
傳送
Rai Quan
Not good
11小時
讚好
回覆




以 Kevin Lam 的身分回應"""

In [21]:
# Text-only run on the fixture: no image URL is passed and the post time is a hard-coded string.
analysis = analyze_post(post_text, None, "10 hours ago")


=== Input Prompt to Ollama ===
You are an expert tasked with extracting gas station information from social media posts.
You must respond with a JSON object ONLY, with no additional commentary or markdown formatting.

the general post structure is as follows the first post is the original post and the second post and later posts are  comment on the original post for extra information or updates:
``匿名成員 = name of the poster
12小時
 
 · 
Petro Bovaird and Mississauga Rd. Was good at 9pm = text of the post
所有心情：
3
3
1 個回應  = number of likes and comments
讚好
回應
傳送
Rai Quan = name of the commenter (if any) 
Not good = comment (update)
11小時 
讚好
回覆
``
Extract the following fields:
- "station": The brand or name of the gas station mentioned in the post or from the image (if any); otherwise null.
- "intersection": The road or intersection mentioned (e.g. an address or intersection); if not mentioned, null.
- "gps": If GPS coordinates (formatted as "lat,lon") can be inferred from the intersection,

In [19]:
# Show the dict returned by analyze_post for the fixture post.
analysis

{'station': 'Petro',
 'intersection': 'Bovaird and Mississauga Rd.',
 'gps': None,
 'line_flag': False,
 'oil_truck_flag': False,
 'status': 'ended',
 'gas_price': 'unknown'}