In [None]:
import gspread
from google.oauth2.service_account import Credentials
import tkinter as tk
from tkinter import messagebox, filedialog, ttk
from tkinter import font as tkfont
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import os
import time
import threading
from urllib.parse import urlparse
import re
from datetime import datetime


## Variables

In [None]:
# Shared Selenium WebDriver handle; None while no browser session is open.
# Assigned by start_scraping() and open_facebook_login(), reset to None by
# stop_scraping().
driver = None

## Google Sheet Credentials

In [None]:
# Service-account key file and spreadsheet id. Both can be overridden with
# environment variables so the notebook is not tied to a single machine; the
# previous hard-coded values are kept as defaults.
json_path = os.environ.get(
    "SBS_GSHEET_CREDENTIALS",
    "C:\\Users\\Yahya Negm\\Downloads\\scraping-googel.json",
)
scope = ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/drive"]
creds = Credentials.from_service_account_file(json_path, scopes=scope)
client = gspread.authorize(creds)
spreadsheet_id = os.environ.get("SBS_GSHEET_ID", "1m34SuaD_LMjfsEpn8wjwuAyYVHfy7C0jM3BM1Gg24Mk")
sheet = client.open_by_key(spreadsheet_id).worksheet("Facebook")

# One dict per sheet row, keyed by the header row.
data = sheet.get_all_records()

# Login table: username -> password, both as strings so they can be compared
# with the text typed into the login form.
# Rows with a blank Username (e.g. rows that only carry a Url/Name pair) are
# skipped; otherwise they would create an entry "" -> "" and an empty login
# form would be accepted.
credentials = {
    str(entry["Username"]): str(entry.get("Password", ""))
    for entry in data
    if str(entry.get("Username", "")).strip()
}

# Groups offered in the URL listbox. Rows without a URL are skipped so every
# listbox line can actually be opened. url_display_values is derived from
# url_list, so listbox indices map directly onto url_list.
url_list = [
    {"Url": row["Url"], "Name": row["Name"]}
    for row in data
    if "Name" in row and str(row.get("Url", "")).strip()
]
url_display_values = [f"{item['Name']} - {item['Url']}" for item in url_list]


## Check log-in


In [None]:
def verify_login():
    """Check the login form against ``credentials`` and switch to the scraper view.

    Reads the username and password entries. On a match it shows a success
    dialog, hides the login frame, shows the scraper frame and resizes the
    window to 700x800 centred on the screen. Otherwise an error dialog is shown
    and the login form stays visible.
    """
    username = username_entry.get()
    password = password_entry.get()

    # A blank username is never a valid account. Rejecting it here keeps an
    # empty form from matching a blank row loaded from the sheet.
    if not username.strip() or credentials.get(username) != password:
        messagebox.showerror("Login Failed", "Invalid Username or Password")
        return

    messagebox.showinfo("Login Success", "Welcome!")
    login_frame.pack_forget()
    open_scraper_window()

    # Enlarge the window for the scraper view and centre it on the screen.
    window_width = 700
    window_height = 800
    x = (root.winfo_screenwidth() // 2) - (window_width // 2)
    y = (root.winfo_screenheight() // 2) - (window_height // 2)
    root.geometry(f"{window_width}x{window_height}+{x}+{y}")
    root.resizable(False, False)  # Lock the window size to prevent resizing


## extract text

In [None]:
def extract_text(element, xpath, default_value):
    """Return the stripped text of the node located by ``xpath`` from ``element``.

    Waits up to 5 seconds for the node to be present. If the wait times out,
    ``default_value`` is returned instead.
    """
    locator = (By.XPATH, xpath)
    try:
        node = WebDriverWait(element, 5).until(EC.presence_of_element_located(locator))
        return node.text.strip()
    except TimeoutException:
        return default_value

## extract link

In [None]:
def extract_link(element, xpath, default_value):
    """Return the ``href`` of the node located by ``xpath``, without its query string.

    Waits up to 5 seconds for the node to be present.

    Returns ``default_value`` when the wait times out or when the node has no
    usable ``href`` (missing or empty attribute).
    """
    try:
        node = WebDriverWait(element, 5).until(EC.presence_of_element_located((By.XPATH, xpath)))
        link = node.get_attribute('href')
    except TimeoutException:
        return default_value

    # get_attribute() yields None for an anchor without href; calling .split()
    # on it would raise and make the caller discard the whole post.
    if not link:
        return default_value
    return link.split("?")[0]

## extract photo_links

In [None]:
def extract_photo_links(element, xpath):
    """Return the ``src`` values of all nodes located by ``xpath`` from ``element``.

    Waits up to 5 seconds for at least one node to be present and returns an
    empty list when the wait times out. Nodes without a ``src`` value are
    skipped, so the result contains only non-empty strings and is safe to join.
    """
    try:
        images = WebDriverWait(element, 5).until(EC.presence_of_all_elements_located((By.XPATH, xpath)))
        sources = [image.get_attribute("src") for image in images]
    except TimeoutException:
        return []

    # get_attribute() yields None when the attribute is absent; a None in the
    # list would break the ", ".join(...) done by the caller.
    return [src for src in sources if src]

## Scrape Posts

In [None]:
# Columns of Facebook_Data.xlsx: the scraped fields plus the extraction stamps.
_POST_BASE_COLUMNS = ["group_name", "poster_name", "text", "post_link", "profile_link", "photo_links"]
_POST_ALL_COLUMNS = _POST_BASE_COLUMNS + ["Extract-Day", "Extract-Time"]

# Placeholder returned by extract_text() when a post has no text node.
_NO_TEXT_PLACEHOLDER = "No text found in specified class."

# XPath of one post container in the group feed.
_POST_CONTAINER_XPATH = '//div[contains(@class, "x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z")]'


def _read_post_fields(post):
    """Read author, text, links and photo URLs from a single post element."""
    return {
        "poster_name": extract_text(post, './/span[contains(@class, "xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x1hl2dhg x16tdsg8 x1vvkbs")]', "Name not found"),
        "text": extract_text(post, './/div[contains(@class, "xdj266r x11i5rnm xat24cr x1mh8g0r x1vvkbs x126k92a")]', _NO_TEXT_PLACEHOLDER),
        "profile_link": extract_link(post, './/a[contains(@href, "/profile.php") or contains(@href, "/groups/")]', "Profile link not found"),
        "post_link": extract_link(post, './/a[contains(@href, "/posts/")]', "Post link not found"),
        "photo_links": extract_photo_links(post, './/img[contains(@class, "xz74otr x1ey2m1c xds687c x5yr21d x10l6tqk x17qophe x13vifvy xh8yej3")]'),
    }


def _save_posts(existing_df, records, output_filename):
    """Write existing rows plus ``records`` to the workbook; return the new rows as a DataFrame."""
    df = pd.DataFrame(records, columns=_POST_ALL_COLUMNS)
    final_df = pd.concat([existing_df, df], ignore_index=True)
    final_df.to_excel(output_filename, index=False, engine='openpyxl')
    log_message(f"Data saved to {output_filename}")
    return df


def scrape_post_text(driver, num_posts, file_path, url, group_name, max_idle_rounds=5):
    """Scrape up to ``num_posts`` new posts from the page currently loaded in ``driver``.

    New rows are appended to ``Facebook_Data.xlsx`` inside ``file_path``. A post
    counts as new when its (poster_name, text) pair is neither in that workbook
    nor already collected during this call; posts without text are ignored.

    Args:
        driver: Selenium WebDriver already showing the group page.
        num_posts: Number of new posts to collect.
        file_path: Folder holding the workbook; created if missing.
        url: Unused; kept so existing callers keep working.
        group_name: Value stored in the ``group_name`` column.
        max_idle_rounds: Stop after this many consecutive passes over the feed
            that produced no new post, instead of scrolling forever when the
            group has fewer new posts than requested.

    Returns:
        DataFrame with the rows collected during this call.

    Raises:
        Any exception escaping the scroll loop (e.g. a TimeoutException when no
        post container appears within 10 seconds, or a WebDriver error after
        the browser was closed). Rows collected so far are saved first.
    """
    driver.execute_script("document.body.style.zoom='30%'")

    os.makedirs(file_path, exist_ok=True)
    output_filename = os.path.join(file_path, "Facebook_Data.xlsx")

    # Load existing data if the file exists
    if os.path.exists(output_filename):
        existing_df = pd.read_excel(output_filename, engine='openpyxl')
    else:
        existing_df = pd.DataFrame(columns=_POST_BASE_COLUMNS)

    # (poster_name, text) pairs already known. A set lookup replaces rebuilding
    # and filtering a combined DataFrame for every single post.
    seen = set()
    if {"poster_name", "text"} <= set(existing_df.columns):
        seen = set(zip(existing_df["poster_name"], existing_df["text"]))

    records = []
    idle_rounds = 0
    try:
        while len(records) < num_posts:
            collected_before = len(records)
            posts = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.XPATH, _POST_CONTAINER_XPATH))
            )
            for post in posts:
                if len(records) >= num_posts:
                    break
                try:
                    driver.execute_script("arguments[0].scrollIntoView(true);", post)
                    time.sleep(1)  # give lazily rendered post content time to appear
                    fields = _read_post_fields(post)
                    post_text = fields["text"]
                    key = (fields["poster_name"], post_text)
                    if not post_text or post_text == _NO_TEXT_PLACEHOLDER or key in seen:
                        continue

                    now = datetime.now()
                    seen.add(key)
                    records.append({
                        "group_name": group_name,
                        "poster_name": fields["poster_name"],
                        "text": post_text,
                        "post_link": fields["post_link"],
                        "profile_link": fields["profile_link"],
                        "photo_links": ", ".join(fields["photo_links"]),
                        # "%p" appends AM/PM to the day string; kept unchanged so
                        # new rows match the format already used in the workbook.
                        "Extract-Day": now.strftime('%d-%m-%Y %p'),
                        "Extract-Time": now.strftime('%H:%M:%S'),
                    })
                    log_message(f"Scraped post from: {fields['poster_name']}")
                except Exception as e:
                    # One broken post must not abort the whole run.
                    log_message(f"Error processing post: {e}")

            # Detect a feed that no longer yields anything new.
            if len(records) == collected_before:
                idle_rounds += 1
                if idle_rounds >= max_idle_rounds:
                    log_message(
                        f"No new posts found after {idle_rounds} attempts; "
                        f"stopping with {len(records)} of {num_posts} posts."
                    )
                    break
            else:
                idle_rounds = 0

            # Scroll down and jump to the last loaded post to trigger loading more.
            driver.execute_script("window.scrollBy(0, 800);")
            posts = driver.find_elements(By.XPATH, _POST_CONTAINER_XPATH)
            if posts:
                driver.execute_script("arguments[0].scrollIntoView(true);", posts[-1])
    except Exception:
        # Keep what was scraped before the failure, then let the caller log it.
        if records:
            _save_posts(existing_df, records, output_filename)
        raise

    # Combine new data with existing data
    return _save_posts(existing_df, records, output_filename)


## Logs

In [None]:
def log_message(message):
    """Append ``message`` as a new line to the log widget and scroll it into view."""
    line = message + '\n'
    end_of_log = tk.END
    log_text.insert(end_of_log, line)
    log_text.see(end_of_log)

## start Button

In [None]:
def _launch_chrome():
    """Start Chrome with the persistent profile in ``~/User_Data`` and maximise it."""
    user_data_dir = os.path.join(os.path.expanduser('~'), 'User_Data')
    os.makedirs(user_data_dir, exist_ok=True)
    options = webdriver.ChromeOptions()
    options.add_argument(f"user-data-dir={user_data_dir}")
    browser = webdriver.Chrome(service=Service(), options=options)
    browser.maximize_window()
    return browser


def _session_is_alive(browser):
    """Return True when ``browser`` still has an open window and answers commands."""
    try:
        return bool(browser.window_handles)
    except Exception:
        return False


def start_scraping():
    """Scrape every group selected in the listbox, one after the other.

    Validates the form (at least one selected URL, a numeric post count and an
    output folder), makes sure a browser session exists, then opens each
    selected group and calls ``scrape_post_text`` for it. Errors on one group
    are logged and the next group is tried. The browser is closed at the end
    and the shared ``driver`` is reset to None.

    A browser left open by ``open_facebook_login`` is reused: starting a second
    Chrome on the same profile directory while the first is running is not
    reliable. A dead session (window closed by the user) is replaced.
    """
    global driver
    selected_indices = url_listbox.curselection()

    if not selected_indices:
        log_message("Please select at least one URL.")
        return

    num_posts = num_posts_entry.get()
    file_path = file_path_entry.get()

    if not num_posts.isdigit() or not file_path:
        log_message("Please enter the number of posts and file path.")
        return

    num_posts = int(num_posts)

    # Initialize the driver (or reuse the one that is already open)
    try:
        if driver is not None and not _session_is_alive(driver):
            try:
                driver.quit()
            except Exception:
                pass  # best effort: the session is already gone
            driver = None
        if driver is None:
            driver = _launch_chrome()
    except Exception as e:
        log_message(f"Failed to initialize WebDriver: {e}")
        return

    # Work on a local reference: stop_scraping() may reset the global at any time.
    browser = driver

    # Main scraping loop
    try:
        for index in selected_indices:
            if driver is not browser:
                # stop_scraping() closed the browser; do not touch further URLs.
                log_message("Scraping stopped before all selected URLs were processed.")
                break

            selected_url = url_list[index]["Url"]
            group_name = url_list[index]["Name"]  # Group name as a column

            try:
                browser.get(selected_url)
                log_message(f"Starting extraction for: {selected_url} (Group: {group_name})")
                scrape_post_text(browser, num_posts=num_posts, file_path=file_path, url=selected_url, group_name=group_name)
                log_message(f"Extraction completed for {selected_url} (Group: {group_name}).")
            except Exception as e:
                log_message(f"An error occurred on {selected_url} (Group: {group_name}): {e}")
    except Exception as e:
        log_message(f"Error during scraping loop: {e}")
    finally:
        # Only close the session if it is still ours; stop_scraping() may have
        # closed it already, in which case the global is None.
        if driver is browser:
            try:
                browser.quit()
            except Exception as e:
                log_message(f"Error while closing the browser: {e}")
            driver = None
        log_message("Scraping completed for all selected URLs.")


def start_scraping_thread():
    """Run ``start_scraping`` on a background thread so the GUI stays responsive.

    Only one scraping thread is allowed at a time. While a previous run is
    still active, a further click is logged and ignored; otherwise two runs
    would share the same global ``driver`` and the same Chrome profile.
    """
    worker = getattr(start_scraping_thread, "_worker", None)
    if worker is not None and worker.is_alive():
        log_message("Scraping is already in progress.")
        return

    thread = threading.Thread(target=start_scraping)
    # Remember the thread on the function object so the next click can check it.
    start_scraping_thread._worker = thread
    thread.start()
    

## stop Button

In [None]:
def stop_scraping():
    """Close the browser, if one is open, and reset the shared ``driver`` to None.

    The global is cleared before ``quit()`` is attempted, so a session that is
    already dead (and makes ``quit()`` raise) cannot leave a stale handle
    behind. A failure to close is logged instead of escaping the button callback.
    """
    global driver
    if driver is None:
        log_message("Scraping not in progress.")
        return

    browser, driver = driver, None
    try:
        browser.quit()
    except Exception as e:
        log_message(f"Browser did not close cleanly: {e}")
    log_message("Scraping has been stopped and browser closed.")

## Browse Button

In [None]:

def browse_path():
    """Ask the user for a folder and put the choice into the file-path entry.

    The entry is left untouched when the dialog returns an empty selection.
    """
    chosen_dir = filedialog.askdirectory()
    if not chosen_dir:
        return
    file_path_entry.delete(0, tk.END)
    file_path_entry.insert(0, chosen_dir)

## Open Facebook Button

In [None]:
def open_facebook_login():
    """Open the Facebook login page in the shared browser so the user can sign in.

    Starts Chrome with the persistent profile in ``~/User_Data`` when no usable
    session exists. A leftover ``driver`` whose window was closed, or which was
    already quit, is discarded and replaced instead of being reused. Failures
    are written to the log rather than raised from the button callback.
    """
    global driver
    try:
        if driver is not None:
            # A closed window or a quit session makes every command fail.
            try:
                alive = bool(driver.window_handles)
            except Exception:
                alive = False
            if not alive:
                try:
                    driver.quit()
                except Exception:
                    pass  # best effort: the session is already gone
                driver = None

        if driver is None:
            user_data_dir = os.path.join(os.path.expanduser('~'), 'User_Data')
            options = webdriver.ChromeOptions()
            service = Service()
            os.makedirs(user_data_dir, exist_ok=True)
            options.add_argument(f"user-data-dir={user_data_dir}")
            driver = webdriver.Chrome(service=service, options=options)
            driver.maximize_window()

        driver.get("https://www.facebook.com/login")
    except Exception as e:
        log_message(f"Failed to open Facebook login page: {e}")
        return
    log_message("Opened Facebook login page. Please log in.")

In [None]:
def create_rounded_frame(parent, text):
    """Build a white, bordered frame under ``parent`` with ``text`` as its caption.

    The caption label is packed inside the frame; the frame itself is returned
    unpacked so the caller decides where it goes.

    NOTE(review): an identical definition follows in the "Frame design" cell
    and replaces this one when the notebook is run top to bottom.
    """
    container = tk.Frame(
        parent,
        bg="#ffffff",
        highlightthickness=2,
        highlightbackground="#e2e8f0",
        padx=10,
        pady=10,
    )
    caption = tk.Label(
        container,
        text=text,
        font=("Helvetica", 10),
        bg="#ffffff",
        fg="#4a5568",
    )
    caption.pack(anchor="w", padx=10)
    return container

## Frame design

In [None]:
def create_rounded_frame(parent, text):
    """Return a bordered white ``tk.Frame`` under ``parent`` captioned with ``text``.

    The caption is a left-anchored label packed into the frame. The frame is
    returned without being packed.

    NOTE(review): this repeats the definition from the previous cell; being
    the later one, it is the definition in effect after a top-to-bottom run.
    """
    frame_options = {
        "bg": "#ffffff",
        "highlightthickness": 2,
        "highlightbackground": "#e2e8f0",
        "padx": 10,
        "pady": 10,
    }
    label_options = {
        "text": text,
        "font": ("Helvetica", 10),
        "bg": "#ffffff",
        "fg": "#4a5568",
    }
    frame = tk.Frame(parent, **frame_options)
    tk.Label(frame, **label_options).pack(anchor="w", padx=10)
    return frame

## Function: switch to the scraper GUI

In [None]:
def open_scraper_window():
    """Show the scraper settings frame, letting it fill and expand with the window."""
    pack_options = {"fill": "both", "expand": True}
    scraper_frame.pack(**pack_options)

## GUI

In [None]:
# Main application window.
root = tk.Tk()
root.title("SBS Tool ")
##root.iconbitmap("C:\\Users\\Khaled\\Desktop\\sbs_logo.ico")
root.configure(bg="#f8f9fa")


# Initial size for the login screen; verify_login() enlarges the window to
# 700x800 after a successful login.
window_width = 700
window_height = 270
screen_width = root.winfo_screenwidth()
screen_height = root.winfo_screenheight()
# Top-left corner that centres the window on the screen.
x = (screen_width // 2) - (window_width // 2)
y = (screen_height // 2) - (window_height // 2)
root.geometry(f"{window_width}x{window_height}+{x}+{y}")
root.resizable(False, False) 


# Header: title label shown at the top of the window.
header = tk.Label(root, text="SBS Facebook Data ", font=("Helvetica", 15, "bold"), bg="#f8f9fa", fg="#3b82f6")
header.pack(pady=20)

# Login Frame: username/password form checked by verify_login().
login_frame = create_rounded_frame(root, "Login")
login_frame.pack(padx=20, pady=10, fill="x")

# Username field.
username_entry = ttk.Entry(login_frame, width=40)
username_entry.pack(pady=5, ipady=4)
# Password field; typed characters are displayed as "*".
password_entry = ttk.Entry(login_frame, show="*", width=40)
password_entry.pack(pady=5, ipady=4)
login_button = tk.Button(login_frame, text="Login", command=verify_login, bg="#3b82f6", fg="white", font=("Helvetica", 10, "bold"), padx=20)
login_button.pack(pady=10)

# Scraper Frame: holds all scraping controls. It is packed here, hidden again
# with pack_forget() at the end of this cell, and shown by
# open_scraper_window() after login.
scraper_frame = create_rounded_frame(root, "Scraper Settings")
scraper_frame.pack(padx=20, pady=10, fill="x")

# URL Listbox: multi-select list of groups. start_scraping() reads the selected
# indices and uses them to index url_list.
url_label = tk.Label(scraper_frame, text="Select Facebook Group URLs:", font=("Helvetica", 10), bg="#ffffff", fg="#4a5568")
url_label.pack(anchor="w", padx=10)
url_listbox = tk.Listbox(scraper_frame, selectmode=tk.MULTIPLE, width=50, height=10, bg="#f1faee", bd=1, relief="flat")
url_listbox.pack(padx=10, pady=5)

# Fill the listbox in url_list order so listbox indices match url_list indices.
for item in url_display_values:
    url_listbox.insert(tk.END, item)

# Row: label + entry for the number of posts to collect per URL.
num_posts_frame = tk.Frame(scraper_frame, bg="#ffffff")
num_posts_frame.pack(anchor="w", padx=12, pady=5, fill="x")

# Label inside the frame
num_posts_label = tk.Label(num_posts_frame, text="Enter number of posts per URL:", font=("Helvetica", 10), bg="#ffffff", fg="#4a5568")
num_posts_label.pack(side="left")

# Entry inside the same frame; start_scraping() requires digits only.
num_posts_entry = ttk.Entry(num_posts_frame, width=40)
num_posts_entry.pack(side="left", padx=10, ipady=4)

# Row: label + entry + Browse button for the output folder.
file_path_frame = tk.Frame(scraper_frame, bg="#ffffff")
file_path_frame.pack(anchor="w", padx=12, pady=5, fill="x")

# Label for file path inside the frame
file_path_label = tk.Label(file_path_frame, text="Enter file path to save data:", font=("Helvetica", 10), bg="#ffffff", fg="#4a5568")
file_path_label.pack(side="left")

# Entry for file path inside the same frame
file_path_entry = ttk.Entry(file_path_frame, width=40)
file_path_entry.pack(side="left", padx=35, ipady=4)

# Browse button inside the same frame; browse_path() fills the entry above.
browse_button = tk.Button(file_path_frame, text="Browse", command=browse_path, bg="#3b82f6", fg="white", font=("Helvetica", 10, "bold"))
browse_button.pack(side="left", padx=10)

# Control Buttons
# Opens the Facebook login page in the shared browser.
login_facebook_button = tk.Button(scraper_frame, text="Log In to Facebook", command=open_facebook_login, bg="#3b82f6", fg="white", font=("Helvetica", 10, "bold"), padx=20)
login_facebook_button.pack(pady=5)

# Starts scraping on a background thread (start_scraping_thread).
start_button = tk.Button(scraper_frame, text="Start Scraping", command=start_scraping_thread, bg="#2a9d8f", fg="white", font=("Helvetica", 10, "bold"), padx=20)
start_button.pack(pady=10)

# Quits the browser via stop_scraping().
stop_button = tk.Button(scraper_frame, text="Stop Scraping", command=stop_scraping, bg="#e63946", fg="white", font=("Helvetica", 10, "bold"), padx=20)
stop_button.pack(pady=5)

# Log display: text widget that log_message() appends to.
log_frame = create_rounded_frame(root, "Logs")
log_frame.pack(padx=20, pady=10, fill="x")
log_text = tk.Text(log_frame, wrap="word", height=10, width=60, bg="#f1faee", font=("Helvetica", 9), bd=0, relief="flat")
log_text.pack(pady=10)
log_text.insert(tk.END, "Logs:\n")

scraper_frame.pack_forget()  # Hide the scraper frame initially


## Run GUI

In [None]:
# Enter the Tk event loop; this cell keeps running while the GUI is in use.
root.mainloop()