In [None]:
import os
import json
import time
import random
from threading import Thread, Lock
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from tqdm import tqdm

# =====================================================================================================

# Serializes access to the JSON output file, the counters below, and the
# shared set of already-processed links across the scraper threads.
lock = Lock()
# Number of save_data() calls completed so far.
save_counter = 0
# Number of movie records in the output file after the most recent save.
total_scraped = 0

# =====================================================================================================

# Colors (ANSI escape codes) used to tell the per-genre threads apart in the console.
colors = ["\033[91m", "\033[92m", "\033[93m", "\033[94m", "\033[95m", "\033[96m", "\033[97m"]
# ANSI sequence that restores the default terminal color.
RESET = "\033[0m"

# =====================================================================================================

def save_data(new_data, filename="imdb_review.json"):
    """Append *new_data* to the JSON array stored in *filename*.

    The whole read-merge-write cycle runs under the module-level ``lock`` so
    concurrent scraper threads cannot interleave their writes.

    Args:
        new_data: Iterable of JSON-serializable movie records to append.
        filename: Path of the JSON file holding a list of records.

    Side effects:
        Rewrites *filename*, increments the global ``save_counter`` and sets
        the global ``total_scraped`` to the new record count.
    """
    global save_counter, total_scraped
    new_items = list(new_data)
    with lock:
        old_data = []
        if os.path.exists(filename):
            with open(filename, "r", encoding="utf-8") as f:
                try:
                    loaded = json.load(f)
                except ValueError:
                    # Covers JSONDecodeError and UnicodeDecodeError: an
                    # unreadable file is treated as empty, as before.
                    loaded = []
            # Anything that is not a JSON array cannot be appended to.
            if isinstance(loaded, list):
                old_data = loaded

        all_data = old_data + new_items

        # Write to a sibling temp file and swap it in, so a crash in the
        # middle of the dump never truncates the previously saved dataset.
        tmp_name = f"{filename}.tmp"
        with open(tmp_name, "w", encoding="utf-8") as f:
            json.dump(all_data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_name, filename)

        save_counter += 1
        total_scraped = len(all_data)
        print(f"\n📁 SAVE #{save_counter}: Added {len(new_items)} movies | Total: {len(all_data)} movies\n")

# =====================================================================================================

def load_existing_links(filename="imdb_review.json"):
    """Return the set of movie URLs already stored in *filename*.

    Args:
        filename: Path of the JSON file written by the scraper.

    Returns:
        A set with the string ``"URL"`` value of every dict record in the
        file. An empty set is returned when the file is missing, is not
        valid JSON, or does not contain a JSON array. Individual malformed
        records are skipped instead of invalidating the whole file.
    """
    if not os.path.exists(filename):
        return set()

    with open(filename, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except ValueError:
            # JSONDecodeError / UnicodeDecodeError: nothing usable on disk.
            return set()

    if not isinstance(data, list):
        return set()

    return {
        record["URL"]
        for record in data
        if isinstance(record, dict) and isinstance(record.get("URL"), str)
    }

# =====================================================================================================

def get_driver():
    """Create a Chrome WebDriver configured for scraping.

    Images and notification prompts are disabled through Chrome preferences,
    and the automation switches are removed from the launch configuration.

    Returns:
        A started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.

    Raises:
        Exception: Whatever Selenium raises if the browser cannot be started
            or configured; the browser is shut down before re-raising.
    """
    options = webdriver.ChromeOptions()
    chrome_flags = (
        "--disable-blink-features=AutomationControlled",
        "--start-maximized",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--disable-extensions",
        "--disable-logging",
        "--disable-web-security",
        "--allow-running-insecure-content",
    )
    for flag in chrome_flags:
        options.add_argument(flag)

    prefs = {
        # 2 == block: skip image downloads and notification prompts.
        "profile.managed_default_content_settings.images": 2,
        "profile.default_content_setting_values.notifications": 2
    }
    options.add_experimental_option("prefs", prefs)
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(
        service=Service(executable_path=ChromeDriverManager().install()),
        options=options
    )

    # Anti-detection: hide navigator.webdriver. A plain execute_script() only
    # patches the document that is currently loaded and is lost on the first
    # navigation, so the override is registered to run before the scripts of
    # every new document instead.
    try:
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )
    except Exception:
        # Do not leak a running browser if the setup step fails.
        driver.quit()
        raise

    return driver

# =====================================================================================================

def scroll_js(driver, pixels=3000):
    """Scroll the current page vertically by *pixels* using JavaScript."""
    script = "window.scrollBy(0, {});".format(pixels)
    driver.execute_script(script)

# =====================================================================================================

def click_element_safely(driver, element):
    """Click *element*, preferring a JavaScript click over a native one.

    Args:
        driver: WebDriver used to run the JavaScript click.
        element: Element to click.

    Returns:
        True if either the JavaScript click or the fallback native click
        completed without raising, False if both failed.

    Only ``Exception`` subclasses are treated as click failures, so
    ``KeyboardInterrupt`` and ``SystemExit`` still stop the program.
    """
    try:
        driver.execute_script("arguments[0].click();", element)
        return True
    except Exception:
        pass  # fall through to the native click

    try:
        element.click()
        return True
    except Exception:
        return False

# =====================================================================================================

def close_rate_popup(driver):
    """Dismiss the first visible popup/modal close button, if there is one.

    The known close-button selectors are tried in order. The first one that
    resolves to a displayed element is clicked, followed by a short pause,
    and the search stops. Lookup or click failures are ignored because
    closing popups is best-effort; only ``Exception`` subclasses are
    swallowed, so ``KeyboardInterrupt`` still interrupts the scraper.

    Args:
        driver: WebDriver whose current page is inspected.
    """
    popup_selectors = (
        "button[data-testid='rate-dismiss']",
        "button[aria-label='Close']",
        ".ipc-promptable-base__close-button",
        ".ipc-modal__close-button",
    )
    for popup_selector in popup_selectors:
        try:
            popup_close = driver.find_element(By.CSS_SELECTOR, popup_selector)
            if not popup_close.is_displayed():
                continue
            click_element_safely(driver, popup_close)
            time.sleep(0.5)  # give the popup a moment to disappear
            break
        except Exception:
            # Selector not present (or element went stale): try the next one.
            continue

# =====================================================================================================

def get_all_movie_links(driver, category_url, color, *, max_scrolls=150, max_load_more_clicks=10):
    """Collect all movie links in a category.

    Opens *category_url*, then repeatedly scrolls, clicks the "load more"
    button (up to *max_load_more_clicks* times) and harvests title links
    until either *max_scrolls* passes were made or ten consecutive passes
    found nothing new while more than 200 links are already known.

    Args:
        driver: WebDriver used for browsing.
        category_url: Genre search URL; the text after the last ``=`` is
            used as the label in console output.
        color: ANSI color prefix for this category's console output.
        max_scrolls: Upper bound on scroll/harvest passes.
        max_load_more_clicks: Upper bound on "load more" clicks.

    Returns:
        A set of title URLs with their query string removed. Empty if the
        category page could not be loaded.
    """
    label = category_url.split('=')[-1].upper()

    try:
        driver.get(category_url)
        time.sleep(random.uniform(3, 5))
        close_rate_popup(driver)
    except Exception as e:
        print(f"{color}[ERROR] Failed to load {category_url}: {e}{RESET}")
        return set()

    # Candidate selectors, most specific first. Built once, not per pass.
    load_more_selectors = (
        "div.sc-5fb85acc-0 span button",
        ".ipc-see-more__button",
        "button.ipc-see-more__button",
        ".ipc-see-more button",
    )
    link_selectors = (
        ".dli-title.with-margin a",
        ".ipc-title-link-wrapper",
        "a.ipc-title-link-wrapper",
        ".cli-title a",
        "h3.ipc-title a",
    )

    all_links = set()
    see_more_count = 0
    no_new_products_count = 0

    print(f"{color}[{label}] Starting link collection...{RESET}")

    for _ in range(max_scrolls):
        # Scroll the page
        scroll_js(driver, 4000)
        time.sleep(random.uniform(2, 4))
        close_rate_popup(driver)

        # Find and click the "Load More" button
        if see_more_count < max_load_more_clicks:
            for selector in load_more_selectors:
                try:
                    load_more = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                    )
                    if load_more.is_displayed() and click_element_safely(driver, load_more):
                        see_more_count += 1
                        print(f"{color}[{label}] 'Load More' clicked {see_more_count}/{max_load_more_clicks}{RESET}")
                        time.sleep(random.uniform(3, 5))
                        break
                except Exception:
                    # Timed out or the element went stale: try the next selector.
                    continue

        # Collect the movie links
        current_links = set()
        for selector in link_selectors:
            try:
                for element in driver.find_elements(By.CSS_SELECTOR, selector):
                    href = element.get_attribute("href")
                    # Only title links that carry a query string are accepted.
                    if href and "/title/tt" in href and "?" in href:
                        # Clean the URL: drop the query string.
                        current_links.add(href.split("?")[0])
            except Exception:
                # A failure abandons this selector only; the others still run.
                continue

        new_products = current_links - all_links

        if new_products:
            all_links.update(new_products)
            no_new_products_count = 0
            print(f"{color}[{label}] + {len(new_products)} new | Total: {len(all_links)}{RESET}")
        else:
            no_new_products_count += 1

        # Stop once 10 passes in a row found nothing new and enough links were gathered.
        if no_new_products_count >= 10 and len(all_links) > 200:
            print(f"{color}[{label}] No new links for 10 iterations. Stopping...{RESET}")
            break

    print(f"{color}[{label}] ✅ Total links collected: {len(all_links)}{RESET}")
    return all_links

# =====================================================================================================

def _select_first(soup, selectors):
    """Return the first element matched by any of *selectors* (in order), or None."""
    for selector in selectors:
        node = soup.select_one(selector)
        if node:
            return node
    return None


def _find_reviews_link(driver):
    """Return a clickable link leading to the user-reviews page, or None."""
    css_candidates = (
        "a[href*='reviews']",
        "a[data-testid='reviews-header']",
    )

    link = None
    for selector in css_candidates:
        try:
            link = WebDriverWait(driver, 8).until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
            if link.is_displayed():
                break
        except Exception:
            continue

    # Also try with XPath
    if not link:
        try:
            link = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'User reviews') or contains(text(), 'reviews')]")))
        except Exception:
            pass

    return link


def _load_all_reviews(driver, color, genre, movie_name):
    """Scroll and click "load more" on the reviews page until no new reviews appear."""
    load_more_selectors = (
        ".ipc-see-more__button",
        "button.ipc-see-more__button",
        ".sc-e2b012eb-0 button",
        "button[data-testid='load-more-trigger']",
    )
    last_review_count = 0
    no_new_reviews_count = 0
    max_load_attempts = 15

    while no_new_reviews_count < max_load_attempts:
        # Scroll to the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.uniform(2, 4))
        close_rate_popup(driver)

        # Find the "Load More" button
        load_more_found = False
        for selector in load_more_selectors:
            try:
                load_more = driver.find_element(By.CSS_SELECTOR, selector)
                if load_more.is_displayed() and load_more.is_enabled():
                    if click_element_safely(driver, load_more):
                        load_more_found = True
                        time.sleep(random.uniform(3, 5))
                        break
            except Exception:
                continue

        if not load_more_found:
            no_new_reviews_count += 1

        # Check how many reviews are on the page right now
        current_count = len(driver.find_elements(By.CSS_SELECTOR, ".ipc-html-content-inner-div"))

        if current_count > last_review_count:
            last_review_count = current_count
            no_new_reviews_count = 0
            if current_count % 50 == 0:  # only report counts that are multiples of 50
                print(f"{color}[{genre}] 📝 {current_count} reviews loaded for {movie_name[:30]}{RESET}")
        else:
            # NOTE: a pass with neither a clickable button nor new reviews
            # advances the counter twice (here and above), so the loop can
            # end after roughly half of max_load_attempts idle passes.
            no_new_reviews_count += 1


def scrape_movie_reviews(driver, movie_url, color, genre):
    """Collect a movie's metadata and all of its user reviews.

    Args:
        driver: WebDriver used for browsing.
        movie_url: URL of the movie's title page.
        color: ANSI color prefix for console output.
        genre: Genre label stored in the record and shown in the output.

    Returns:
        None if the title page could not be loaded. Otherwise a dict with
        the keys ``URL``, ``Genre``, ``Name``, ``Star_Rating``,
        ``Description``, ``Reviews`` (list of ``{"Review", "Rating"}``
        dicts) and ``Review_Count``. A failure while collecting reviews is
        reported on the console and yields an empty or partial review list.
    """
    try:
        driver.set_page_load_timeout(45)
        driver.get(movie_url)
        time.sleep(random.uniform(2, 4))
        close_rate_popup(driver)
    except TimeoutException:
        print(f"{color}[{genre}] ⏱️  TIMEOUT: {movie_url[-15:]}{RESET}")
        return None
    except Exception as e:
        print(f"{color}[{genre}] ❌ ERROR: {movie_url[-15:]} - {e}{RESET}")
        return None

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    data = {"URL": movie_url, "Genre": genre}

    # Movie name
    name_node = _select_first(soup, (
        "span.hero__primary-text",
        "h1[data-testid='hero-title-block__title']",
        ".sc-b73cd867-0 h1",
        "h1.sc-b73cd867-0",
    ))
    data["Name"] = name_node.get_text(strip=True) if name_node else "Unknown"

    # Rating: only the first matching element is considered; if its text is
    # not a number the rating stays None.
    rating_node = _select_first(soup, (
        "span.ipc-rating-star--rating",
        ".sc-7ab21ed2-1 span",
        "[data-testid='hero-rating-bar__aggregate-rating__score'] span",
    ))
    data["Star_Rating"] = None
    if rating_node:
        try:
            data["Star_Rating"] = float(rating_node.get_text(strip=True))
        except ValueError:
            pass

    # Description
    description_node = _select_first(soup, (
        "span.sc-bf30a0e-2.bRimta",
        "[data-testid='plot-xs_to_m'] span",
        ".ipc-html-content-inner-div",
        "[data-testid='plot'] span",
    ))
    data["Description"] = description_node.get_text(strip=True) if description_node else None

    # Go to the User Reviews page
    reviews_list = []
    try:
        user_reviews_button = _find_reviews_link(driver)

        if user_reviews_button:
            click_element_safely(driver, user_reviews_button)
            time.sleep(random.uniform(3, 5))
            close_rate_popup(driver)

            # Scroll to get every review loaded
            _load_all_reviews(driver, color, genre, data["Name"])

            # Gather the reviews from the fully loaded page
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            reviews = soup.select(".ipc-html-content-inner-div")
            ratings = soup.select("span.ipc-rating-star--rating")

            print(f"{color}[{genre}] ✅ {len(reviews)} reviews scraped for '{data['Name'][:40]}'{RESET}")

            # NOTE(review): texts and ratings are paired purely by position
            # in two page-wide lists; presumably a review without a rating
            # shifts every later pairing — confirm against the page markup.
            for i, review in enumerate(reviews):
                review_text = review.get_text(strip=True)

                rating = None
                if i < len(ratings):
                    try:
                        rating = float(ratings[i].get_text(strip=True))
                    except (ValueError, AttributeError):
                        rating = None

                if review_text and len(review_text) > 10:  # skip very short reviews
                    reviews_list.append({
                        "Review": review_text,
                        "Rating": rating
                    })
        else:
            print(f"{color}[{genre}] ⚠️  No reviews button found for {data['Name'][:30]}{RESET}")

    except Exception as e:
        print(f"{color}[{genre}] ❌ Reviews scraping failed for {data['Name'][:30]}: {e}{RESET}")

    data["Reviews"] = reviews_list
    data["Review_Count"] = len(reviews_list)
    return data

# =====================================================================================================

def scrap_category(category_url, processed_links, list_data, position, color):
    """Scrape every movie of one category.

    Runs in its own browser: gathers the category's movie links, claims the
    ones not yet in *processed_links* (under the shared lock), scrapes each
    claimed movie and saves the results in batches of five. *list_data* and
    *position* are accepted but not used by this function.
    """
    genre = category_url.split('=')[-1].upper()
    driver = get_driver()

    try:
        # Step 1: gather all movie links of the category
        found_links = get_all_movie_links(driver, category_url, color)
        if not found_links:
            print(f"{color}[{genre}] ❌ No links found{RESET}")
            return

        # Step 2: drop links that were already handled and claim the rest
        with lock:
            pending = list(found_links.difference(processed_links))
            processed_links.update(pending)

        pending_total = len(pending)
        print(f"{color}[{genre}] 🎯 Processing {pending_total} new movies{RESET}")

        if pending_total == 0:
            print(f"{color}[{genre}] ℹ️  No new movies to process{RESET}")
            return

        # Step 3: scrape the claimed movies one by one
        successes = 0
        buffered = []

        for index, movie_link in enumerate(pending, start=1):
            try:
                record = scrape_movie_reviews(driver, movie_link, color, genre)
                usable = bool(record) and bool(record.get("Reviews"))

                if not usable:
                    print(f"{color}[{genre}] ⚠️  Skipped: {movie_link[-20:]} (No reviews){RESET}")
                else:
                    buffered.append(record)
                    successes += 1

                    print(f"{color}[{genre}] 📊 Progress: {index}/{len(pending)} | Successful: {successes} | Reviews: {record.get('Review_Count', 0)}{RESET}")

                    # Flush to disk after every 5 movies
                    if len(buffered) >= 5:
                        save_data(buffered.copy())
                        buffered.clear()

                # Random pause between movies
                time.sleep(random.uniform(1, 3))

            except Exception as e:
                print(f"{color}[{genre}] ❌ Error processing {movie_link[-20:]}: {e}{RESET}")

        # Step 4: save whatever is still buffered
        if buffered:
            save_data(buffered)

        print(f"{color}[{genre}] 🎉 COMPLETED: {successes}/{len(pending)} movies successfully scraped{RESET}")

    except Exception as e:
        print(f"{color}[{genre}] ❌ Category scraping failed: {e}{RESET}")
    finally:
        driver.quit()

# =====================================================================================================

# URLs (all genres). Only the uncommented entries are scraped; the main
# section starts one thread per active URL.
urls = [
    # "https://www.imdb.com/search/title/?genres=action",
    # "https://www.imdb.com/search/title/?genres=adventure",
    # "https://www.imdb.com/search/title/?genres=animation",
    # "https://www.imdb.com/search/title/?genres=biography",
    # "https://www.imdb.com/search/title/?genres=comedy",
    # "https://www.imdb.com/search/title/?genres=crime",
    # "https://www.imdb.com/search/title/?genres=documentary",
    # "https://www.imdb.com/search/title/?genres=drama",
    "https://www.imdb.com/search/title/?genres=family",
    "https://www.imdb.com/search/title/?genres=fantasy",
    "https://www.imdb.com/search/title/?genres=film-noir",
    "https://www.imdb.com/search/title/?genres=game-show",
    "https://www.imdb.com/search/title/?genres=history",
    "https://www.imdb.com/search/title/?genres=horror",
    "https://www.imdb.com/search/title/?genres=music",
    "https://www.imdb.com/search/title/?genres=musical",
    "https://www.imdb.com/search/title/?genres=mystery",
    # "https://www.imdb.com/search/title/?genres=news",
    # "https://www.imdb.com/search/title/?genres=reality-tv",
    # "https://www.imdb.com/search/title/?genres=romance",
    # "https://www.imdb.com/search/title/?genres=sci-fi",
    # "https://www.imdb.com/search/title/?genres=short",
    # "https://www.imdb.com/search/title/?genres=sport",
    # "https://www.imdb.com/search/title/?genres=talk-show",
    # "https://www.imdb.com/search/title/?genres=thriller",
    # "https://www.imdb.com/search/title/?genres=war",
    # "https://www.imdb.com/search/title/?genres=western"
]

# =====================================================================================================

if __name__ == "__main__":
    # Links already present in the output file are not scraped again.
    processed_links = load_existing_links()
    list_data = []

    startup_banner = (
        "🎬 Starting Enhanced IMDb Scraper",
        f"🎯 Genres: {len(urls)}",
        f"📊 Already processed: {len(processed_links)} movies",
        "🚀 Starting parallel scraping...\n",
    )
    for banner_line in startup_banner:
        print(banner_line)

    # One worker thread per genre URL, launched one second apart.
    threads = []
    for slot, genre_url in enumerate(urls):
        worker = Thread(
            target=scrap_category,
            args=(genre_url, processed_links, list_data, slot, colors[slot % len(colors)]),
        )
        worker.start()
        threads.append(worker)
        time.sleep(1)

    # Wait for every worker to finish.
    for number, worker in enumerate(threads, start=1):
        worker.join()
        print(f"✅ Thread {number} completed")

    # Final save of anything collected in the shared list.
    if list_data:
        save_data(list_data)

    print("\n🎉 ALL GENRES COMPLETED!")
    print("📁 Data saved to: imdb_review.json")
    print(f"📊 Total movies scraped: {total_scraped}")
    print(f"💾 Total saves performed: {save_counter}")

🎬 Starting Enhanced IMDb Scraper
🎯 Genres: 9
📊 Already processed: 730 movies
🚀 Starting parallel scraping...

[91m[FAMILY] Starting link collection...[0m
[92m[FANTASY] Starting link collection...[0m
[93m[FILM-NOIR] Starting link collection...[0m
[91m[FAMILY] 'Load More' clicked 1/8[0m
[94m[GAME-SHOW] Starting link collection...[0m
[92m[FANTASY] 'Load More' clicked 1/8[0m
[93m[FILM-NOIR] 'Load More' clicked 1/8[0m
[94m[GAME-SHOW] 'Load More' clicked 1/8[0m
[96m[HORROR] Starting link collection...[0m
[95m[HISTORY] Starting link collection...[0m
[96m[HORROR] 'Load More' clicked 1/8[0m
[95m[HISTORY] 'Load More' clicked 1/8[0m
[91m[MUSICAL] Starting link collection...[0m
[97m[MUSIC] Starting link collection...[0m
[91m[MUSICAL] 'Load More' clicked 1/8[0m
[97m[MUSIC] 'Load More' clicked 1/8[0m
[91m[FAMILY] + 100 new | Total: 100[0m
[92m[FANTASY] + 100 new | Total: 100[0m
[94m[GAME-SHOW] + 100 new | Total: 100[0m
[93m[FILM-NOIR] + 100 new | Total: 100[0m