# Web Scraping IMDb - Top 250 Séries

### Grupo: Giovani Cancherini, Eduardo Traunig, Vinicius Quintian, João Pedro Fossa
### Data de Entrega: 23/04/2024 (Turma 10)

---

## Tarefa 1 - Obter as 250 melhores séries do IMDb

In [16]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import time
import json
import re
import os

In [17]:
def sanitize_filename(filename):
    """Return *filename* with every `<>:"/\\|?*` character replaced by `_`.

    Args:
        filename: Text to clean.

    Returns:
        A copy of the text in which each listed character has become an
        underscore; all other characters are left untouched.
    """
    forbidden_chars = re.compile(r'[<>:"/\\|?*]')
    return forbidden_chars.sub('_', filename)

In [18]:
def setup_driver():
    """Create a Chrome WebDriver configured with the scraper's launch flags.

    The ChromeDriver binary path comes from ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance that was created.
    """
    opts = Options()
    # "--headless" runs the browser without a visible window; the remaining
    # flags are passed to Chrome in the same order as listed here.
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=opts)

In [None]:
def scrape_top_250_series():
    """Scrape the IMDb top TV chart and return one dict per series.

    Loads https://www.imdb.com/chart/toptv/ with the driver returned by
    ``setup_driver()``, waits a fixed 3 seconds, and parses at most the first
    250 list items found on the page.

    Returns:
        list[dict]: One entry per successfully parsed item, with the keys
        ``title``, ``year``, ``episodes``, ``rating`` and ``url``.
        ``year`` is an ``int`` when the scraped text is all digits, otherwise
        the raw text; ``rating`` is a ``float`` when the text is numeric,
        otherwise the raw text; ``episodes`` is ``"N/A"`` when the metadata
        row has fewer than two ``span`` elements.
        Items that fail to parse are reported on stdout and skipped.
    """
    driver = setup_driver()
    series_list = []

    # try/finally guarantees the browser is closed even if loading the page
    # or locating the list raises.
    try:
        driver.get("https://www.imdb.com/chart/toptv/")
        time.sleep(3)  # Fixed wait for the page to load

        series_elements = driver.find_elements(
            By.CSS_SELECTOR, ".ipc-metadata-list-summary-item__c"
        )

        for serie in series_elements[:250]:  # Keep only the first 250 items
            try:
                # Title: sanitize the element's *text* (not the element
                # itself, which is not a string) and drop the "<rank>. "
                # prefix only when a numeric rank is actually present.
                title_element = serie.find_element(By.CSS_SELECTOR, "h3.ipc-title__text")
                heading = sanitize_filename(title_element.text)
                rank, separator, name = heading.partition('. ')
                title = name if separator and rank.isdigit() else heading

                # Year and episode info: query the metadata spans once.
                metadata = serie.find_element(By.CSS_SELECTOR, ".cli-title-metadata")
                spans = metadata.find_elements(By.TAG_NAME, "span")
                year = spans[0].text
                episodes = sanitize_filename(spans[1].text) if len(spans) > 1 else "N/A"

                # Rating: keep only the part before any "(" in the text.
                rating_element = serie.find_element(
                    By.CSS_SELECTOR, "[data-testid='ratingGroup--imdb-rating']"
                )
                rating = rating_element.text.split('(')[0].strip()

                # Link to the series page.
                link = serie.find_element(
                    By.CSS_SELECTOR, "a.ipc-title-link-wrapper"
                ).get_attribute("href")

                series_list.append({
                    "title": title,
                    "year": int(year) if year.isdigit() else year,
                    "episodes": episodes,
                    "rating": float(rating) if rating.replace('.', '').isdigit() else rating,
                    "url": link,
                })

            except Exception as e:
                # Best effort: report the failing item and move on.
                print(f"Erro ao extrair dados da série: {e}")
                continue
    finally:
        driver.quit()

    return series_list

# NOTE(review): this launches the full scrape as soon as this cell runs; the
# execution cell further down makes the same call again — confirm both are
# needed.
top_series = scrape_top_250_series()

In [None]:
def save_series_data(series_data):
    """Write *series_data* as JSON to ``imdb_data/top_250_series.json``.

    The ``imdb_data`` directory is created when it does not exist yet. The
    file is UTF-8 encoded, indented by 4 spaces, and non-ASCII characters
    are written as-is rather than escaped.

    Args:
        series_data: JSON-serializable object to store.
    """
    target_dir = "imdb_data"
    os.makedirs(target_dir, exist_ok=True)

    target_path = "imdb_data/top_250_series.json"
    with open(target_path, "w", encoding="utf-8") as out_file:
        json.dump(series_data, out_file, ensure_ascii=False, indent=4)

In [None]:
# Run the collection of the top 250 series: scrape the chart, save the result
# with save_series_data, and report how many entries were collected.
top_series = scrape_top_250_series()
save_series_data(top_series)
print(f"Dados de {len(top_series)} séries coletados e salvos com sucesso!")

---

## Tarefa 2 - Obter detalhes adicionais de cada série

In [None]:
def _extract_popularity(driver):
    """Return the popularity value shown on the page currently loaded in *driver*.

    The value is an ``int`` when the text (commas removed) is all digits,
    otherwise the raw text. Raises whatever the element lookup raises.
    """
    popularity = driver.find_element(
        By.XPATH, "//div[contains(text(),'Popularity')]/following-sibling::div"
    ).text
    digits = popularity.replace(',', '')
    return int(digits) if digits.isdigit() else popularity


def _extract_cast(driver):
    """Return up to 10 ``{"actor", "character"}`` dicts from the loaded page.

    Raises whatever the element lookups raise; no partial list is returned.
    """
    cast_section = driver.find_element(By.CSS_SELECTOR, "section[data-testid='title-cast']")
    cast_elements = cast_section.find_elements(By.CSS_SELECTOR, ".ipc-metadata-list__item")

    cast_list = []
    for cast in cast_elements[:10]:  # Keep only the first 10 entries
        actor = cast.find_element(By.CSS_SELECTOR, "[data-testid='title-cast-item__actor']").text
        character = cast.find_element(By.CSS_SELECTOR, "[data-testid='title-cast-item__character']").text
        cast_list.append({"actor": actor, "character": character})
    return cast_list


def scrape_series_details(series_list):
    """Add ``popularity`` and ``cast`` to every series dict in *series_list*.

    Each series' ``url`` is opened with the driver returned by
    ``setup_driver()``, followed by a fixed 2-second pause between pages.
    The dicts are updated in place.

    Args:
        series_list: Iterable of dicts, each with a ``url`` key (and ideally a
            ``title`` key, used in error messages).

    Returns:
        The same *series_list* object. ``popularity`` falls back to ``"N/A"``
        and ``cast`` to ``[]`` when the corresponding data cannot be read.
    """
    driver = setup_driver()

    # try/finally guarantees the browser is closed even on unexpected errors.
    try:
        for serie in series_list:
            try:
                driver.get(serie["url"])
                time.sleep(2)  # Pause between requests to avoid being blocked

                # `except Exception` (rather than a bare `except`) keeps the
                # best-effort fallback without swallowing KeyboardInterrupt
                # or SystemExit.
                try:
                    serie["popularity"] = _extract_popularity(driver)
                except Exception:
                    serie["popularity"] = "N/A"

                try:
                    serie["cast"] = _extract_cast(driver)
                except Exception:
                    serie["cast"] = []

            except Exception as e:
                # Use .get so a record without "title" cannot raise inside
                # the error handler itself.
                label = serie.get("title", "?") if isinstance(serie, dict) else "?"
                print(f"Erro ao extrair detalhes de {label}: {e}")
                # Keep the output schema uniform for records whose page
                # could not be loaded.
                if isinstance(serie, dict):
                    serie.setdefault("popularity", "N/A")
                    serie.setdefault("cast", [])
                continue
    finally:
        driver.quit()

    return series_list

In [None]:
def save_full_data(series_data):
    """Write *series_data* as JSON to ``imdb_data/top_250_series_full.json``.

    The ``imdb_data`` directory is created when missing, so the function can
    be called on its own. The file is UTF-8 encoded, indented by 4 spaces,
    and non-ASCII characters are written as-is rather than escaped.

    Args:
        series_data: JSON-serializable object to store.
    """
    # Without this, open() raises FileNotFoundError when the directory has
    # not been created yet.
    os.makedirs("imdb_data", exist_ok=True)
    with open("imdb_data/top_250_series_full.json", "w", encoding="utf-8") as f:
        json.dump(series_data, f, indent=4, ensure_ascii=False)

In [None]:
# Run the collection of additional details. Requires the JSON file written by
# Task 1; otherwise only a message is printed.
if os.path.exists("imdb_data/top_250_series.json"):
    with open("imdb_data/top_250_series.json", "r", encoding="utf-8") as f:
        series_data = json.load(f)
    
    # Limited to the first 10 series for testing (remove the slice to collect all)
    detailed_series = scrape_series_details(series_data[:10])
    save_full_data(detailed_series)
    print("Detalhes adicionais coletados e salvos com sucesso!")
else:
    print("Arquivo 'top_250_series.json' não encontrado. Execute primeiro a Tarefa 1.")

---

## Limpeza e Transformação de Dados

In [None]:
def clean_and_transform_data():
    """Normalize the detailed series file and save a cleaned copy.

    Reads ``imdb_data/top_250_series_full.json``; when that file is missing,
    a message is printed and nothing is written. Otherwise, for every record:

    * ``title``: a leading ``<digits>.`` prefix is removed and the text is
      stripped of surrounding whitespace;
    * ``year``: an all-digit string becomes an ``int``;
    * ``episodes``: a string containing digits becomes the ``int`` of its
      first digit run;
    * ``rating``: a string that is numeric once dots are removed becomes a
      ``float``.

    Values that do not match are left as they are. The result is written to
    ``imdb_data/top_250_series_clean.json`` and a confirmation is printed.
    """
    # Guard clause: nothing to do without the input file.
    if not os.path.exists("imdb_data/top_250_series_full.json"):
        print("Arquivo 'top_250_series_full.json' não encontrado.")
        return

    with open("imdb_data/top_250_series_full.json", "r", encoding="utf-8") as source:
        records = json.load(source)

    for record in records:
        # Strip the ranking prefix from the title.
        without_rank = re.sub(r'^\d+\.\s*', '', record["title"])
        record["title"] = without_rank.strip()

        # Year as an integer when possible.
        year = record["year"]
        if isinstance(year, str) and year.isdigit():
            record["year"] = int(year)

        # Episodes as a number when the text contains one.
        episodes = record["episodes"]
        if isinstance(episodes, str):
            found = re.search(r'(\d+)', episodes)
            if found:
                record["episodes"] = int(found.group(1))

        # Rating as a float when possible.
        rating = record["rating"]
        if isinstance(rating, str) and rating.replace('.', '').isdigit():
            record["rating"] = float(rating)

    # Save the cleaned data.
    with open("imdb_data/top_250_series_clean.json", "w", encoding="utf-8") as target:
        json.dump(records, target, indent=4, ensure_ascii=False)

    print("Dados limpos e transformados salvos com sucesso!")

In [None]:
# Run the data cleaning and transformation step.
clean_and_transform_data()