In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL
BASE_URL = "https://books.toscrape.com/catalogue/page-{}.html"

# Data storage
books_data = []

# Loop through all 50 pages
for page in range(1, 51):
    url = BASE_URL.format(page)
    response = requests.get(url)

    # If page does not exist, stop scraping
    if response.status_code != 200:
        break

    soup = BeautifulSoup(response.text, "html.parser")

    # Find all book containers
    books = soup.find_all("article", class_="product_pod")

    for book in books:
        # Title
        title = book.h3.a["title"]

        # Price
        price = book.find("p", class_="price_color").text.strip()

        # Availability
        availability = book.find("p", class_="instock availability").text.strip()

        # Star Rating (class has "star-rating Three", etc.)
        star_class = book.find("p")["class"]
        star_rating = star_class[1] if len(star_class) > 1 else "No Rating"

        books_data.append([title, price, availability, star_rating])

# Convert to DataFrame
df = pd.DataFrame(books_data, columns=["Title", "Price", "Availability", "Star Rating"])

# Save to CSV
df.to_csv("books.csv", index=False)

print("✅ Scraping complete! Data saved to books.csv")


✅ Scraping complete! Data saved to books.csv


In [5]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# --- 1. Setup Selenium with Anti-Detection & Auto-Driver ---
options = Options()
# options.add_argument('--headless')  # Keep commented out to see the browser working
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
options.add_argument("--window-size=1920,1080")

# Automatically download and install the correct Chrome driver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

movies_data = []

try:
    # --- 2. Access the Website ---
    url = "https://www.imdb.com/chart/top/"
    print("Accessing IMDB Top 250...")
    driver.get(url)

    # --- 3. Wait for Data to Load ---
    wait = WebDriverWait(driver, 15)
    # Wait until the movie list items are present in the DOM
    movie_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.ipc-metadata-list-summary-item")))

    print(f"Found {len(movie_elements)} movies. Starting extraction...")

    # --- 4. Extract Data Loop ---
    for movie in movie_elements:
        try:
            # A. Extract Title & Rank
            # The title is inside an h3 tag, e.g., "1. The Shawshank Redemption"
            title_element = movie.find_element(By.CSS_SELECTOR, "h3.ipc-title__text")
            full_text = title_element.text

            # Split Rank and Title
            if ". " in full_text:
                rank, title = full_text.split(". ", 1)
            else:
                rank = "N/A"
                title = full_text

            # B. Extract Metadata (Year, Duration, Rated)
            # The year is usually the first span inside the metadata container
            metadata_items = movie.find_elements(By.CSS_SELECTOR, "div.cli-title-metadata span")
            year = metadata_items[0].text if metadata_items else "N/A"

            # C. Extract Rating
            # The rating is inside a span with a specific class, e.g., "9.3 (2.8M)"
            rating_element = movie.find_element(By.CSS_SELECTOR, "span.ipc-rating-star")
            # We split by space to get just the number "9.3"
            rating = rating_element.text.split()[0]

            # Append to list
            movies_data.append({
                "Rank": rank,
                "Movie Title": title,
                "Year of Release": year,
                "IMDB Rating": rating
            })

            # Stop if we have enough (optional safety check)
            if len(movies_data) >= 250:
                break

        except Exception as e:
            # If a single movie fails, print error but continue to the next one
            # print(f"Skipping a movie due to error: {e}")
            continue

    # --- 5. Save Data to CSV ---
    if movies_data:
        df_imdb = pd.DataFrame(movies_data)
        df_imdb.to_csv('imdb_top250.csv', index=False)
        print(f"Success! {len(df_imdb)} movies saved to 'imdb_top250.csv'.")
    else:
        print("Failed to extract any movie data.")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Close the browser
    driver.quit()

Accessing IMDB Top 250...
Found 250 movies. Starting extraction...
Success! 250 movies saved to 'imdb_top250.csv'.


In [8]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

URL = "https://www.timeanddate.com/weather/"
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

resp = requests.get(URL, headers=HEADERS, timeout=15)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html.parser")

# Pick the first <table> that contains a degree symbol — likely the weather table
table = next((t for t in soup.find_all("table") if "°" in t.get_text()), None)
if table is None:
    raise SystemExit("Could not find the weather table. Page layout may have changed.")

rows = table.find_all("tr")

records = []
for tr in rows:
    text = tr.get_text(" ", strip=True)
    if "°" not in text:           # skip non-weather rows
        continue

    # City: first <a> text in the row (fallback to first cell text)
    a = tr.find("a")
    city = a.get_text(strip=True) if a else (tr.find("td").get_text(strip=True) if tr.find("td") else "")

    # Temperature: find e.g. 24 °C or 24°C
    temp_match = re.search(r"(-?\d+)\s*°\s*([CF])", text, flags=re.I)
    temperature = f"{temp_match.group(1)} °{temp_match.group(2).upper()}" if temp_match else ""

    # Condition: prefer icon alt text, else look for common words
    img = tr.find("img", alt=True)
    if img and img["alt"].strip():
        condition = img["alt"].strip()
    else:
        cond_match = re.search(r"\b(Clear|Cloudy|Sunny|Rain|Showers|Snow|Mist|Fog|Overcast|Thunder|Windy)\b", text, flags=re.I)
        condition = cond_match.group(0) if cond_match else ""

    records.append({"City Name": city, "Temperature": temperature, "Weather Condition": condition})

df = pd.DataFrame(records)
df = df[df["City Name"].astype(bool)]   # drop any rows without a city name

print(f"Found {len(df)} rows. Sample:")
print(df.head(10).to_string(index=False))

df.to_csv("weather.csv", index=False)
print("Saved to weather.csv")


Found 71 rows. Sample:
  City Name Temperature           Weather Condition
      Accra       27 °C       Passing clouds. Warm.
Addis Ababa       10 °C       Passing clouds. Cool.
   Adelaide       18 °C                       Cool.
    Algiers       17 °C       Passing clouds. Mild.
     Almaty        4 °C                Fog. Chilly.
      Amman       11 °C                Clear. Cool.
  Amsterdam        6 °C Passing clouds. Quite cool.
     Anadyr      -24 °C      Sunny. Extremely cold.
  Anchorage       -3 °C      Mostly cloudy. Chilly.
     Ankara        2 °C              Clear. Chilly.
Saved to weather.csv
