# My IMDb Top 250 Movies Scraper Project


# Step 1: Scrape the Website and Store Raw Data
This step fetches the raw data from IMDb and saves it to a CSV file (`imdb_raw_data.csv`).

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Send a request to IMDb's Top 250 movies page
url = "https://www.imdb.com/chart/top/"
# Browser-like headers sent along with the request below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

# Without a timeout, requests waits indefinitely if the server never answers.
response = requests.get(url, headers=headers, timeout=30)

if response.status_code == 200:
    print("Successfully fetched the page!")
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all movie entries. soup.find() returns None when the container is
    # absent, so fall back to an empty list instead of failing on None.select().
    movie_table = soup.find('ul', class_='ipc-metadata-list')  # Parent container
    movie_items = movie_table.select('li.ipc-metadata-list-summary-item') if movie_table else []
    if not movie_items:
        print("Warning: no movie entries found; the page layout may have changed.")

    movies = []
    for movie in movie_items:
        # Extract title; strip the leading "<rank>. " prefix only when it is
        # really there, so unranked titles containing ". " stay intact.
        title_elem = movie.find('h3', class_='ipc-title__text')
        if title_elem:
            raw_title = title_elem.get_text(strip=True)
            rank, separator, remainder = raw_title.partition('. ')
            title = remainder if separator and rank.isdigit() else raw_title
        else:
            title = 'N/A'

        # Extract year (first metadata item). Match on the descriptive class
        # only: the extra "sc-..." classes look auto-generated and are likely
        # to change between site releases.
        metadata_items = movie.select('span.cli-title-metadata-item')
        year = metadata_items[0].get_text(strip=True) if metadata_items else 'N/A'

        # Extract rating
        rating_elem = movie.find('span', class_='ipc-rating-star--rating')
        rating = rating_elem.get_text(strip=True) if rating_elem else 'N/A'

        # Append to list
        movies.append({
            'title': title,
            'year': year,
            'rating': rating
        })

    # Store raw data. Naming the columns explicitly keeps the expected schema
    # even when nothing was scraped, so later steps can still index by column.
    df_raw = pd.DataFrame(movies, columns=['title', 'year', 'rating'])
    df_raw.to_csv('imdb_raw_data.csv', index=False)
    print("Raw data saved to 'imdb_raw_data.csv'")
else:
    # NOTE: df_raw is not defined on this path, so the later steps cannot run.
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

Successfully fetched the page!
Raw data saved to 'imdb_raw_data.csv'


# Step 2: Generate Initial Data Sample
I take the first 5 movies from my raw data to see what I scraped. I print them out and save them to `imdb_raw_sample.csv` so I can check them later.

In [15]:
# Preview the first five scraped rows and keep that preview on disk
sample_raw = df_raw.head(n=5)
print("Initial Data Sample:", sample_raw, sep="\n")
sample_raw.to_csv('imdb_raw_sample.csv', index=False)
print("Initial sample saved to 'imdb_raw_sample.csv'")

Initial Data Sample:
                      title  year rating
0  The Shawshank Redemption  1994    9.3
1             The Godfather  1972    9.2
2           The Dark Knight  2008    9.0
3     The Godfather Part II  1974    9.0
4              12 Angry Men  1957    9.0
Initial sample saved to 'imdb_raw_sample.csv'


# Step 3: Process the Data
I make a function to fix the data types. It turns the `year` into an integer (like "1994" to 1994) and the `rating` into a float (like "9.3" to 9.3). If a value cannot be converted to a number (for example "N/A"), I use 0 for the year and 0.0 for the rating. Then, I save the result to `imdb_processed_data.csv`.

In [7]:
# Data processing function
def process_data(df: "pd.DataFrame") -> "pd.DataFrame":
    """Return a copy of *df* with numeric ``year`` and ``rating`` columns.

    Values that cannot be parsed as numbers (such as the 'N/A' placeholder)
    become 0 in ``year`` and 0.0 in ``rating``. The input frame is left
    untouched; all other columns are carried over unchanged.

    Args:
        df: Frame containing at least ``year`` and ``rating`` columns.

    Returns:
        A new DataFrame with ``year`` as integers and ``rating`` as floats.

    Raises:
        KeyError: If ``year`` or ``rating`` is missing from *df*.
    """
    # Title is already clean (rank removed during scraping)
    # errors='coerce' turns unparseable text into NaN, which is then replaced
    # by the 0 / 0.0 placeholders before the integer cast.
    year = pd.to_numeric(df['year'], errors='coerce').fillna(0).astype(int)
    # Ratings are scraped as text, so convert them to floats as well.
    rating = pd.to_numeric(df['rating'], errors='coerce').fillna(0.0)
    # assign() builds a new frame, so the caller's DataFrame is never mutated.
    return df.assign(year=year, rating=rating)

# Convert a copy of the raw frame so df_raw keeps the scraped strings,
# then write the converted frame to disk.
df_processed = df_raw.copy().pipe(process_data)
df_processed.to_csv('imdb_processed_data.csv', index=False)
print("Processed data saved to 'imdb_processed_data.csv'")

Processed data saved to 'imdb_processed_data.csv'


# Step 4: Generate Processed Data Sample
I show the first 5 movies after processing to make sure the years and ratings are numbers now. I save them to `imdb_processed_sample.csv` too.

In [9]:
# Preview the first five processed rows and keep that preview on disk
sample_processed = df_processed.head(n=5)
print("Processed Data Sample:", sample_processed, sep="\n")
sample_processed.to_csv('imdb_processed_sample.csv', index=False)
print("Processed sample saved to 'imdb_processed_sample.csv'")


Processed Data Sample:
                      title  year  rating
0  The Shawshank Redemption  1994     9.3
1             The Godfather  1972     9.2
2           The Dark Knight  2008     9.0
3     The Godfather Part II  1974     9.0
4              12 Angry Men  1957     9.0
Processed sample saved to 'imdb_processed_sample.csv'


# Step 5: Clean the Data
I clean the data by removing rows with missing titles or ratings, getting rid of duplicates, and keeping only movies with years after 1900. I save this to `imdb_cleaned_data.csv`.

In [11]:
def clean_data(df):
    """Return the rows of *df* that describe a valid, unique movie.

    A row is kept only when:
      * ``title`` and ``rating`` are present (not NaN/None),
      * ``title`` is not the 'N/A' placeholder used for a missing title,
      * ``rating`` is greater than 0 (0.0 is the placeholder for a rating
        that could not be converted to a number),
      * ``year`` is greater than 1900 (this also rejects the 0 placeholder).

    Among the remaining rows, only the first occurrence of each title is
    kept. The input frame is not modified and the original index labels of
    the surviving rows are preserved.

    Args:
        df: Frame with ``title``, numeric ``year`` and numeric ``rating``.

    Returns:
        A filtered DataFrame.
    """
    # Remove rows with missing critical data
    df = df.dropna(subset=['title', 'rating'])
    # Placeholder values also count as missing: dropna() alone cannot see
    # them because they are real (non-NaN) values.
    is_valid = (
        (df['title'] != 'N/A')
        & (df['rating'] > 0)
        & (df['year'] > 1900)
    )
    df = df[is_valid]
    # De-duplicate last, so an invalid first occurrence of a title cannot
    # cause a valid later occurrence to be thrown away with it.
    df = df.drop_duplicates(subset=['title'])
    return df

# Clean a copy of the processed frame so df_processed itself stays intact,
# then write the cleaned rows to disk.
df_cleaned = df_processed.copy().pipe(clean_data)
df_cleaned.to_csv('imdb_cleaned_data.csv', index=False)
print("Cleaned data saved to 'imdb_cleaned_data.csv'")

Cleaned data saved to 'imdb_cleaned_data.csv'


# Step 6: Prepare Final Dataset
I sort the cleaned data by rating (highest first) to make it like the Top 250 list. I fix the numbering and save it to `imdb_final_data.csv`. Then, I show the top 5 to see the best movies I got.

In [13]:
# Sort by rating, highest first. Many movies share the same one-decimal
# rating, so use a stable sort: tied movies then keep the order they already
# had in df_cleaned instead of depending on the default quicksort.
df_final = (
    df_cleaned
    .sort_values(by='rating', ascending=False, kind='mergesort')
    .reset_index(drop=True)  # renumber rows 0..n-1 after sorting
)
df_final.to_csv('imdb_final_data.csv', index=False)
print("Final dataset saved to 'imdb_final_data.csv'")
print("Final Data Preview:")
print(df_final.head())

Final dataset saved to 'imdb_final_data.csv'
Final Data Preview:
                      title  year  rating
0  The Shawshank Redemption  1994     9.3
1             The Godfather  1972     9.2
2           The Dark Knight  2008     9.0
3     The Godfather Part II  1974     9.0
4              12 Angry Men  1957     9.0
