## Scrape the IMDb website to find the top 250 movies

In [17]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import json

def scrape_imdb_top_movies(url, timeout=10):
    """Download an IMDb chart page and return its movies as a DataFrame.

    The movie data is read from the first
    ``<script type="application/ld+json">`` tag on the page, whose JSON
    payload must contain an ``itemListElement`` list where every entry has
    an ``item`` object with at least a ``name``.

    Args:
        url: Address of the chart page to download.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.get``.

    Returns:
        A pandas DataFrame with the columns ``Name``, ``Genre``,
        ``Description`` and ``Duration``. Optional fields that are missing
        from an entry are filled with the string ``'N/A'``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no usable JSON-LD script tag.
    """
    # NOTE(review): a browser-like User-Agent is sent, presumably because the
    # site rejects the default one used by requests -- confirm.
    response = requests.get(
        url,
        headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0"},
        timeout=timeout,
    )
    # Fail here on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    script_tag = soup.find('script', type='application/ld+json')
    if script_tag is None or not script_tag.string:
        # Without this guard the failure would be an opaque AttributeError
        # (or a TypeError inside json.loads) that hides the real cause.
        raise ValueError(f"No JSON-LD movie data found at {url}")

    json_data = json.loads(script_tag.string)

    rows = []
    for entry in json_data['itemListElement']:
        item = entry['item']
        rows.append([
            item['name'],
            item.get('genre', 'N/A'),  # optional fields fall back to 'N/A'
            item.get('description', 'N/A'),
            item.get('duration', 'N/A'),
        ])

    return pd.DataFrame(rows, columns=['Name', 'Genre', 'Description', 'Duration'])

# Address of the IMDb "top" chart page that gets scraped.
url = 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'

# Download the page and collect the movie records into a DataFrame.
movie_df = scrape_imdb_top_movies(url)
# Last expression of the cell: displays the first rows as a quick sanity check.
movie_df.head()

Unnamed: 0,Name,Genre,Description,Duration
0,The Shawshank Redemption,Drama,A banker convicted of uxoricide forms a friend...,PT2H22M
1,The Godfather,"Crime, Drama",The aging patriarch of an organized crime dyna...,PT2H55M
2,The Dark Knight,"Action, Crime, Drama",When a menace known as the Joker wreaks havoc ...,PT2H32M
3,The Godfather Part II,"Crime, Drama",The early life and career of Vito Corleone in ...,PT3H22M
4,12 Angry Men,"Crime, Drama",The jury in a New York City murder trial is fr...,PT1H36M


### Generate the CSV based on the scraped results

In [18]:
import os

# Name and target directory of the exported file.
filename = 'movies_scraped.csv'

outdir = '../data'
# Create the output directory if needed. exist_ok=True replaces the separate
# existence check (no race between check and creation), and makedirs also
# creates any missing intermediate directories.
os.makedirs(outdir, exist_ok=True)

fullname = os.path.join(outdir, filename)
# Write a pipe-delimited file; the 'utf-8-sig' codec prepends a UTF-8 BOM.
# The DataFrame index is written as the first (unnamed) column.
movie_df.to_csv(fullname, sep='|', encoding='utf-8-sig')