In [81]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

In [None]:
base_url = 'https://www.themoviedb.org'
base_movie_url = 'https://www.themoviedb.org/movie'
temp_url = '/top-rated?page='

# File name for saving data
output_file = 'movies_data.csv'

# Number of rows the resume logic assumes every saved listing page contributed.
MOVIES_PER_PAGE = 20


def load_existing_rows(csv_path):
    """Return the rows already saved in ``csv_path`` as a list of dictionaries.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file written by a previous run.

    Returns
    -------
    list of dict
        One dictionary per saved row, or an empty list when the file does not
        exist or is completely empty.
    """
    try:
        # keep_default_na=False: the scraper stores the literal string 'N/A'
        # for missing fields; pandas' default NA parsing would turn those
        # placeholders into NaN and they would be re-saved as blank cells.
        existing_data = pd.read_csv(csv_path, keep_default_na=False)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No previous run, or a zero-byte file: start from scratch.
        return []
    # Convert existing rows to a list of dictionaries
    return existing_data.to_dict('records')


all_page_data = load_existing_rows(output_file)

# Start from the last completed page + 1 (page 1 when nothing was saved yet)
start_page = len(all_page_data) // MOVIES_PER_PAGE + 1

# Seconds to wait for any single HTTP response. Without a timeout a stalled
# connection would block the whole scrape indefinitely.
REQUEST_TIMEOUT = 30

# Listing pages are scraped up to, but not including, this page number.
end_page = 2


def fetch_soup(url):
    """Download ``url`` and return the response body parsed with the lxml parser.

    Raises whatever ``requests.get`` raises (including a timeout after
    ``REQUEST_TIMEOUT`` seconds). The HTTP status code is not checked.
    """
    html = requests.get(url, timeout=REQUEST_TIMEOUT).text
    return BeautifulSoup(html, 'lxml')


def fact_value(facts_section, label):
    """Return the text that follows the ``<bdi>label</bdi>`` entry of the facts section.

    Returns 'N/A' when the section, the label or its enclosing ``<p>`` is missing.
    """
    if not facts_section:
        return 'N/A'
    label_bdi = facts_section.find('bdi', string=label)
    if not label_bdi:
        return 'N/A'
    fact_p = label_bdi.find_parent('p')
    if not fact_p:
        return 'N/A'
    # The paragraph text starts with the label itself; strip it off.
    return fact_p.get_text(strip=True).replace(label, '').strip()


def extract_genre_counts(script_text):
    """Parse the ``var genreData = [...]`` statement found in ``script_text``.

    Returns a list of ``(name, count)`` tuples, or ``None`` when no statement
    mentions ``var genreData``. Raises if the matched statement is not valid JSON
    or its items lack the ``name``/``count`` keys.
    """
    for statement in script_text.split(';'):
        if 'var genreData' in statement:
            # Everything after the first '=' is expected to be the JSON payload.
            json_str = statement.split('=', 1)[1].strip()
            genre_data = json.loads(json_str)
            # Convert to list of tuples -> (genre, count of the reviewed movies/series)
            return [(item['name'], item['count']) for item in genre_data]
    return None


# Build the frame up front so `df` exists even when every page was already
# scraped in a previous run and the loop below has nothing left to do.
df = pd.DataFrame(all_page_data)

for num in range(start_page, end_page):  # pages
    soup_data = fetch_soup(base_movie_url + temp_url + str(num))
    all_div = soup_data.find_all('div', class_='card style_1')

    page_data = []  # To store data for current page

    for items in all_div:
        inner_div = items.find('div', class_='content')
        inner_link = inner_div.find('a')['href']
        full_link = base_url + inner_link

        # Detail page of the movie; most fields below come from here.
        new_soup_data = fetch_soup(full_link)

        # Name and release date are read from the listing card itself.
        movie_name = inner_div.find('h2').text.strip()
        movie_date = inner_div.find('p').text.strip()

        rating_div = new_soup_data.find('div', 'user_score_chart')
        rating = rating_div.get('data-percent', 'N/A') if rating_div else 'N/A'

        # Genres: an absent genres span yields an empty list instead of crashing.
        genre_class = new_soup_data.find('span', class_='genres')
        genre = genre_class.find_all('a') if genre_class else []
        genre_list = [i.text for i in genre]  # Store as a list

        run_time_find = new_soup_data.find('span', class_='runtime')
        run_time = run_time_find.text.strip() if run_time_find else "N/A"

        # Extract certification
        certification_span = new_soup_data.find('span', class_='certification')
        certification = certification_span.get_text(strip=True) if certification_span else 'N/A'

        # Extract overview (falls back to 'N/A' like the other optional fields)
        ovr_view = new_soup_data.find('div', class_='overview')
        overview_p = ovr_view.find('p') if ovr_view else None
        overview = overview_p.text if overview_p else 'N/A'

        # Extract tagline
        tagline_h3 = new_soup_data.find('h3', class_='tagline')
        tagline = tagline_h3.get_text(strip=True) if tagline_h3 else 'N/A'

        # Extract language, budget, and revenue from the facts section
        facts_section = new_soup_data.find('section', class_='facts left_column')
        language = fact_value(facts_section, 'Original Language')
        budget = fact_value(facts_section, 'Budget')
        revenue = fact_value(facts_section, 'Revenue')

        # Extract Directors: profiles whose role text mentions 'Director'
        ol_profile = new_soup_data.find('ol', class_='people no_image')
        li_profile = ol_profile.find_all('li', class_='profile') if ol_profile else []
        director_names = []
        for profile in li_profile:
            role_p = profile.find('p', class_="character")
            name_a = profile.find('a')
            if role_p and name_a and 'Director' in role_p.text:
                director_names.append(name_a.text)
        # De-duplicate while keeping page order, so repeated runs give the same list.
        directors = list(dict.fromkeys(director_names))

        # Extract Cast
        cast_list = []
        cast_section = new_soup_data.find('ol', class_='people scroller')
        if cast_section:
            for card in cast_section.find_all('li', class_='card'):
                actor_p = card.find('p')
                actor_a = actor_p.find('a') if actor_p else None
                if actor_a:
                    cast_list.append(actor_a.get_text(strip=True))

        # Extract keywords, split by the CSS class carried by each link
        keyword_rounded = []
        keyword_bold = []
        keywords_section = new_soup_data.find('section', class_='keywords right_column')
        if keywords_section:
            ul_tag = keywords_section.find('ul')
            if ul_tag:
                for li in ul_tag.find_all('li'):
                    a_tag = li.find('a')
                    if a_tag:
                        a_classes = a_tag.get('class', [])
                        if 'rounded' in a_classes:
                            keyword_rounded.append(a_tag.get_text(strip=True))
                        elif '!border' in a_classes:
                            keyword_bold.append(a_tag.get_text(strip=True))

        # Extract reviews
        reviews_list = []

        # Find the "Read All Reviews" link inside div with class="inner_content"
        inner_content_div = new_soup_data.find('div', class_='inner_content')
        reviews_link = inner_content_div.find('p', class_='new_button') if inner_content_div else None
        reviews_anchor = reviews_link.find('a') if reviews_link else None
        if reviews_anchor and reviews_anchor.get('href'):
            base_reviews_url = base_url + reviews_anchor['href']
            page_num = 1

            # NOTE(review): termination relies on the site returning a page
            # without any review_container once page_num is past the last page.
            while True:
                # Construct the URL for the current page
                reviews_url = f"{base_reviews_url}?page={page_num}"
                reviews_soup = fetch_soup(reviews_url)

                # Find all review containers on the current page
                review_containers = reviews_soup.find_all('div', class_='review_container')
                if not review_containers:
                    # No more reviews, exit the loop
                    break

                for container in review_containers:
                    # Find all divs with class="content" inside the review container
                    for review_content in container.find_all('div', class_='content'):
                        # Extract writer
                        writer_h5 = review_content.find('h5')
                        writer_a = writer_h5.find('a') if writer_h5 else None
                        writer = writer_a.text if writer_a else 'N/A'

                        # Extract rating given by the reviewer
                        review_rating_div = review_content.find('div', class_='rating_border rating')
                        score = review_rating_div.text.strip() if review_rating_div else 'N/A'

                        # Extract review text. Reset for every review so a
                        # missing teaser can never reuse the previous review's
                        # text (or hit an undefined name on the first review).
                        review_text = 'N/A'
                        teaser_div = review_content.find('div', class_='teaser')
                        if teaser_div:
                            # Default to the teaser; upgraded below when the
                            # full review page is available.
                            review_text = teaser_div.get_text(strip=True)
                            # Find the "read the rest" link anywhere inside the teaser div
                            read_more_link = teaser_div.find('a', class_='underline')
                            if read_more_link:
                                # Navigate to the full review page
                                full_review_url = base_url + read_more_link['href']
                                full_review_soup = fetch_soup(full_review_url)
                                full_review_div = full_review_soup.find('div', class_='content column pad')
                                if full_review_div:
                                    # Get all text from all <p> tags
                                    all_paragraphs = full_review_div.find_all('p')
                                    review_text = ' '.join(p.get_text(strip=True) for p in all_paragraphs)

                        # Extract writer's most watched genres (best effort:
                        # any failure is reported and leaves the list empty)
                        most_watched_genres = []
                        if writer != 'N/A' and writer_a and writer_a.get('href'):
                            writer_link = base_url + writer_a['href']
                            try:
                                writer_soup = fetch_soup(writer_link)

                                # Find the script containing genreData
                                for script in writer_soup.find_all('script'):
                                    if 'var genreData' in script.text:
                                        parsed_genres = extract_genre_counts(script.text)
                                        if parsed_genres is not None:
                                            most_watched_genres = parsed_genres
                                            # Stop at the first script that yields data.
                                            break

                            except Exception as e:
                                print(f"Error getting genres for {writer}: {e}")
                                most_watched_genres = []

                        # Add review to the list
                        reviews_list.append({
                            'writer': writer,
                            'score': score,
                            'review': review_text,
                            'most_watched_genres': most_watched_genres
                        })

                # Increment the review page number for the next iteration
                page_num += 1

        # Extract content_score and content_score_description
        content_score_div = new_soup_data.find('div', class_='content_score')
        content_score_wrapper_div = new_soup_data.find('div', class_='content_score_wrapper')

        content_score_p = content_score_div.find('p') if content_score_div else None
        content_score = content_score_p.text.strip() if content_score_p else 'N/A'

        description_p = (
            content_score_wrapper_div.find('p', attrs={'dir': 'auto'})
            if content_score_wrapper_div else None
        )
        content_score_description = description_p.text.strip() if description_p else 'N/A'

        # Add all data to the dictionary
        movie_data = {
            'movie_name': movie_name,
            'release_date': movie_date,
            'rating': rating,
            'genre': genre_list,
            'run_time': run_time,
            'certification': certification,
            'overview': overview,
            'tagline': tagline,
            'director': directors,
            'language': language,
            'budget': budget,
            'revenue': revenue,
            'normal_keyword_(rounded)': keyword_rounded,
            'tone_keyword_(bold)': keyword_bold,
            'cast': cast_list,
            'reviews': reviews_list,
            'content_score': content_score,
            'content_score_description': content_score_description
        }

        page_data.append(movie_data)

    # Add current page data to all data
    all_page_data.extend(page_data)

    # Save after each page so an interrupted run can resume from the CSV
    df = pd.DataFrame(all_page_data)
    df.to_csv(output_file, index=False)
    print(f"Saved data up to page {num}")


  genre = genre_class.findAll('a')
  li_profile = ol_profile.findAll('li', class_='profile')


Saved data up to page 2


In [92]:
# Show the last five rows of the scraped frame.
df.tail(n=5)

Unnamed: 0,release_date,rating,genre,run_time,certification,overview,tagline,director,language,budget,revenue,normal_keyword_(rounded),tone_keyword_(bold),cast,reviews,content_score,content_score_description,movie_name
35,15 Nov 2018,84,"[Music, Documentary]",1h 25m,ALL,Burn the Stage: the Movie is the first movie f...,"From the deserts to the seas, we were always t...",[Park Jun-soo],Korean,-,-,"[k-pop, idol group]",[],"[RM, Jin, Suga, j-hope, Jimin, V, Jung Kook]",[],94,Almost there...,Burn the Stage: The Movie
36,18 Dec 2002,84,"[Adventure, Fantasy, Action]",2h 59m,12,Frodo Baggins and the other members of the Fel...,The fellowship is broken. The power of darknes...,[Peter Jackson],English,"$79,000,000.00","$926,287,400.00","[elves, dwarf, orcs, based on novel or book, o...",[complex],"[Elijah Wood, Ian McKellen, Viggo Mortensen, S...","[{'writer': 'NeoBrowser', 'score': '100%', 're...",100,Yes! Looking good!,The Lord of the Rings: The Two Towers
37,23 Sep 2005,84,"[Fantasy, Animation, Adventure]",1h 59m,U,"Sophie, a young milliner, is turned into an el...",The two lived there.,[Hayao Miyazaki],Japanese,"$24,000,000.00","$236,049,757.00","[flying, witch, rain, castle, scarecrow, body ...",[],"[Chieko Baisho, Takuya Kimura, Akihiro Miwa, T...","[{'writer': 'Andres Gomez', 'score': '90%', 'r...",100,Yes! Looking good!,Howl's Moving Castle
38,14 Dec 2018,84,"[Animation, Action, Adventure, Science Fiction]",1h 57m,PG,Struggling to find his place in the world whil...,More than one wears the mask.,"[Bob Persichetti, Peter Ramsey, Rodney Rothman]",English,"$90,000,000.00","$384,298,736.00","[superhero, based on comic, aftercreditsstinge...",[],"[Shameik Moore, Jake Johnson, Hailee Steinfeld...","[{'writer': 'trineo03', 'score': 'N/A', 'revie...",100,Yes! Looking good!,Spider-Man: Into the Spider-Verse
39,21 May 1980,84,"[Adventure, Action, Science Fiction]",2h 4m,U,"The epic saga continues as Luke Skywalker, in ...",The Star Wars saga continues.,[Irvin Kershner],English,"$18,000,000.00","$538,400,000.00","[rebel, android, asteroid, spacecraft, space b...","[nostalgic, excited]","[Mark Hamill, Harrison Ford, Carrie Fisher, Bi...","[{'writer': 'Ian Beale', 'score': '40%', 'revi...",100,Yes! Looking good!,The Empire Strikes Back


In [85]:
# Show the reviews stored for the row labelled 0.
df.loc[0, 'reviews']

[{'writer': 'elshaarawy',
  'score': '90%',
  'review': 'very good movie 9.5/10 محمد الشعراوى',
  'most_watched_genres': [('Drama', 1), ('Crime', 1), ('Other', None)]},
 {'writer': 'John Chard',
  'score': '100%',
  'review': 'Some birds aren\'t meant to be caged. The Shawshank Redemption is written and directed by Frank Darabont. It is an adaptation of the Stephen King novella Rita Hayworth and Shawshank Redemption. Starring Tim Robbins and Morgan Freeman, the film portrays the story of Andy Dufresne (Robbins), a banker who is sentenced to two life sentences at Shawshank State Prison for apparently murdering his wife and her lover. Andy finds it tough going but finds solace in the friendship he forms with fellow inmate Ellis "Red" Redding (Freeman). While things start to pick up when the warden finds Andy a prison job more befitting his talents as a banker. However, the arrival of another inmate is going to vastly change things for all of them. There was no fanfare or bunting put out 

# Some things!!