In [81]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

In [None]:
base_url = 'https://www.themoviedb.org'
base_movie_url = 'https://www.themoviedb.org/movie'
temp_url = '/top-rated?page='
all_page_data = []

# File name for saving data
output_file = 'movies_data.csv'

# Check if the file exists
try:
    # If file exists, load existing data
    existing_data = pd.read_csv(output_file)
    all_page_data = existing_data.to_dict('records')  # Convert existing rows to a list of dictionaries
except FileNotFoundError:
    all_page_data = []

# Start from the last completed page + 1
start_page = len(all_page_data) // 20 + 1 if all_page_data else 1

for num in range(start_page, 501):  # pages
    resp1 = requests.get(base_movie_url + temp_url + str(num)).text
    soup_data = BeautifulSoup(resp1, 'lxml')
    all_div = soup_data.find_all('div', class_='card style_1')

    page_data = []  # To store data for current page

    for items in all_div:
        inner_div = items.find('div', class_='content')
        inner_link = inner_div.find('a')['href']
        full_link = base_url + inner_link

        inner_data_req = requests.get(full_link).text
        new_soup_data = BeautifulSoup(inner_data_req, 'lxml')

        movie_name = inner_div.find('h2').text.strip()
        movie_date = inner_div.find('p').text.strip()

        rating_div = new_soup_data.find('div', 'user_score_chart')
        if rating_div:
            rating = rating_div["data-percent"]
        else:
            rating = 'N/A'

        genre_list = []  # Default empty list if no genres found
        genre_span = new_soup_data.find('span', class_='genres')
        if genre_span:
            genre_links = genre_span.find_all('a')
            genre_list = [link.text.strip() for link in genre_links if link.text.strip()]

        run_time_find = new_soup_data.find('span', class_='runtime')
        if run_time_find:
            run_time = run_time_find.text.strip()
        else:
            run_time = "N/A"

        # Extract certification
        certification_span = new_soup_data.find('span', class_='certification')
        certification = certification_span.get_text(strip=True) if certification_span else 'N/A'

        # Extract overview
        ovr_view = new_soup_data.find('div', class_='overview')
        overview = ovr_view.find('p').text

        # Extract tagline
        tagline_h3 = new_soup_data.find('h3', class_='tagline')
        tagline = tagline_h3.get_text(strip=True) if tagline_h3 else 'N/A'

        # Locate the facts section directly
        facts_section = new_soup_data.find('section', class_='facts left_column')

        # Initialize variables for language, budget, and revenue
        language = 'N/A'
        budget = 'N/A'
        revenue = 'N/A'

        # Extract language, budget, and revenue
        if facts_section:
            # Extract Original Language
            language_bdi = facts_section.find('bdi', string='Original Language')
            if language_bdi:
                language_p = language_bdi.find_parent('p')
                if language_p:
                    language = language_p.get_text(strip=True).replace('Original Language', '').strip()

            # Extract Budget
            budget_bdi = facts_section.find('bdi', string='Budget')
            if budget_bdi:
                budget_p = budget_bdi.find_parent('p')
                if budget_p:
                    budget = budget_p.get_text(strip=True).replace('Budget', '').strip()

            # Extract Revenue
            revenue_bdi = facts_section.find('bdi', string='Revenue')
            if revenue_bdi:
                revenue_p = revenue_bdi.find_parent('p')
                if revenue_p:
                    revenue = revenue_p.get_text(strip=True).replace('Revenue', '').strip()

        # Extract Directors
        ol_profile = new_soup_data.find('ol', class_='people no_image')
        li_profile = ol_profile.find_all('li', class_='profile')
        director = set()
        for i in range(0, len(li_profile)):
            if 'Director' in li_profile[i].find('p', class_="character").text:
                direct = li_profile[i].find('a').text
                director.add(direct)
        directors = list(director)  # Convert set to list

        # Extract Cast
        cast_list = []
        cast_section = new_soup_data.find('ol', class_='people scroller')
        if cast_section:
            cast_cards = cast_section.find_all('li', class_='card')
            for card in cast_cards:
                actor_p = card.find('p')
                if actor_p:
                    actor_a = actor_p.find('a')
                    if actor_a:
                        cast_list.append(actor_a.get_text(strip=True))

        # Extract keywords
        keyword_rounded = []
        keyword_bold = []
        keywords_section = new_soup_data.find('section', class_='keywords right_column')
        if keywords_section:
            ul_tag = keywords_section.find('ul')
            if ul_tag:
                for li in ul_tag.find_all('li'):
                    a_tag = li.find('a')
                    if a_tag:
                        if 'rounded' in a_tag.get('class', []):
                            keyword_rounded.append(a_tag.get_text(strip=True))
                        elif '!border' in a_tag.get('class', []):
                            keyword_bold.append(a_tag.get_text(strip=True))



        # Extract reviews
        reviews_list = []

        # Find the "Read All Reviews" link inside div with class="inner_content"
        inner_content_div = new_soup_data.find('div', class_='inner_content')
        if inner_content_div:
            reviews_link = inner_content_div.find('p', class_='new_button')
            if reviews_link:
                base_reviews_url = base_url + reviews_link.find('a')['href']
                page_num = 1

                while True:
                    # Construct the URL for the current page
                    reviews_url = f"{base_reviews_url}?page={page_num}"
                    reviews_page = requests.get(reviews_url).text
                    reviews_soup = BeautifulSoup(reviews_page, 'lxml')

                    # Find all review containers on the current page
                    review_containers = reviews_soup.find_all('div', class_='review_container')
                    if not review_containers:
                        # No more reviews, exit the loop
                        break

                    for container in review_containers:
                        # Find all divs with class="content" inside the review container
                        review_contents = container.find_all('div', class_='content')
                        for review_content in review_contents:
                            # Extract writer
                            writer_h5 = review_content.find('h5')
                            writer = writer_h5.find('a').text if writer_h5 and writer_h5.find('a') else 'N/A'

                            # Extract rating
                            rating_div = review_content.find('div', class_='rating_border rating')
                            score = rating_div.text.strip() if rating_div else 'N/A'

                            # Extract teaser
                            teaser_div = review_content.find('div', class_='teaser')
                            if teaser_div:
                                # Find the "read the rest" link anywhere inside the teaser div
                                read_more_link = teaser_div.find('a', class_='underline')
                                if read_more_link:
                                    # Navigate to the full review page
                                    full_review_url = base_url + read_more_link['href']
                                    full_review_page = requests.get(full_review_url).text
                                    full_review_soup = BeautifulSoup(full_review_page, 'lxml')
                                    full_review_div = full_review_soup.find('div', class_='content column pad')
                                    if full_review_div:
                                        # Get all text from all <p> tags
                                        all_paragraphs = full_review_div.find_all('p')
                                        review_text = ' '.join(p.get_text(strip=True) for p in all_paragraphs)
                                    else:
                                        review_text = teaser_div.get_text(strip=True)
                                else:
                                    # If no "read the rest" link, get all text inside the teaser div
                                    review_text = teaser_div.get_text(strip=True)

                            # Extract writer's most watched genres
                            most_watched_genres = []
                            if writer != 'N/A' and writer_h5.find('a'):
                                writer_link = base_url + writer_h5.find('a')['href']
                                try:
                                    writer_page = requests.get(writer_link).text
                                    writer_soup = BeautifulSoup(writer_page, 'lxml')
                                    
                                    # Find the script containing genreData
                                    for script in writer_soup.find_all('script'):
                                        if 'var genreData' in script.text:
                                            # Get the line containing genreData
                                            for line in script.text.split(';'):
                                                if 'var genreData' in line:
                                                    # Extract the JSON part
                                                    json_str = line.split('=', 1)[1].strip()
                                                    genre_data = json.loads(json_str)
                                                    # Convert to list of tuples -> (genre, count of the reviewed movies/series)
                                                    most_watched_genres = [(item['name'], item['count']) for item in genre_data]
                                                    break
                                    
                                except Exception as e:
                                    print(f"Error getting genres for {writer}: {e}")
                                    most_watched_genres = []

                            # Add review to the list
                            reviews_list.append({
                                'writer': writer,
                                'score': score,
                                'review': review_text,
                                'most_watched_genres': most_watched_genres
                            })

                    # Increment the review page number for the next iteration
                    page_num += 1


        # Extract content_score and content_score_description
        content_score_div = new_soup_data.find('div', class_='content_score')
        content_score_wrapper_div = new_soup_data.find('div', class_='content_score_wrapper')

        if content_score_div:
            content_score = content_score_div.find('p').text.strip() if content_score_div.find('p') else 'N/A'
        else:
            content_score = 'N/A'

        if content_score_wrapper_div:
            content_score_description = content_score_wrapper_div.find('p', attrs={'dir': 'auto'}).text.strip() if content_score_wrapper_div.find('p', attrs={'dir': 'auto'}) else 'N/A'
        else:
            content_score_description = 'N/A'


        # Add all data to the dictionary
        movie_data = {
            'movie_name': movie_name,
            'release_date': movie_date,
            'rating': rating,
            'genre': genre_list, 
            'run_time': run_time,
            'certification': certification,
            'overview': overview,
            'tagline': tagline,
            'director': directors,  
            'language': language,
            'budget': budget,
            'revenue': revenue,
            'normal_keyword_(rounded)': keyword_rounded,
            'tone_keyword_(bold)': keyword_bold,  
            'cast': cast_list,
            'reviews':  reviews_list,
            'content_score': content_score,
            'content_score_description': content_score_description 
        }

        page_data.append(movie_data)

    # Add current page data to all data
    all_page_data.extend(page_data)

    # Save after each page
    df = pd.DataFrame(all_page_data)
    df.to_csv(output_file, index=False)
    print(f"Saved data up to page {num}")        


Saved data up to page 41
Saved data up to page 42
Saved data up to page 43
Saved data up to page 44
Saved data up to page 45
Saved data up to page 46
Saved data up to page 47
Saved data up to page 48
Saved data up to page 49
Saved data up to page 50
Saved data up to page 51
Saved data up to page 52
Saved data up to page 53
Saved data up to page 54
Saved data up to page 55
Saved data up to page 56
Saved data up to page 57
Saved data up to page 58
Saved data up to page 59
Saved data up to page 60
Saved data up to page 61
Saved data up to page 62
Saved data up to page 63
Saved data up to page 64
Saved data up to page 65
Saved data up to page 66
Saved data up to page 67
Saved data up to page 68
Saved data up to page 69
Saved data up to page 70
Saved data up to page 71
Saved data up to page 72
Saved data up to page 73
Saved data up to page 74
Saved data up to page 75
Saved data up to page 76
Saved data up to page 77
Saved data up to page 78
Saved data up to page 79
Saved data up to page 80


AttributeError: 'NoneType' object has no attribute 'find_all'

In [127]:
df.tail()

Unnamed: 0,movie_name,release_date,rating,genre,run_time,certification,overview,tagline,director,language,budget,revenue,normal_keyword_(rounded),tone_keyword_(bold),cast,reviews,content_score,content_score_description
1635,Contact,26 Sep 1997,74,"[Drama, Science Fiction, Mystery]",2h 30m,PG,A radio astronomer receives the first extrater...,Take a journey to the heart of the universe.,[Robert Zemeckis],English,"$90,000,000.00","$171,120,329.00","[based on novel or book, nasa, new mexico, ext...","[philosophical, wonder, introspective, inspira...","[Jodie Foster, Matthew McConaughey, James Wood...","[{'writer': 'talisencrw', 'score': '90%', 'rev...",100,Yes! Looking good!
1636,"Crouching Tiger, Hidden Dragon",05 Jan 2001,74,"[Adventure, Drama, Action, Romance]",2h,12,Two warriors in pursuit of a stolen sword and ...,"A timeless story of strength, secrets and two ...",[Ang Lee],Chinese,"$17,000,000.00","$213,978,518.00","[flying, martial arts, kung fu, based on novel...","[aggressive, vibrant]","[Chow Yun-Fat, Michelle Yeoh, Zhang Ziyi, Chan...","[{'writer': 'talisencrw', 'score': '100%', 're...",100,Yes! Looking good!
1637,The Swimmers,09 Oct 2022,74,"[Drama, History]",2h 14m,PG-13,"From war-torn Syria to the 2016 Rio Olympics, ...",,[Sally El Hosaini],English,-,-,"[refugee, olympian sports team, biography, bas...","[admiring, comforting, forceful]","[Manal Issa, Nathalie Issa, Matthias Schweighö...","[{'writer': 'CinemaSerf', 'score': '70%', 'rev...",100,Yes! Looking good!
1638,18 Presents,02 Jan 2020,74,"[Drama, Family]",1h 50m,T,Elisa is only forty when an incurable disease ...,,[Francesco Amato],Italian,"$2,500,000.00","$3,486,375.00",[],[],"[Vittoria Puccini, Benedetta Porcaroli, Edoard...",[],100,Yes! Looking good!
1639,Predestination,20 Feb 2015,74,"[Science Fiction, Thriller]",1h 38m,15,Predestination chronicles the life of a Tempor...,To save the future he must reshape the past.,"[Peter Spierig, Michael Spierig]",English,"$5,500,000.00","$4,942,449.00","[bomber, secret organization, bartender, pregn...","[shocking, philosophical]","[Ethan Hawke, Sarah Snook, Noah Taylor, Christ...","[{'writer': 'Frank Ochieng', 'score': 'N/A', '...",100,Yes! Looking good!


In [128]:
df['reviews'][1630]

[{'writer': 'Manuel São Bento',
  'score': '80%',
  'review': 'FULL SPOILER-FREE REVIEW @ https://www.msbreviews.com/movie-reviews/the-last-duel-spoiler-free-review "The Last Duel became one of my favorite Ridley Scott films, boasting a commanding Jodie Comer who delivers one of the year\'s most emotionally powerful performances. Adam Driver, Matt Damon, and Ben Affleck all offer remarkable interpretations, but the actress fully embodies Marguerite de Carrouges\' courage amid so much pain and suffering in a theme-heavy, brutally shocking true story. The perspective-based narrative structure is interesting and efficient enough to overcome its inevitable repeatability issues. Holding technical attributes that will surely get recognition in the awards season - especially Harry Gregson-Williams’ score - the actual duel is one of the most nerve-wracking sequences of the last few years, compensating the audience\'s patience with a satisfying climax. Watch it on the big screen, if possible." 

# Some things!!