In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

In [None]:
base_url = 'https://www.themoviedb.org'
base_movie_url = 'https://www.themoviedb.org/movie'
temp_url = '/top-rated?page='
all_page_data = []

output_file = 'from_293_till_350.csv'


try:
    existing_data = pd.read_csv(output_file)
    all_page_data = existing_data.to_dict('records')
except FileNotFoundError:
    all_page_data = []

# Start from the last completed page + 1
start_page = len(all_page_data) // 20 + 293 if all_page_data else 293

for num in range(start_page, 351): 
    resp1 = requests.get(base_movie_url + temp_url + str(num)).text
    soup_data = BeautifulSoup(resp1, 'lxml')
    all_div = soup_data.find_all('div', class_='card style_1')

    page_data = [] 

    for items in all_div:
        inner_div = items.find('div', class_='content')
        inner_link = inner_div.find('a')['href']
        full_link = base_url + inner_link

        inner_data_req = requests.get(full_link).text
        new_soup_data = BeautifulSoup(inner_data_req, 'lxml')

        movie_name = inner_div.find('h2').text.strip() if inner_div.find('h2') else 'N/A'
        movie_date = inner_div.find('p').text.strip() if inner_div.find('p') else 'N/A'

        rating_div = new_soup_data.find('div', 'user_score_chart')
        if rating_div:
            rating = rating_div["data-percent"]
        else:
            rating = 'N/A'

        genre_list = []
        genre_span = new_soup_data.find('span', class_='genres')
        if genre_span:
            genre_links = genre_span.find_all('a')
            genre_list = [link.text.strip() for link in genre_links if link.text.strip()]

        run_time_find = new_soup_data.find('span', class_='runtime')
        if run_time_find:
            run_time = run_time_find.text.strip()
        else:
            run_time = "N/A"

        # Extract certification
        certification_span = new_soup_data.find('span', class_='certification')
        certification = certification_span.get_text(strip=True) if certification_span else 'N/A'

        # Extract overview
        ovr_view = new_soup_data.find('div', class_='overview')
        if ovr_view and ovr_view.find('p'): 
            overview = ovr_view.find('p').text.strip()
        else:
            overview = 'N/A'

        # Extract tagline
        tagline_h3 = new_soup_data.find('h3', class_='tagline')
        tagline = tagline_h3.get_text(strip=True) if tagline_h3 else 'N/A'


        facts_section = new_soup_data.find('section', class_='facts left_column')

        language = 'N/A'
        budget = 'N/A'
        revenue = 'N/A'

        # Extract language, budget, and revenue
        if facts_section:
            # Extract Original Language
            language_bdi = facts_section.find('bdi', string='Original Language')
            if language_bdi:
                language_p = language_bdi.find_parent('p')
                if language_p:
                    language = language_p.get_text(strip=True).replace('Original Language', '').strip()

            # Extract Budget
            budget_bdi = facts_section.find('bdi', string='Budget')
            if budget_bdi:
                budget_p = budget_bdi.find_parent('p')
                if budget_p:
                    budget = budget_p.get_text(strip=True).replace('Budget', '').strip()

            # Extract Revenue
            revenue_bdi = facts_section.find('bdi', string='Revenue')
            if revenue_bdi:
                revenue_p = revenue_bdi.find_parent('p')
                if revenue_p:
                    revenue = revenue_p.get_text(strip=True).replace('Revenue', '').strip()


        # Extract Directors
        ol_profile = new_soup_data.find('ol', class_='people no_image')
        if ol_profile:
            li_profile = ol_profile.find_all('li', class_='profile')
            director_set = set()
            for li in li_profile:
                character_p = li.find('p', class_="character")
                if character_p and 'Director' in character_p.text:
                    director_a = li.find('a')
                    if director_a:
                        director_set.add(director_a.text.strip())
            
            directors = list(director_set)
       

        # Extract Cast
        cast_list = []
        cast_section = new_soup_data.find('ol', class_='people scroller')
        if cast_section:
            cast_cards = cast_section.find_all('li', class_='card')
            for card in cast_cards:
                actor_p = card.find('p')
                if actor_p:
                    actor_a = actor_p.find('a')
                    if actor_a:
                        cast_list.append(actor_a.get_text(strip=True))

        # Extract keywords
        keyword_rounded = []
        keyword_bold = []
        keywords_section = new_soup_data.find('section', class_='keywords right_column')
        if keywords_section:
            ul_tag = keywords_section.find('ul')
            if ul_tag:
                for li in ul_tag.find_all('li'):
                    a_tag = li.find('a')
                    if a_tag:
                        if 'rounded' in a_tag.get('class', []):
                            keyword_rounded.append(a_tag.get_text(strip=True))
                        elif '!border' in a_tag.get('class', []):
                            keyword_bold.append(a_tag.get_text(strip=True))



        # Extract reviews
        reviews_list = []
        inner_content_div = new_soup_data.find('div', class_='inner_content')
        if inner_content_div:
            reviews_link = inner_content_div.find('p', class_='new_button')
            if reviews_link:
                base_reviews_url = base_url + reviews_link.find('a')['href']
                page_num = 1

                while True:
                    reviews_url = f"{base_reviews_url}?page={page_num}"
                    reviews_page = requests.get(reviews_url).text
                    reviews_soup = BeautifulSoup(reviews_page, 'lxml')

                    review_containers = reviews_soup.find_all('div', class_='review_container')
                    if not review_containers:
                        break

                    for container in review_containers:
                        review_contents = container.find_all('div', class_='content')
                        for review_content in review_contents:

                            # Extract writer
                            writer_h5 = review_content.find('h5')
                            writer = writer_h5.find('a').text if writer_h5 and writer_h5.find('a') else 'N/A'

                            # Extract rating
                            rating_div = review_content.find('div', class_='rating_border rating')
                            score = rating_div.text.strip() if rating_div else 'N/A'

                            # Extract review text
                            teaser_div = review_content.find('div', class_='teaser')
                            if teaser_div:
                                read_more_link = teaser_div.find('a', class_='underline')
                                if read_more_link:
                                    full_review_url = base_url + read_more_link['href']
                                    full_review_page = requests.get(full_review_url).text
                                    full_review_soup = BeautifulSoup(full_review_page, 'lxml')
                                    full_review_div = full_review_soup.find('div', class_='content column pad')
                                    if full_review_div:
                                        all_paragraphs = full_review_div.find_all('p')
                                        review_text = ' '.join(p.get_text(strip=True) for p in all_paragraphs)
                                    else:
                                        review_text = teaser_div.get_text(strip=True)
                                else:
                                    review_text = teaser_div.get_text(strip=True)

                            # Extract writer's most watched genres
                            most_watched_genres = []
                            if writer != 'N/A' and writer_h5.find('a'):
                                writer_link = base_url + writer_h5.find('a')['href']
                                try:
                                    writer_page = requests.get(writer_link).text
                                    writer_soup = BeautifulSoup(writer_page, 'lxml')
                                    
                                    for script in writer_soup.find_all('script'):
                                        if 'var genreData' in script.text:
                                            for line in script.text.split(';'):
                                                if 'var genreData' in line:
                                                    json_str = line.split('=', 1)[1].strip()
                                                    genre_data = json.loads(json_str)
                                                    # Convert to list of tuples -> (genre, count of the reviewed movies/series)
                                                    most_watched_genres = [(item['name'], item['count']) for item in genre_data]
                                                    break
                                    
                                except Exception as e:
                                    print(f"Error getting genres for {writer}: {e}")
                                    most_watched_genres = []

                            reviews_list.append({
                                'writer': writer,
                                'score': score,
                                'review': review_text,
                                'most_watched_genres': most_watched_genres
                            })

                    page_num += 1


        # Extract content_score and content_score_description
        content_score_div = new_soup_data.find('div', class_='content_score')
        content_score_wrapper_div = new_soup_data.find('div', class_='content_score_wrapper')

        if content_score_div:
            content_score = content_score_div.find('p').text.strip() if content_score_div.find('p') else 'N/A'
        else:
            content_score = 'N/A'

        if content_score_wrapper_div:
            content_score_description = content_score_wrapper_div.find('p', attrs={'dir': 'auto'}).text.strip() if content_score_wrapper_div.find('p', attrs={'dir': 'auto'}) else 'N/A'
        else:
            content_score_description = 'N/A'


        movie_data = {
            'movie_name': movie_name,
            'release_date': movie_date,
            'rating': rating,
            'genre': genre_list, 
            'run_time': run_time,
            'certification': certification,
            'overview': overview,
            'tagline': tagline,
            'director': directors,  
            'language': language,
            'budget': budget,
            'revenue': revenue,
            'normal_keyword_(rounded)': keyword_rounded,
            'tone_keyword_(bold)': keyword_bold,  
            'cast': cast_list,
            'reviews':  reviews_list,
            'content_score': content_score,
            'content_score_description': content_score_description 
        }

        page_data.append(movie_data)

    # Add current page data to all data
    all_page_data.extend(page_data)

    # Save after each page
    df = pd.DataFrame(all_page_data)
    df.to_csv(output_file, index=False)
    print(f"Saved data up to page {num}")        

KeyboardInterrupt: 

In [199]:
df.tails()

AttributeError: 'DataFrame' object has no attribute 'tails'

In [200]:
df['reviews'][0]

'[{\'writer\': \'John Chard\', \'score\': \'70%\', \'review\': "Spot the cameo in 5 Oscar winning epic. Based on the wonderful writing from Jules Verne, Around the World in Eighty Days is just shy of three hours, this was my first ever visit to the film and my reaction is mainly positive, though tempered with a small sense of unfulfillment. Phileas Fogg takes a wager from his fellow London club members that he can\'t circle the globe in 80 days, this it should be noted is 1872 where transport was not of the fast and dynamic variety. Fogg and his trusty servant Passepartout, set off on a journey that brings many adventures, and many humorous scrapes. They meet a wonderful array of characters and travel on many forms of transport, it is in short a magical journey. The production here from Mike Todd is gargantuan, the sets are incredible, the multiple locations befit the multi cast of actors that grace the film (have fun playing spot the star in this one). The costumes and the score are s

In [201]:
df.columns

Index(['movie_name', 'release_date', 'rating', 'genre', 'run_time',
       'certification', 'overview', 'tagline', 'director', 'language',
       'budget', 'revenue', 'normal_keyword_(rounded)', 'tone_keyword_(bold)',
       'cast', 'reviews', 'content_score', 'content_score_description'],
      dtype='object')

In [None]:
data = pd.read_csv('from_293_till_350.csv')

In [8]:
data.tail()

Unnamed: 0,movie_name,release_date,rating,genre,run_time,certification,overview,tagline,director,language,budget,revenue,normal_keyword_(rounded),tone_keyword_(bold),cast,reviews,content_score,content_score_description
675,Elysium,20 Aug 2013,65.0,"['Science Fiction', 'Action', 'Drama', 'Thrill...",1h 49m,15,"In the year 2159, two classes of people exist:...",He can save us all.,['Neill Blomkamp'],English,"$115,000,000.00","$286,140,700.00","['capitalism', 'future', 'dystopia', 'space st...",[],"['Matt Damon', 'Jodie Foster', 'Sharlto Copley...","[{'writer': 'CGGB', 'score': 'N/A', 'review': ...",100.0,Yes! Looking good!
676,Live and Let Die,06 Jul 1973,65.0,"['Adventure', 'Action', 'Thriller']",2h 1m,PG,James Bond must investigate a mysterious murde...,Bond is back. Back in action. Back with excite...,['Guy Hamilton'],English,"$7,000,000.00","$126,400,000.00","['london, england', 'new york city', 'bomb', '...",[],"['Roger Moore', 'Yaphet Kotto', 'Jane Seymour'...","[{'writer': 'Wuchak', 'score': '70%', 'review'...",100.0,Yes! Looking good!
677,Sputnik,26 Oct 2020,65.0,"['Science Fiction', 'Drama', 'Horror']",1h 53m,15,"At the height of the Cold War, a Soviet spacec...",The only survivor did not come back alone.,['Egor Abramenko'],Russian,"$2,600,000.00","$354,023.00","['spacecraft', 'alien life-form', 'murder', 'a...",[],"['Oksana Akinshina', 'Fyodor Bondarchuk', 'Pyo...","[{'writer': 'MovieGuys', 'score': 'N/A', 'revi...",100.0,Yes! Looking good!
678,Time Lapse,24 Nov 2014,65.0,"['Thriller', 'Science Fiction', 'Mystery', 'Ho...",1h 44m,12A,Three friends discover a mysterious machine th...,"Once you see the future, you can't look away.",['Bradley King'],English,-,"$19,572.00","['love triangle', 'gambling', 'journey in the ...",[],"['Danielle Panabaker', ""Matt O'Leary"", 'George...","[{'writer': 'Filipe Manuel Neto', 'score': '70...",100.0,Yes! Looking good!
679,The Invisible Guardian,03 Mar 2017,65.0,"['Thriller', 'Crime', 'Mystery']",2h 9m,16,When the naked body of a teenage girl is found...,,['Fernando González Molina'],Spanish; Castilian,"$5,000,000.00",-,"['fbi', 'murder']",[],"['Marta Etura', 'Elvira Mínguez', 'Francesc Or...",[],100.0,Yes! Looking good!


In [189]:
len(data)

4385