# Scraping all movies

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re

In [None]:
# URL pattern for the pages
url_pattern = 'https://www.metacritic.com/browse/movies/score/metascore/all/filtered?view=condensed&page={}'

# Set headers to mimic a web browser (identical for every page, so built once)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36'
}

# Initialize lists to store the extracted values; they are filled in lockstep
# so that index i of both lists always refers to the same movie
movie_names = []
movie_hrefs = []

# Iterate over the page numbers
for page_number in range(155):
    # Create the URL for the current page
    url = url_pattern.format(page_number)

    # Send a GET request to the URL. A timeout keeps a stalled connection from
    # hanging the crawl, and a network error only skips this page instead of
    # aborting the loop and losing everything collected so far.
    try:
        response = requests.get(url, headers=headers, allow_redirects=False, timeout=30)
    except requests.RequestException as exc:
        print(f"Request failed on page {page_number}: {exc}")
        time.sleep(5)
        continue

    # Check if the request was successful
    if response.status_code == 200:
        # Create a BeautifulSoup object from the response content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the div with the specified class
        div = soup.find('div', {'class': 'title_bump'})

        # Extract movie names and hrefs
        if div:
            for title in div.find_all('a', class_='title'):
                # The movie name lives in the <h3> inside the <a class="title">
                h3_element = title.find('h3')
                movie_href = title.get('href')
                # Skip anchors lacking either piece rather than crashing or
                # letting the two lists drift out of alignment
                if h3_element is None or movie_href is None:
                    continue
                movie_names.append(h3_element.get_text(strip=True))
                movie_hrefs.append(movie_href)
        else:
            print(f"Div not found on page {page_number}")
    else:
        print(f"Request failed on page {page_number}")

    # Add a delay of 5 seconds between requests
    time.sleep(5)

# Create a DataFrame from the extracted values
df_movies = pd.DataFrame({'Movie Name': movie_names, 'Movie Href': movie_hrefs})
df_movies.to_csv('movies_href.csv', index=False)

In [None]:
# Perform data transformation on the href column: prepend the site origin to
# each value. Values that already start with the origin are left untouched so
# that re-running this cell does not prefix the domain a second time.
df_movies['Movie Href'] = df_movies['Movie Href'].apply(
    lambda href: href
    if href.startswith('https://www.metacritic.com')
    else 'https://www.metacritic.com' + href
)

# Movie Metadata

In [None]:
def find_movie_title(soup):
    """Return the stripped text of the first <h1> in *soup*.

    Returns the string "null" when the lookup raises AttributeError
    (for example when ``soup.find('h1')`` yields None).
    """
    try:
        heading = soup.find('h1')
        title_text = heading.get_text(strip=True)
    except AttributeError:
        return "null"
    else:
        return title_text

def find_director(soup):
    """Return the director name from the div with class 'director'.

    The name is read from the ``text`` of the ``<span>`` inside the first
    ``<a>`` of that div. Returns the string "null" when any step of that
    chain raises AttributeError (e.g. the div, anchor or span is missing).
    """
    try:
        container = soup.find('div', class_='director')
        return container.a.span.text
    except AttributeError:
        return "null"

def find_genres(soup):
    """Return the set of distinct words found in the 'genres' div.

    The text of every <span> inside the div (except spans whose text is
    exactly ',') is joined with spaces, whitespace is collapsed, and the
    result is split into ``\\w+`` word tokens. The tokens are returned as a
    set, so duplicates collapse and ordering is not preserved.

    Returns the string "null" when the lookup raises AttributeError
    (e.g. the div is missing).
    """
    try:
        container = soup.find('div', class_='genres')
        pieces = []
        for span in container.find_all('span'):
            # Separator spans carry only a comma and are dropped
            if span.text != ',':
                pieces.append(span.text)
        flattened = re.sub(r'\s+', ' ', ' '.join(pieces).strip())
        return set(re.findall(r'\b\w+\b', flattened))
    except AttributeError:
        return "null"

def find_rating(soup):
    """Return the rating text from the div with class 'rating'.

    The div's stripped text is split on ':' and the second piece (the text
    between the first and second colon) is returned.

    Returns the string "null" when the div is missing (AttributeError) or
    when its text contains no ':' at all (IndexError), so that a page with
    an unexpected layout yields a placeholder instead of an exception.
    """
    try:
        rating_div = soup.find('div', class_='rating')
        rating = rating_div.get_text(strip=True).split(':')[1]
        return rating
    except (AttributeError, IndexError):
        return "null"

def find_runtime(soup):
    """Return the runtime text from the div with class 'runtime'.

    The div's stripped text is split on ':' and the second piece (the text
    between the first and second colon) is returned.

    Returns the string "null" when the div is missing (AttributeError) or
    when its text contains no ':' at all (IndexError).
    """
    try:
        runtime_div = soup.find('div', class_='runtime')
        runtime = runtime_div.get_text(strip=True).split(':')[1]
        return runtime
    except (AttributeError, IndexError):
        return "null"

def find_cast(soup):
    """Return the cast text from the 'summary_cast details_section' div.

    The div's stripped text is split on ':' and the second piece (the text
    between the first and second colon) is returned.

    Returns the string "null" when the div is missing (AttributeError) or
    when its text contains no ':' at all (IndexError).
    """
    try:
        cast_div = soup.find('div', class_='summary_cast details_section')
        cast_name = cast_div.get_text(strip=True).split(':')[1]
        return cast_name
    except (AttributeError, IndexError):
        return "null"

def find_distributor(soup):
    """Return the distributor name from the span with class 'distributor'.

    The name is the ``text`` of the first <a> inside that span. Returns the
    string "null" when the lookup raises AttributeError (e.g. the span or
    the anchor is missing).
    """
    try:
        holder = soup.find('span', class_='distributor')
        link = holder.find('a')
        return link.text
    except AttributeError:
        return "null"

def find_release_date(soup):
    """Return the release-date text from the span with class 'release_date'.

    The span's stripped text is split on ':' and the second piece (the text
    between the first and second colon) is returned.

    Returns the string "null" when the span is missing (AttributeError) or
    when its text contains no ':' at all (IndexError).
    """
    try:
        release_date_div = soup.find('span', class_='release_date')
        release_date = release_date_div.get_text(strip=True).split(':')[1]
        return release_date
    except (AttributeError, IndexError):
        return "null"

def find_summary(soup):
    """Return the summary blurb split on ':' as a list of strings.

    Reads the stripped text of the span with class 'blurb blurb_expanded'
    and returns every piece produced by splitting it on ':' (a list, not a
    single string). Returns the string "null" when the lookup raises
    AttributeError (e.g. the span is missing).
    """
    try:
        blurb = soup.find('span', class_='blurb blurb_expanded')
        blurb_text = blurb.get_text(strip=True)
        return blurb_text.split(':')
    except AttributeError:
        return "null"

def find_metascore(soup):
    """Return the stripped text of the first element with class 'score fl'.

    Returns the string "null" when no such element exists. ``find_all``
    returns an empty list in that case, so indexing it raises IndexError
    rather than AttributeError; both are treated as "not found".
    """
    try:
        metascore_div = soup.find_all(class_='score fl')[0]
        metascore = metascore_div.text.strip()
        return metascore
    except (AttributeError, IndexError):
        return "null"

def find_metascore_ratings(soup):
    """Return the (positive, mixed, negative) counts from the first charts.

    For each of the classes 'chart positive', 'chart mixed' and
    'chart negative', takes the first matching element and reads the text of
    its child with class 'count fr'.

    Returns ("null", "null", "null") when any chart or count is missing.
    A missing chart makes ``find_all`` return an empty list, so the failure
    surfaces as IndexError; a missing count surfaces as AttributeError.
    """
    try:
        counts = tuple(
            soup.find_all(class_=f'chart {bucket}')[0].find(class_='count fr').text
            for bucket in ('positive', 'mixed', 'negative')
        )
    except (AttributeError, IndexError):
        return "null", "null", "null"
    return counts

def find_userscore(soup):
    """Return the stripped text of the second element with class 'score fl'.

    Returns the string "null" when fewer than two such elements exist.
    Indexing the ``find_all`` result raises IndexError in that case, which
    is treated the same as AttributeError: the score is not available.
    """
    try:
        user_score_div = soup.find_all(class_='score fl')[1]
        user_score = user_score_div.text.strip()
        return user_score
    except (AttributeError, IndexError):
        return "null"

def find_userscore_ratings(soup):
    """Return the (positive, mixed, negative) counts from the second charts.

    For each of the classes 'chart positive', 'chart mixed' and
    'chart negative', takes the second matching element and reads the text
    of its child with class 'count fr'.

    Returns ("null", "null", "null") when any chart or count is missing.
    Fewer than two matching charts surfaces as IndexError; a missing count
    surfaces as AttributeError.
    """
    try:
        counts = tuple(
            soup.find_all(class_=f'chart {bucket}')[1].find(class_='count fr').text
            for bucket in ('positive', 'mixed', 'negative')
        )
    except (AttributeError, IndexError):
        return "null", "null", "null"
    return counts


In [None]:
# Define the function to scrape data from a single URL
def scrape_movie_data(url):
    """Fetch one movie page and return its extracted fields.

    Args:
        url: Address of the movie page to download.

    Returns:
        A dict mapping each column name to a one-element list holding the
        value extracted by the corresponding ``find_*`` helper, or None when
        the page could not be fetched or parsed.
    """
    user_agent = {'User-agent': 'Mozilla/5.0'}
    try:
        # A timeout keeps one stalled connection from hanging the whole run
        response = requests.get(url, headers=user_agent, timeout=30)
        # Treat HTTP error statuses as failures instead of parsing error pages
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Call the functions to extract the data
        title = find_movie_title(soup)
        director = find_director(soup)
        genres = find_genres(soup)
        rating = find_rating(soup)
        runtime = find_runtime(soup)
        cast = find_cast(soup)
        distributor = find_distributor(soup)
        release_date = find_release_date(soup)
        summary = find_summary(soup)
        metascore = find_metascore(soup)
        metascore_positive, metascore_mixed, metascore_negative = find_metascore_ratings(soup)
        userscore = find_userscore(soup)
        userscore_positive, userscore_mixed, userscore_negative = find_userscore_ratings(soup)

        # Create a dictionary with the extracted data
        return {
            'Title': [title],
            'Director': [director],
            'Genres': [genres],
            'Rating': [rating],
            'Runtime': [runtime],
            'Cast': [cast],
            'Distributor': [distributor],
            'Release Date': [release_date],
            'Summary': [summary],
            'Metascore': [metascore],
            'Metascore Positive': [metascore_positive],
            'Metascore Mixed': [metascore_mixed],
            'Metascore Negative': [metascore_negative],
            'Userscore': [userscore],
            'Userscore Positive': [userscore_positive],
            'Userscore Mixed': [userscore_mixed],
            'Userscore Negative': [userscore_negative],
        }
    except Exception as exc:
        # Best-effort scrape: any failure (bad or missing URL, network error,
        # unexpected page layout) skips this movie. Unlike a bare ``except:``,
        # this lets KeyboardInterrupt/SystemExit stop a long crawl.
        print(f"Skipping {url}: {exc}")
        return None

# Create an empty list to store the data from each URL
all_data = []

# Iterate over the URLs in the 'Movie Href' column
for url in df_movies['Movie Href']:
    # Call the function to scrape data from the URL
    data = scrape_movie_data(url)

    # None signals a failed scrape; those movies are simply left out
    if data is not None:
        all_data.append(data)

    # Add a wait time of 3 seconds before the next request
    time.sleep(3)

# Create a DataFrame from the obtained data, one frame per scraped movie
frames = [pd.DataFrame(d) for d in all_data]
# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when nothing could be scraped
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.to_csv('movies_metadata_metacritic.csv', index=False)