In [None]:
import re
import requests
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService

# Launch a Chrome session; webdriver_manager installs a chromedriver binary
# and hands its path to the Selenium service.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

imdb_url = 'https://www.imdb.com/chart/top/'

try:
    # An implicit wait only applies to element lookups, so it has to be set
    # *before* the lookup it is meant to cover. Asking for one chart row makes
    # Selenium poll (up to 10 s) until the list has been rendered, so the
    # page source captured below actually contains the chart.
    driver.implicitly_wait(10)
    driver.get(imdb_url)
    driver.find_element(By.CSS_SELECTOR, 'li.ipc-metadata-list-summary-item')

    imdb_page_source = driver.page_source
finally:
    # Always release the browser process, even when navigation or the lookup fails.
    driver.quit()

imdb_soup = BeautifulSoup(imdb_page_source, 'html.parser')

# NOTE(review): both class strings are matched exactly and contain what look
# like auto-generated tokens (sc-..., iyTDQy, DLYcv); they presumably change
# whenever IMDb redeploys its front end -- confirm and refresh when this fails.
imdb_container = imdb_soup.find('ul', {'class': 'ipc-metadata-list ipc-metadata-list--dividers-between sc-a1e81754-0 iyTDQy compact-list-view ipc-metadata-list--base'})
if imdb_container is None:
    # Fail with an actionable message instead of "'NoneType' has no attribute 'find_all'".
    raise RuntimeError('IMDb chart container not found; the page markup (class names) has probably changed')
imdb_items = imdb_container.find_all('li', {'class': 'ipc-metadata-list-summary-item sc-4929eaf6-0 DLYcv cli-parent'})

In [None]:
# Headings are expected to look like "<rank>. <title>" (e.g. "1. Some Title").
# Only a leading numeric rank is stripped, so a heading without a rank is kept
# as-is instead of raising IndexError, and a title that itself contains ". "
# is never cut in the middle.
rank_prefix = re.compile(r'^\d+\.\s+')

movie_titles = []

for imdb_item in imdb_items:
    movie_title = imdb_item.find('h3', {'class': 'ipc-title__text'}).text.strip()
    movie_titles.append(movie_title)

cleaned_movie_titles = [rank_prefix.sub('', movie_title, count=1) for movie_title in movie_titles]

In [None]:
# The OMDb API key is read from a local text file so it never appears in the notebook.
with open('omdb_apikey.txt', 'r') as file:
    omdb_apikey = file.read().strip()

# https://www.omdbapi.com/?apikey=YOURAPIKEYHERE&t=MOVIETITLEHERE&type=movie&plot=full
# HTTPS keeps the API key (sent as a query parameter) off the wire in clear text.
omdb_base_url = 'https://www.omdbapi.com/'

# One OMDb payload (dict) per title that was actually found.
movie_details = []

for cleaned_movie_title in cleaned_movie_titles:
    params = {
        'apikey': omdb_apikey,
        't': cleaned_movie_title,
        # The chart only lists films, so restrict the title lookup to movies.
        'type': 'movie'
    }

    try:
        # Without a timeout a stalled connection would hang the whole cell.
        movie_response = requests.get(omdb_base_url, params=params, timeout=10)
    except requests.RequestException as error:
        # A single network failure should not abort the remaining lookups.
        print(f'OMDB API request failed for: {cleaned_movie_title} - {error}')
        continue

    if movie_response.status_code != 200:
        print(f'OMDB API request failed for: {cleaned_movie_title} - Status Code: {movie_response.status_code}')
        continue

    movie_detail = movie_response.json()

    # OMDb answers HTTP 200 even when nothing matched; the payload then carries
    # Response == "False" plus an Error message. Such payloads have none of the
    # movie fields, so they must not be stored as movie details.
    if movie_detail.get('Response') != 'True':
        print(f"OMDB API returned no match for: {cleaned_movie_title} - {movie_detail.get('Error')}")
        continue

    print(f'Successfully fetched movie details for: {cleaned_movie_title}')
    movie_details.append(movie_detail)

In [None]:
def extract_rating(ratings, source):
    """Return the 'Value' of the first rating whose 'Source' equals ``source``.

    ``ratings`` is an iterable of dicts that each carry 'Source' and 'Value'
    keys. ``None`` is returned when no entry matches (including when
    ``ratings`` is empty).
    """
    matching_values = (entry['Value'] for entry in ratings if entry['Source'] == source)
    return next(matching_values, None)

# Flatten each OMDb payload into one record holding only the fields analysed
# below. Keys missing from a payload become None via dict.get; the Rotten
# Tomatoes score lives inside the nested 'Ratings' list, so it is looked up
# with extract_rating.
movies_data = [
    {
        'Title': detail.get('Title'),
        'Year': detail.get('Year'),
        'Genre': detail.get('Genre'),
        'Awards': detail.get('Awards'),
        'IMDb Rating': detail.get('imdbRating'),
        'Rotten Tomatoes Tomatometer': extract_rating(detail.get('Ratings', []), 'Rotten Tomatoes'),
        'Metacritic Metascore': detail.get('Metascore'),
        'Box Office': detail.get('BoxOffice'),
    }
    for detail in movie_details
]

movies_df = pd.DataFrame(movies_data)
print(movies_df)

In [None]:
def extract_awards(awards_str):
    """Pull win and nomination counts out of an awards summary string.

    Returns a ``(total_wins, total_nominations)`` tuple of ints. Each count is
    the number immediately preceding the first "win" / "nomination" in the
    text, or 0 when that phrase is absent. A null input (``None``/NaN as
    judged by ``pd.isnull``) yields ``(0, 0)``.
    """
    if pd.isnull(awards_str):
        return 0, 0

    def _first_count(pattern):
        # Number captured by the first match of ``pattern``, or 0 without a match.
        hit = re.search(pattern, awards_str)
        return int(hit.group(1)) if hit else 0

    return _first_count(r'(\d+) win'), _first_count(r'(\d+) nomination')

# Parse every Awards string once and line the (wins, nominations) pairs up
# with the frame's index so they can be attached as two new columns.
award_counts = pd.DataFrame(
    [extract_awards(awards_text) for awards_text in movies_df['Awards']],
    index=movies_df.index,
)
movies_df[['Total Wins', 'Total Nominations']] = award_counts

# The raw text is no longer needed once the counts have been extracted.
movies_df = movies_df.drop(columns=['Awards'])

print(movies_df)
movies_df.to_csv('IMDb Top 250 Movies.csv', index=False)

In [None]:
# Split the comma-separated genre string into a list per movie; the
# 'Genre List' column is reused by the later genre charts.
movies_df['Genre List'] = movies_df['Genre'].str.split(', ')

# One entry per (movie, genre) pair, then a tally per genre.
all_genres = movies_df['Genre List'].explode()
genre_counts = all_genres.value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
genre_counts.plot.bar(ax=ax, color='skyblue')
ax.set_title('Most Common Genres in IMDb Top 250 Movies')
ax.set_ylabel('Count')
ax.set_xlabel('Genre')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y')
fig.savefig('Most Common Genres in IMDb Top 250 Movies.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Ratings arrive as text; anything that is not a number becomes NaN.
movies_df['IMDb Rating'] = pd.to_numeric(movies_df['IMDb Rating'], errors='coerce')

# A movie with several genres contributes its rating to each of them.
movies_by_genre = movies_df.explode('Genre List')
genre_ratings = movies_by_genre.groupby('Genre List')['IMDb Rating'].mean()
top_genre_ratings = genre_ratings.sort_values(ascending=False).head(10)

fig, ax = plt.subplots(figsize=(10, 6))
top_genre_ratings.plot.bar(ax=ax, color='orange')
ax.set_title('Top 10 Genres by Average IMDb Rating')
ax.set_ylabel('Average IMDb Rating')
ax.set_xlabel('Genre')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y')
# Restrict the y axis to the 8-9 band so differences between bars are visible.
ax.set_ylim(8, 9)
fig.savefig('Top 10 Genres by Average IMDb Rating.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Remove the currency symbol and thousands separators so the text can be
# parsed as a number. The pattern is a raw string: '\$' inside a normal
# string literal is an invalid escape sequence that Python warns about.
movies_df['Box Office'] = movies_df['Box Office'].str.replace(r'[\$,]', '', regex=True)

# Anything that still is not numeric becomes NaN.
movies_df['Box Office'] = pd.to_numeric(movies_df['Box Office'], errors='coerce')

# Rows without a box-office figure are dropped rather than zero-filled.
# NOTE(review): this rebinds movies_df, so every later cell only sees movies
# that have a box-office value -- confirm that is intended for the later charts.
# copy() hands later cells an independent frame, so their column assignments
# do not trigger pandas' chained-assignment (SettingWithCopy) warning.
movies_df = movies_df.dropna(subset=['Box Office']).copy()

# A movie with several genres contributes its box office to each of them.
genre_boxoffice = movies_df.explode('Genre List').groupby('Genre List')['Box Office'].mean()

plt.figure(figsize=(10, 6))
genre_boxoffice.sort_values(ascending=False).head(10).plot(kind='bar', color='lightgreen')
plt.title('Top 10 Genres by Average Box Office')
plt.ylabel('Average Box Office')
plt.xlabel('Genre')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.savefig('Top 10 Genres by Average Box Office.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
print('Summary Statistics for IMDb Rating:')
print(movies_df['IMDb Rating'].describe())

# Strip the percent sign and parse the score as a number. The pattern is a raw
# string: '\%' inside a normal string literal is an invalid escape sequence.
# Values that cannot be parsed (including missing scores) end up as NaN, so no
# separate empty-string replacement is needed before dropping them.
movies_df['Rotten Tomatoes Tomatometer'] = movies_df['Rotten Tomatoes Tomatometer'].str.replace(r'[\%]', '', regex=True)
movies_df['Rotten Tomatoes Tomatometer'] = pd.to_numeric(movies_df['Rotten Tomatoes Tomatometer'], errors='coerce')
# NOTE(review): each dropna below rebinds movies_df, so all statistics printed
# after it (and all later cells) only cover movies that have that score.
# copy() keeps the following column assignment off a derived frame.
movies_df = movies_df.dropna(subset=['Rotten Tomatoes Tomatometer']).copy()
print('Summary Statistics for Rotten Tomatoes Tomatometer:')
print(movies_df['Rotten Tomatoes Tomatometer'].describe())

movies_df['Metacritic Metascore'] = pd.to_numeric(movies_df['Metacritic Metascore'], errors='coerce')
movies_df = movies_df.dropna(subset=['Metacritic Metascore']).copy()
print('Summary Statistics for Metacritic Metascore:')
print(movies_df['Metacritic Metascore'].describe())

# These columns are already numeric at this point; just report them.
for summary_column in ('Box Office', 'Total Wins', 'Total Nominations'):
    print(f'Summary Statistics for {summary_column}:')
    print(movies_df[summary_column].describe())

In [None]:
# Scatter of box office against IMDb rating, one point per row of movies_df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=movies_df, x='IMDb Rating', y='Box Office', color='blue', ax=ax)
ax.set_title('Box Office vs. IMDb Rating')
ax.set_xlabel('IMDb Rating')
ax.set_ylabel('Box Office (in dollars)')
ax.grid(True)
fig.savefig('Box Office vs. IMDb Rating.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Scatter of box office against the Rotten Tomatoes score, one point per row of movies_df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=movies_df, x='Rotten Tomatoes Tomatometer', y='Box Office', color='red', ax=ax)
ax.set_title('Box Office vs. Rotten Tomatoes Tomatometer')
ax.set_xlabel('Rotten Tomatoes Tomatometer')
ax.set_ylabel('Box Office (in dollars)')
ax.grid(True)
fig.savefig('Box Office vs. Rotten Tomatoes Tomatometer.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Scatter of box office against the Metacritic score, one point per row of movies_df.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=movies_df, x='Metacritic Metascore', y='Box Office', color='green', ax=ax)
ax.set_title('Box Office vs. Metacritic Metascore')
ax.set_xlabel('Metacritic Metascore')
ax.set_ylabel('Box Office (in dollars)')
ax.grid(True)
fig.savefig('Box Office vs. Metacritic Metascore.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Columns that take part in the pairwise correlation analysis.
numeric_variables = [
    'IMDb Rating',
    'Rotten Tomatoes Tomatometer',
    'Metacritic Metascore',
    'Box Office',
    'Total Wins',
    'Total Nominations',
]
numeric_frame = movies_df[numeric_variables]
correlation_matrix = numeric_frame.corr()

# Annotated heatmap; each cell shows the coefficient with two decimals.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Analysis Between Numeric Variables')
fig.savefig('Correlation Analysis Between Numeric Variables.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Years arrive as text; values that are not a plain number become NaN.
movies_df['Year'] = pd.to_numeric(movies_df['Year'], errors='coerce')

# Floor each year to the start of its decade (e.g. 1994 -> 1990).
movies_df['Decade'] = movies_df['Year'].floordiv(10).mul(10)

# Number of movies per decade, in chronological order.
movies_per_decade = movies_df['Decade'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
movies_per_decade.plot.bar(ax=ax, color='skyblue')
ax.set_title('Distribution of IMDb Top 250 Movies by Decade')
ax.set_xlabel('Decade')
ax.set_ylabel('Number of Movies')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y')
fig.savefig('Distribution of IMDb Top 250 Movies by Decade.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Average each score per decade. The Rotten Tomatoes and Metacritic averages
# are divided by 10 so all three lines share the IMDb-style 0-10 axis.
# The scaling is applied to the aggregated means rather than written back
# into movies_df: re-running this cell therefore always gives the same chart,
# and movies_df keeps both scores on their original scale.
decade_groups = movies_df.groupby('Decade')

imdb_ratings_by_decade = decade_groups['IMDb Rating'].mean()
rotten_tomatoes_tomatometer_by_decade = decade_groups['Rotten Tomatoes Tomatometer'].mean() / 10
metacritic_metascore_by_decade = decade_groups['Metacritic Metascore'].mean() / 10

plt.figure(figsize=(10, 6))
plt.plot(imdb_ratings_by_decade.index, imdb_ratings_by_decade, marker='o', label='IMDb Rating', color='orange')
plt.plot(rotten_tomatoes_tomatometer_by_decade.index, rotten_tomatoes_tomatometer_by_decade, marker='o', label='Rotten Tomatoes', color='red')
plt.plot(metacritic_metascore_by_decade.index, metacritic_metascore_by_decade, marker='o', label='Metacritic', color='blue')
plt.title('Average Rating by Decade')
plt.xlabel('Decade')
plt.ylabel('Average Rating')
plt.grid()
plt.legend()
plt.savefig('Average Rating by Decade.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Ad-hoc lookup of a single title to inspect the raw OMDb payload.
# HTTPS keeps the API key (sent as a query parameter) off the wire in clear text.
omdb_base_url = 'https://www.omdbapi.com/'

params = {
    'apikey': omdb_apikey,
    't': 'Memories of Murder'
}

# Without a timeout a stalled connection would hang the cell indefinitely.
movie_response = requests.get(omdb_base_url, params=params, timeout=10)

# Last expression of the cell, so the decoded JSON is shown as the cell output.
movie_response.json()