## Importing Libraries

In [12]:
import os
import sqlite3

import pandas as pd


## Data Loading

In [13]:
def load_box_office_mojo_data(filepath, compression='gzip'):
    """
    Read the Box Office Mojo CSV into a DataFrame.

    Parameters:
    - filepath: path to the CSV file
    - compression: compression mode forwarded to pandas.read_csv. Defaults to
      'gzip' (the previous hard-coded value); pass 'infer' or None to read an
      uncompressed CSV.

    Returns:
    - DataFrame with the file's contents
    """
    return pd.read_csv(filepath, compression=compression)


In [14]:
def load_imdb_data(db_path):
    """
    Load the 'movie_basics' and 'movie_ratings' tables from a SQLite database.

    Parameters:
    - db_path: path to an existing SQLite database file

    Returns:
    - Tuple (movie_basics, movie_ratings) of DataFrames, one per table

    Raises:
    - FileNotFoundError: if no file exists at db_path
    """
    # sqlite3.connect() creates an empty database when the file is missing, so
    # check first and fail with a clear FileNotFoundError instead.
    if not os.path.isfile(db_path):
        raise FileNotFoundError(f"SQLite database not found: {db_path}")

    conn = sqlite3.connect(db_path)
    try:
        movie_basics = pd.read_sql_query("SELECT * FROM movie_basics", conn)
        movie_ratings = pd.read_sql_query("SELECT * FROM movie_ratings", conn)
    finally:
        # Release the connection even if one of the queries fails.
        conn.close()
    return movie_basics, movie_ratings

In [15]:
import pandas as pd
import sqlite3
import os

def load_box_office_mojo_data(filepath):
    """Read the Box Office Mojo CSV at `filepath` and return it as a DataFrame."""
    bom_frame = pd.read_csv(filepath)
    return bom_frame

def load_imdb_data(db_path):
    """
    Load the 'movie_basics' and 'movie_ratings' tables from a SQLite database.

    Parameters:
    - db_path: path to an existing SQLite database file

    Returns:
    - Tuple (movie_basics, movie_ratings) of DataFrames, one per table

    Raises:
    - FileNotFoundError: if no file exists at db_path
    """
    # sqlite3.connect() creates an empty database when the file is missing, so
    # check first; this lets the FileNotFoundError handler below report it.
    if not os.path.isfile(db_path):
        raise FileNotFoundError(f"SQLite database not found: {db_path}")

    conn = sqlite3.connect(db_path)
    try:
        movie_basics = pd.read_sql_query("SELECT * FROM movie_basics", conn)
        movie_ratings = pd.read_sql_query("SELECT * FROM movie_ratings", conn)
    finally:
        # Release the connection even if one of the queries fails.
        conn.close()
    return movie_basics, movie_ratings

# Report the working directory so the relative data paths below can be checked
print(f"Current working directory: {os.getcwd()}")

# Input file locations, relative to the working directory
box_office_mojo_path, imdb_data_path = 'Data/bom.movie_gross.csv', 'Data/im.db'

try:
    bom_data = load_box_office_mojo_data(box_office_mojo_path)
    movie_basics, movie_ratings = load_imdb_data(imdb_data_path)

    # Preview the first rows of each dataset under its own heading
    for section_heading, section_frame in (
        ("Box Office Mojo Data:", bom_data),
        ("\nMovie Basics Data:", movie_basics),
        ("\nMovie Ratings Data:", movie_ratings),
    ):
        print(section_heading)
        display(section_frame.head())

except FileNotFoundError as missing_file:
    print(f"Error: {missing_file}")
    print("Please ensure that the files 'bom.movie_gross.csv' and 'im.db' are located in the 'Data' directory.")


Current working directory: d:\DS-PROJECT ONE
Box Office Mojo Data:


Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010



Movie Basics Data:


Unnamed: 0,movie_id,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"



Movie Ratings Data:


Unnamed: 0,movie_id,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


## Data Cleaning

In [16]:
def clean_box_office_data(df):
    """
    Return a cleaned copy of the box office data.

    Rows whose 'domestic_gross' is missing are dropped, then any '$' and ','
    characters are stripped from the remaining values and the column is
    converted to float.

    Parameters:
    - df: DataFrame containing a 'domestic_gross' column

    Returns:
    - New DataFrame with 'domestic_gross' as float
    """
    gross_present = df.dropna(subset=['domestic_gross'])

    # Raw string: '\$' in a plain string literal is an invalid escape sequence.
    gross_as_float = (
        gross_present['domestic_gross']
        .replace(r'[\$,]', '', regex=True)
        .astype(float)
    )

    # assign() builds a new frame instead of writing into the dropna() result,
    # which pandas can flag with SettingWithCopyWarning.
    return gross_present.assign(domestic_gross=gross_as_float)

def merge_imdb_data(movie_basics, movie_ratings, on=None):
    """
    Inner-join the IMDb basics and ratings tables on their shared movie key.

    Parameters:
    - movie_basics: DataFrame containing basic movie information
    - movie_ratings: DataFrame containing movie ratings
    - on: key column to merge on. When None, 'tconst' is used if both frames
      have it, otherwise 'movie_id' (the key shown in the loaded tables).

    Returns:
    - DataFrame with one row per movie present in both inputs

    Raises:
    - KeyError: if `on` is None and neither key column is present in both frames
    """
    if on is None:
        # 'tconst' is tried first so frames that worked before keep merging
        # on the same column.
        for candidate in ('tconst', 'movie_id'):
            if candidate in movie_basics.columns and candidate in movie_ratings.columns:
                on = candidate
                break
        else:
            raise KeyError(
                "Neither 'tconst' nor 'movie_id' is present in both DataFrames"
            )
    return pd.merge(movie_basics, movie_ratings, on=on, how='inner')


In [19]:
import pandas as pd
import sqlite3
import os


In [20]:
def load_box_office_mojo_data(filepath):
    """Parse the Box Office Mojo CSV file at `filepath` into a DataFrame."""
    gross_table = pd.read_csv(filepath)
    return gross_table


In [21]:
# Function to load IMDb data
def load_imdb_data(db_path):
    """
    Load the 'movie_basics' and 'movie_ratings' tables from a SQLite database.

    Parameters:
    - db_path: path to an existing SQLite database file

    Returns:
    - Tuple (movie_basics, movie_ratings) of DataFrames, one per table

    Raises:
    - FileNotFoundError: if no file exists at db_path
    """
    # sqlite3.connect() creates an empty database when the file is missing, so
    # check first and fail with a clear FileNotFoundError instead.
    if not os.path.isfile(db_path):
        raise FileNotFoundError(f"SQLite database not found: {db_path}")

    conn = sqlite3.connect(db_path)
    try:
        movie_basics = pd.read_sql_query("SELECT * FROM movie_basics", conn)
        movie_ratings = pd.read_sql_query("SELECT * FROM movie_ratings", conn)
    finally:
        # Release the connection even if one of the queries fails.
        conn.close()
    return movie_basics, movie_ratings

In [24]:
# Function to clean box office data
def clean_box_office_data(df):
    """
    Clean the box office data by dropping rows with missing values in the 'domestic_gross' column
    and converting the 'domestic_gross' column to float.

    Parameters:
    - df: DataFrame containing box office data

    Returns:
    - Cleaned DataFrame (a new frame; `df` itself is not written to)
    """
    # Drop rows with missing values in the 'domestic_gross' column
    rows_with_gross = df.dropna(subset=['domestic_gross'])

    # Remove any commas and dollar signs and convert to float. The pattern is a
    # raw string because '\$' in a plain string literal is an invalid escape.
    numeric_gross = (
        rows_with_gross['domestic_gross']
        .replace(r'[\$,]', '', regex=True)
        .astype(float)
    )

    # assign() builds a new frame instead of writing into the dropna() result,
    # which pandas can flag with SettingWithCopyWarning.
    return rows_with_gross.assign(domestic_gross=numeric_gross)

In [25]:

# Function to clean and merge IMDb data
def clean_imdb_data(movie_basics, movie_ratings, on=None):
    """
    Merge the 'movie_basics' and 'movie_ratings' DataFrames on their shared
    movie key with an inner join.

    Parameters:
    - movie_basics: DataFrame containing basic movie information
    - movie_ratings: DataFrame containing movie ratings
    - on: key column to merge on. When None, 'tconst' is used if both frames
      have it, otherwise 'movie_id' (the key shown in the loaded tables).

    Returns:
    - DataFrame containing only the movies present in both inputs

    Raises:
    - KeyError: if `on` is None and neither key column is present in both frames
    """
    if on is None:
        # 'tconst' is tried first so frames that worked before keep merging
        # on the same column.
        for candidate in ('tconst', 'movie_id'):
            if candidate in movie_basics.columns and candidate in movie_ratings.columns:
                on = candidate
                break
        else:
            raise KeyError(
                "Neither 'tconst' nor 'movie_id' is present in both DataFrames"
            )

    merged_data = pd.merge(movie_basics, movie_ratings, on=on, how='inner')

    return merged_data

In [26]:
# Show the directory that relative data paths are resolved against
working_dir = os.getcwd()
print(f"Current working directory: {working_dir}")

Current working directory: d:\DS-PROJECT ONE


In [27]:
# Input file locations, relative to the working directory; adjust to your layout
box_office_mojo_path, imdb_data_path = (
    'Data/bom.movie_gross.csv',
    'Data/im.db',
)


In [36]:
def clean_box_office_data(df):
    """
    Return a cleaned copy of the box office data.

    Rows with a missing 'domestic_gross' are dropped; '$' and ',' characters
    are then removed from the remaining values and the column is converted
    to float.

    Parameters:
    - df: DataFrame containing a 'domestic_gross' column

    Returns:
    - New DataFrame with 'domestic_gross' as float; `df` is not written to
    """
    # Drop rows with missing values in 'domestic_gross' column
    df_cleaned = df.dropna(subset=['domestic_gross'])

    # Raw string: '\$' in a plain string literal is an invalid escape sequence.
    gross_values = (
        df_cleaned['domestic_gross']
        .replace(r'[\$,]', '', regex=True)
        .astype(float)
    )

    # Copying before dropna() still left the assignment targeting the dropna()
    # result, which pandas can flag with SettingWithCopyWarning. assign()
    # returns a fresh frame instead.
    return df_cleaned.assign(domestic_gross=gross_values)