In [3]:
# Imports
import os
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

In [2]:
# Load datasets
# Both TMDB extracts are semicolon-delimited, so they are read with the same
# settings and unpacked in order: the ID list first, the feature table second.
tmdb_movie_ids, tmdb_movie_features = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (
        "../data/tmdb/01_movie_ids.csv",
        "../data/tmdb/02_movie_features.csv",
    )
)

In [3]:
# Check a few observations
tmdb_movie_ids.head()

Unnamed: 0,ID,GENRE_IDS,RELEASE_DATE,VOTE_COUNT,VOTE_AVERAGE,TITLE
0,993710,"[28, 35]",2025-01-15,866,6.611,Back in Action
1,1156593,"[10749, 18]",2024-12-26,1020,7.128,Your Fault
2,1097549,[18],2024-12-25,416,5.75,Babygirl
3,426063,"[27, 14]",2024-12-25,1986,6.662,Nosferatu
4,939243,"[28, 878, 35, 10751]",2024-12-19,1644,7.8,Sonic the Hedgehog 3


In [4]:
# Check a few observations
tmdb_movie_features.head()

Unnamed: 0,ID,IMDB_ID,GENRE_ID,GENRE_NAME,RELEASE_DATE,STATUS,TITLE,BUDGET,REVENUE,RUNTIME,VOTE_COUNT,VOTE_AVERAGE,POPULARITY,PRODUCTION_COMPANY_ID,PRODUCTION_COMPANY_NAME,PRODUCTION_COMPANY_ORIGIN_COUNTRY,PRODUCTION_COUNTRY_NAME,SPOKEN_LANGUAGES
0,993710,tt21191806,"28, 35","Action, Comedy",2025-01-15,Released,Back in Action,0,0,114,855,6.63,2134.078,"7076, 121737, 228007","Chernin Entertainment, Exhibit A, Good One","US, US, US",United States of America,"Polish, English"
1,1156593,tt28510079,"10749, 18","Romance, Drama",2024-12-26,Released,Your Fault,0,0,118,1016,7.125,493.501,"32485, 210099","Pokeepsie Films, Amazon MGM Studios","ES, US","Spain, United States of America",Spanish
2,1097549,tt30057084,18,Drama,2024-12-25,Released,Babygirl,20000000,47756311,115,413,5.8,1021.698,"41077, 158407, 121407","A24, 2AM, Man Up Film","US, US, NL","United States of America, Netherlands",English
3,426063,tt5040012,"27, 14","Horror, Fantasy",2024-12-25,Released,Nosferatu,50000000,173394574,133,1981,6.658,1189.734,"10146, 77863, 41641, 239430","Focus Features, Studio 8, Maiden Voyage Pictur...","US, US, US, US",United States of America,"English, German, Romanian, Russian, Spanish"
4,939243,tt18259086,"28, 878, 35, 10751","Action, Science Fiction, Comedy, Family",2024-12-19,Released,Sonic the Hedgehog 3,122000000,471749087,110,1643,7.8,5620.604,"4, 333, 77884, 113750, 10644, 168701","Paramount Pictures, Original Film, Marza Anima...","US, US, JP, JP, US, US","United States of America, Japan",English


In [5]:
# Check column types and missing values
tmdb_movie_ids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ID            1205 non-null   int64  
 1   GENRE_IDS     1205 non-null   object 
 2   RELEASE_DATE  1205 non-null   object 
 3   VOTE_COUNT    1205 non-null   int64  
 4   VOTE_AVERAGE  1205 non-null   float64
 5   TITLE         1205 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 56.6+ KB


In [6]:
# Check column types and missing values
tmdb_movie_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 18 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ID                                 1205 non-null   int64  
 1   IMDB_ID                            1202 non-null   object 
 2   GENRE_ID                           1205 non-null   object 
 3   GENRE_NAME                         1205 non-null   object 
 4   RELEASE_DATE                       1205 non-null   object 
 5   STATUS                             1205 non-null   object 
 6   TITLE                              1205 non-null   object 
 7   BUDGET                             1205 non-null   int64  
 8   REVENUE                            1205 non-null   int64  
 9   RUNTIME                            1205 non-null   int64  
 10  VOTE_COUNT                         1205 non-null   int64  
 11  VOTE_AVERAGE                       1205 non-null   float64

In [7]:
# Drop unwanted columns
# Reassign instead of mutating in place, and ignore labels that are already
# gone, so this cell can be re-run without raising KeyError.
tmdb_movie_ids = tmdb_movie_ids.drop(
    columns=["RELEASE_DATE", "GENRE_IDS", "VOTE_COUNT", "VOTE_AVERAGE", "TITLE"],
    errors="ignore",
)
tmdb_movie_features = tmdb_movie_features.drop(
    columns=["IMDB_ID", "GENRE_ID", "PRODUCTION_COMPANY_ID", "PRODUCTION_COMPANY_ORIGIN_COUNTRY"],
    errors="ignore",
)

In [8]:
# Merge data
# Inner-join the two tables on their shared "ID" key, then discard the key:
# it is only needed to line the rows up and is not kept in the result.
data = (
    tmdb_movie_ids
    .merge(tmdb_movie_features, on="ID", how="inner")
    .drop(columns=["ID"])
)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   GENRE_NAME               1205 non-null   object 
 1   RELEASE_DATE             1205 non-null   object 
 2   STATUS                   1205 non-null   object 
 3   TITLE                    1205 non-null   object 
 4   BUDGET                   1205 non-null   int64  
 5   REVENUE                  1205 non-null   int64  
 6   RUNTIME                  1205 non-null   int64  
 7   VOTE_COUNT               1205 non-null   int64  
 8   VOTE_AVERAGE             1205 non-null   float64
 9   POPULARITY               1205 non-null   float64
 10  PRODUCTION_COMPANY_NAME  1198 non-null   object 
 11  PRODUCTION_COUNTRY_NAME  1201 non-null   object 
 12  SPOKEN_LANGUAGES         1203 non-null   object 
dtypes: float64(2), int64(4), object(7)
memory usage: 122.5+ KB


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   GENRE_NAME               1205 non-null   object 
 1   RELEASE_DATE             1205 non-null   object 
 2   STATUS                   1205 non-null   object 
 3   TITLE                    1205 non-null   object 
 4   BUDGET                   1205 non-null   int64  
 5   REVENUE                  1205 non-null   int64  
 6   RUNTIME                  1205 non-null   int64  
 7   VOTE_COUNT               1205 non-null   int64  
 8   VOTE_AVERAGE             1205 non-null   float64
 9   POPULARITY               1205 non-null   float64
 10  PRODUCTION_COMPANY_NAME  1198 non-null   object 
 11  PRODUCTION_COUNTRY_NAME  1201 non-null   object 
 12  SPOKEN_LANGUAGES         1203 non-null   object 
dtypes: float64(2), int64(4), object(7)
memory usage: 122.5+ KB


In [None]:
# Saving dataset for future use
# os.chdir("..")
# from src.utils import save_dataframe

def save_dataframe(df, filename="file.csv", directory=os.path.join("..", "data"), sep=",", index=False):
    """
    Saves a Pandas dataframe to a CSV file.

    The target directory is created first if it does not exist. Any
    ``Exception`` raised while creating the directory or writing the file is
    caught and reported with ``print`` rather than propagated.

    Parameters:
    df (pd.DataFrame): Dataset to save.
    filename (str): The name of the output CSV file (default: 'file.csv').
    directory (str): The folder where the file should be saved. The default is
        the 'data' folder inside the parent of the working directory, built
        with ``os.path.join`` so it uses the platform's path separator
        ('../data' on POSIX, '..\\data' on Windows).
    sep (str): The separator for the CSV file (default: ',').
    index (bool): Whether to include the index in the saved file (default: False).

    Returns:
    None
    """
    try:
        # Ensure the directory exists (no error if it is already there)
        os.makedirs(directory, exist_ok=True)

        # Full file path
        filepath = os.path.join(directory, filename)

        # Save DataFrame
        df.to_csv(filepath, sep=sep, index=index)
        print(f"✅ Data successfully saved to {filepath} with separator '{sep}'")

    except Exception as e:
        # Best-effort save: report the problem instead of stopping the notebook
        print(f"❌ Error saving file: {e}")


save_dataframe(data, "01_omdb_data_clean.csv")

✅ Data successfully saved to ..\data\01_clean_data.csv with separator ','
