In [27]:
import pandas as pd
import json
import re

def load_imdb_data(file_path):
    """Load and process IMDb-style JSON data"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error loading IMDb data: {e}")
        return pd.DataFrame()
    
    df = pd.DataFrame(data)
    
    # Check if required columns exist
    if df.empty:
        print("IMDb data is empty")
        return df
    
    # Clean and transform data
    df = df.rename(columns={
        'duration': 'runtime',
        'rating': 'score'
    })
    
    # Convert duration to minutes (improved regex handling)
    def parse_runtime(runtime_str):
        """
        Parse runtime string and convert to total minutes
        
        Args:
            runtime_str (str): Runtime string in format like '2h 30m'
        
        Returns:
            int: Total runtime in minutes
        """
        if pd.isna(runtime_str):
            return 0
        
        # Default to 0 if no conversion possible
        total_minutes = 0
        
        # Extract hours
        hours_match = re.search(r'(\d+)h', str(runtime_str))
        hours = int(hours_match.group(1)) * 60 if hours_match else 0
        
        # Extract minutes
        minutes_match = re.search(r'(\d+)m', str(runtime_str))
        minutes = int(minutes_match.group(1)) if minutes_match else 0
        
        return hours + minutes

    if 'runtime' in df.columns:
        df['runtime_min'] = df['runtime'].apply(parse_runtime)
    
    # Add source identifier
    df['source'] = 'IMDb'
    
    # Convert scraped_at to datetime (with error handling)
    if 'scraped_at' in df.columns:
        df['scraped_at'] = pd.to_datetime(df['scraped_at'], errors='coerce')
    
    # Clean numerical columns
    df['year'] = pd.to_numeric(df['year'], errors='coerce')
    df['score'] = pd.to_numeric(df['score'], errors='coerce')
    
    return df

def load_tmdb_data(file_path):
    """Load and process TMDB-style JSON data"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error loading TMDB data: {e}")
        return pd.DataFrame()
    
    df = pd.DataFrame(data)
    
    # Check if required columns exist
    if df.empty:
        print("TMDB data is empty")
        return df
    
    # Clean and transform data
    df = df.rename(columns={
        'runtime': 'duration',
        'score': 'rating'
    })
    
    # Process director information (with error handling)
    def get_director_name(directors):
        try:
            return directors[0]['name'] if directors else None
        except (IndexError, KeyError):
            return None

    def get_director_role(directors):
        try:
            return directors[0]['role'] if directors else None
        except (IndexError, KeyError):
            return None

    df['director_name'] = df['directors'].apply(get_director_name)
    df['director_role'] = df['directors'].apply(get_director_role)
    
    # Convert runtime to minutes (improved regex handling)
    def parse_runtime(runtime_str):
        """
        Parse runtime string and convert to total minutes
        
        Args:
            runtime_str (str): Runtime string in format like '2h 30m'
        
        Returns:
            int: Total runtime in minutes
        """
        if pd.isna(runtime_str):
            return 0
        
        # Default to 0 if no conversion possible
        total_minutes = 0
        
        # Extract hours
        hours_match = re.search(r'(\d+)h', str(runtime_str))
        hours = int(hours_match.group(1)) * 60 if hours_match else 0
        
        # Extract minutes
        minutes_match = re.search(r'(\d+)m', str(runtime_str))
        minutes = int(minutes_match.group(1)) if minutes_match else 0
        
        return hours + minutes

    if 'duration' in df.columns:
        df['runtime_min'] = df['duration'].apply(parse_runtime)
    
    # Add source identifier
    df['source'] = 'TMDB'
    
    # Clean numerical columns
    df['year'] = pd.to_numeric(df['year'], errors='coerce')
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    
    # Handle missing posters
    df['poster'] = df['poster'].replace('N/A', pd.NA)
    
    # Drop directors column
    return df.drop(columns=['directors'])

def combine_datasets(imdb_path, tmdb_path):
    """Combine both datasets into a unified DataFrame"""
    imdb_df = load_imdb_data(imdb_path)
    tmdb_df = load_tmdb_data(tmdb_path)
    
    # Check if both dataframes are valid
    if imdb_df.empty or tmdb_df.empty:
        print("One or both datasets are empty")
        return pd.DataFrame()
    
    # Align columns between datasets
    all_columns = set(list(imdb_df.columns) + list(tmdb_df.columns))
    
    for col in all_columns:
        if col not in imdb_df.columns:
            imdb_df[col] = pd.NA
        if col not in tmdb_df.columns:
            tmdb_df[col] = pd.NA
    
    # Combine datasets
    combined_df = pd.concat([imdb_df, tmdb_df], ignore_index=True)
    
    # Standardize column order
    column_order = [
        'source', 'title', 'year', 'rating', 'score', 'genres',
        'runtime', 'runtime_min', 'director_name', 'director_role',
        'actors', 'description', 'poster', 'url', 'scraped_at'
    ]
    
    # Ensure all columns in column_order exist in the DataFrame
    existing_columns = [col for col in column_order if col in combined_df.columns]
    
    return combined_df.reindex(columns=existing_columns)

# Example usage

In [29]:

imdb_path = "data_1.json"
tmdb_path = "data_2.json"

final_df = combine_datasets(imdb_path, tmdb_path)

# Display results
print("Combined DataFrame Shape:", final_df.shape)
print("\nFirst 3 rows:")
print(final_df.head(3))


Combined DataFrame Shape: (2724, 15)

First 3 rows:
  source        title  year rating  score  \
0   IMDb  City of God  2002   <NA>    8.6   
1   IMDb       Wicked  2024   <NA>    7.3   
2   IMDb  Last Breath  2025   <NA>    7.4   

                                              genres runtime  runtime_min  \
0     [Caper, Coming-of-Age, Gangster, Crime, Drama]  2h 10m          130   
1  [Fairy Tale, Pop Musical, Fantasy, Musical, Ro...  2h 40m          160   
2                   [Sea Adventure, Drama, Thriller]  1h 33m           93   

  director_name director_role  \
0           NaN           NaN   
1           NaN           NaN   
2           NaN           NaN   

                                              actors  \
0  [Alexandre Rodrigues, Leandro Firmino, Matheus...   
1  [Cynthia Erivo, Ariana Grande, Jeff Goldblum, ...   
2  [Woody Harrelson, Simu Liu, Finn Cole, Cliff C...   

                                         description poster  \
0  In the slums of Rio, two kids' pat

  combined_df = pd.concat([imdb_df, tmdb_df], ignore_index=True)


In [35]:
final_df.head()

Unnamed: 0,source,title,year,rating,score,genres,runtime,runtime_min,director_name,director_role,actors,description,poster,url,scraped_at
0,IMDb,City of God,2002,,8.6,"[Caper, Coming-of-Age, Gangster, Crime, Drama]",2h 10m,130,,,"[Alexandre Rodrigues, Leandro Firmino, Matheus...","In the slums of Rio, two kids' paths diverge a...",,https://www.imdb.com/title/tt0317248/?ref_=cht...,2025-03-19 23:15:36.598273
1,IMDb,Wicked,2024,,7.3,"[Fairy Tale, Pop Musical, Fantasy, Musical, Ro...",2h 40m,160,,,"[Cynthia Erivo, Ariana Grande, Jeff Goldblum, ...","Elphaba, a young woman ridiculed for her green...",,https://www.imdb.com/title/tt1262426/?ref_=cht...,2025-03-19 23:15:39.175598
2,IMDb,Last Breath,2025,,7.4,"[Sea Adventure, Drama, Thriller]",1h 33m,93,,,"[Woody Harrelson, Simu Liu, Finn Cole, Cliff C...",A true story that follows seasoned deep-sea di...,,https://www.imdb.com/title/tt14403504/?ref_=ch...,2025-03-19 23:15:40.768481
3,IMDb,Gladiator II,2024,,8.5,"[Action Epic, Adventure Epic, Epic, Period Dra...",2h 28m,148,,,"[Paul Mescal, Denzel Washington, Pedro Pascal,...",After his home is conquered by the tyrannical ...,,https://www.imdb.com/title/tt9218128/?ref_=sr_...,2025-03-19 23:15:43.311106
4,IMDb,Suits,2011,,8.1,"[Legal Drama, Workplace Drama, Comedy, Drama]",,0,,,"[Gabriel Macht, Patrick J. Adams, Meghan Markl...","On the run from a drug deal gone bad, brillian...",,https://www.imdb.com/title/tt1632701/?ref_=sr_...,2025-03-19 23:15:46.232745


In [37]:
import ast
def process_combined_dataset(combined_movies):
    """
    Process the combined dataset by:
    1. Removing duplicates by title
    2. Expanding genres into individual columns
    
    Args:
        combined_movies (pd.DataFrame): Combined movie dataset
    
    Returns:
        pd.DataFrame: Processed dataset
    """
    # Remove duplicates by title, keeping the first occurrence
    df = combined_movies.drop_duplicates(subset=['title'], keep='first')
    
    # Safely convert genres to list if it's a string representation
    def parse_genres(genres):
        try:
            # If it's already a list, return it
            if isinstance(genres, list):
                return genres
            # If it's a string representation of a list, use ast.literal_eval
            elif isinstance(genres, str) and genres.startswith('['):
                return ast.literal_eval(genres)
            # If it's None or empty, return an empty list
            else:
                return []
        except:
            return []
    
    # Parse genres
    df['parsed_genres'] = df['genres'].apply(parse_genres)
    
    # Get unique genres across all movies
    all_genres = set()
    for genre_list in df['parsed_genres']:
        all_genres.update(genre_list)
    
    # Create one-hot encoded columns for genres
    for genre in sorted(all_genres):
        # Create column name that's safe for Python (lowercase, replace spaces)
        col_name = f'genre_{genre.lower().replace(" ", "_")}'
        df[col_name] = df['parsed_genres'].apply(lambda x: 1 if genre in x else 0)
    
    # Optional: Drop the original and parsed genres columns if you want
    df = df.drop(columns=['genres', 'parsed_genres'])
    
    return df


In [39]:
df_clean=process_combined_dataset(final_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['parsed_genres'] = df['genres'].apply(parse_genres)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df['parsed_genres'].apply(lambda x: 1 if genre in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df['parsed_genres'].apply(lambda x: 1 if genre in x else 0)


In [41]:
df_clean

Unnamed: 0,source,title,year,rating,score,runtime,runtime_min,director_name,director_role,actors,...,genre_true_crime,genre_urban_adventure,genre_vampire_horror,genre_war,genre_war_epic,genre_western,genre_western_epic,genre_whodunnit,genre_workplace_drama,genre_zombie_horror
0,IMDb,City of God,2002,,8.6,2h 10m,130,,,"[Alexandre Rodrigues, Leandro Firmino, Matheus...",...,0,0,0,0,0,0,0,0,0,0
1,IMDb,Wicked,2024,,7.3,2h 40m,160,,,"[Cynthia Erivo, Ariana Grande, Jeff Goldblum, ...",...,0,0,0,0,0,0,0,0,0,0
2,IMDb,Last Breath,2025,,7.4,1h 33m,93,,,"[Woody Harrelson, Simu Liu, Finn Cole, Cliff C...",...,0,0,0,0,0,0,0,0,0,0
3,IMDb,Gladiator II,2024,,8.5,2h 28m,148,,,"[Paul Mescal, Denzel Washington, Pedro Pascal,...",...,0,0,0,0,0,0,0,0,0,0
4,IMDb,Suits,2011,,8.1,,0,,,"[Gabriel Macht, Patrick J. Adams, Meghan Markl...",...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2719,TMDB,Jackass Forever,2022,68,,,96,Jeff Tremaine,"Director, Writer",,...,0,0,0,0,0,0,0,0,0,0
2720,TMDB,The Best Years,2020,68,,,129,Gabriele Muccino,"Director, Screenplay, Story",,...,0,0,0,0,0,0,0,0,0,0
2721,TMDB,Ashfall,2019,68,,,128,Kim Byung-seo,"Director, Writer",,...,0,0,0,0,0,0,0,0,0,0
2722,TMDB,Kung Fu Jungle,2014,68,,,100,Teddy Chan Tak-Sum,"Director, Story",,...,0,0,0,0,0,0,0,0,0,0


In [43]:
final_df.to_csv('sample_data.csv')

In [50]:
import pandas as pd
from ast import literal_eval

# Load data
df = pd.read_csv("sample_data.csv")

# Handle missing values before conversions
df['genres'] = df['genres'].fillna("[]").apply(literal_eval)  # Handle NaN in genres
df['actors'] = df['actors'].fillna("[]").apply(literal_eval)   # Handle NaN in actors
df['director_name'] = df['director_name'].fillna("")          # Handle NaN in director
df['description'] = df['description'].fillna("")              # Handle NaN in description

# Create combined feature
df['combined_features'] = df.apply(
    lambda row: ' '.join(row['genres']) + ' ' +
                row['director_name'] + ' ' +
                ' '.join(row['actors']) + ' ' +
                row['description'],
    axis=1
)

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Recommendation function
def recommend_movies(title, cosine_sim=cosine_sim, df=df):
    idx = df[df['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]  # Top 5 recommendations
    movie_indices = [i[0] for i in sim_scores]
    return df[['title', 'genres', 'score', 'year']].iloc[movie_indices]

# Test: Recommend movies similar to "City of God"
print(recommend_movies("City of God"))

                        title                     genres  score  year
2173              7 Prisoners             [Drama, Crime]    NaN  2021
899   My Mom Is a Character 2                   [Comedy]    NaN  2016
1998   All That Heaven Allows           [Drama, Romance]    NaN  1955
2666                    Them!  [Science Fiction, Horror]    NaN  1954
2277                 Blue Jay           [Drama, Romance]    NaN  2016


In [54]:
# Test recommendations for "City of God"
test_movie = "City of God"
recommendations = recommend_movies(test_movie)
print(f"Movies similar to {test_movie}:")
print(recommendations)

Movies similar to City of God:
                        title                     genres  score  year
2173              7 Prisoners             [Drama, Crime]    NaN  2021
899   My Mom Is a Character 2                   [Comedy]    NaN  2016
1998   All That Heaven Allows           [Drama, Romance]    NaN  1955
2666                    Them!  [Science Fiction, Horror]    NaN  1954
2277                 Blue Jay           [Drama, Romance]    NaN  2016
