# Imports

In [1]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns
from functions import load_and_clean_all_data, quality_of_movies_by_country

# Data loading

In [2]:
# Input directory holding the raw IMDb TSV dumps, and the directory name for cleaned output.
# NOTE(review): output_dir is not used by any cell shown here — presumably consumed later; verify.
data_dir = 'data_imdb'
output_dir = 'cleaned_data'

# Load and clean every dataset found under data_dir.
# The result is used as a mapping (dataset name -> DataFrame); the recorded output
# of this cell shows the keys 'basics', 'akas' and 'ratings'.
data = load_and_clean_all_data(data_dir)

# Show the first rows of each dataset as a quick sanity check.
# display() is the IPython rich-display helper, so each frame renders as a table.
for key, df in data.items():
    print(f"Sample data from {key}:")
    display(df.head())


Loading data_imdb\title.basics.tsv...
Loading data_imdb\title.akas.tsv...
Loading data_imdb\title.ratings.tsv...
Sample data from basics:


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


Sample data from akas:


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita,,,original,,1
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita,US,,imdbDisplay,,0
3,tt0000001,4,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
4,tt0000001,5,Καρμενσίτα,GR,,imdbDisplay,,0


Sample data from ratings:


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2058
1,tt0000002,5.7,276
2,tt0000003,6.5,2022
3,tt0000004,5.4,179
4,tt0000005,6.2,2787


# Task 1 - Quality of movies by country

In [3]:
# Rank cut-offs N: country appearances are counted within the top-N titles for each N.
top_orders = [10, 20, 50, 100]

# Analyze the quality of movies by country.
# `data` is the mapping of cleaned datasets produced by the data-loading cell.
country_counts = quality_of_movies_by_country(data, top_orders)

# Display country appearances per top-N cut-off. A bare last expression is
# rendered by the notebook (pretty-printed), so no additional print() is needed —
# printing as well emitted the same dictionary twice.
country_counts

Merged dataset size: (5677726, 19)
Filtered dataset size: (2424746, 19)
Size of movies_df after country assignment: (310812, 21)
            tconst              primaryTitle             originalTitle  \
774848   tt0111161  The Shawshank Redemption  The Shawshank Redemption   
1268319  tt0468569           The Dark Knight           The Dark Knight   
1568611  tt1375666                 Inception                 Inception   
876166   tt0137523                Fight Club                Fight Club   
765487   tt0109830              Forrest Gump              Forrest Gump   

         averageRating country  composite_score  
774848             9.3      IS        871760.31  
1268319            9.0      ID        866137.50  
1568611            8.8      GB        769596.16  
876166             8.8      FI        701755.36  
765487             8.8      DK        681472.06  
{10: {'FI': 2, 'GB': 2, 'ID': 1, 'IS': 1, 'DK': 1, 'EE': 1, 'HK': 1, 'NZ': 1}, 20: {'IN': 4, 'GB': 3, 'FI': 2, 'ID': 2, 'NZ': 

{10: {'FI': 2, 'GB': 2, 'ID': 1, 'IS': 1, 'DK': 1, 'EE': 1, 'HK': 1, 'NZ': 1},
 20: {'IN': 4,
  'GB': 3,
  'FI': 2,
  'ID': 2,
  'NZ': 2,
  'IS': 1,
  'EE': 1,
  'DK': 1,
  'HK': 1,
  'IT': 1,
  'EG': 1,
  'PH': 1},
 50: {'IN': 12,
  'GB': 5,
  'NL': 4,
  'IT': 3,
  'DK': 2,
  'ID': 2,
  'EG': 2,
  'EE': 2,
  'FI': 2,
  'NZ': 2,
  'ES': 2,
  'IL': 2,
  'IS': 1,
  'HK': 1,
  'UY': 1,
  'PH': 1,
  'CL': 1,
  'CM': 1,
  'CZ': 1,
  'JP': 1,
  'EC': 1,
  'FR': 1},
 100: {'IN': 24,
  'NL': 7,
  'ID': 7,
  'GB': 6,
  'EG': 5,
  'IL': 4,
  'DK': 4,
  'IE': 4,
  'FR': 4,
  'PH': 3,
  'IT': 3,
  'FI': 3,
  'ES': 3,
  'HK': 3,
  'NZ': 2,
  'EE': 2,
  'JP': 2,
  'UY': 1,
  'IS': 1,
  'CZ': 1,
  'CM': 1,
  'EC': 1,
  'CL': 1,
  'CO': 1,
  'KR': 1,
  'GR': 1,
  'AR': 1,
  'SE': 1,
  'CH': 1,
  'SG': 1,
  'DE': 1}}

In [4]:
def get_movie_country(title_akas):
    """
    Establish a country code for each title from its original-title row(s).

    Rows with isOriginalTitle == 1 are selected. Where such a row has no region,
    the region is filled from the first row of the same titleId (in the order of
    ``title_akas``) that does have one. Titles for which no region can be found
    at all are dropped from the result.

    NOTE(review): the fallback is simply whichever region is listed first for the
    title, i.e. a release/localisation region; it is not necessarily the country
    that produced the movie. Confirm this is acceptable for the analysis.

    Parameters:
    title_akas (pd.DataFrame): DataFrame containing title.akas data. Must provide
        the columns 'titleId', 'region' and 'isOriginalTitle'. It is not modified.

    Returns:
    pd.DataFrame: DataFrame with columns ['titleId', 'country'], one row per
        original-title row that ended up with a region. The index is not reset
        after filtering, so it may be non-contiguous.
    """
    # Keep only the original-title rows and the two columns that are actually used.
    is_original = title_akas['isOriginalTitle'] == 1
    original_titles = title_akas.loc[is_original, ['titleId', 'region']]

    # Fill missing regions by looking at other rows with the same titleId.
    if original_titles['region'].isna().any():
        # First non-missing region per titleId, in input order.
        fallback_regions = (
            title_akas[['titleId', 'region']]
            .dropna()
            .drop_duplicates('titleId')
        )

        # The left merge returns a brand-new frame, so assigning to it below is safe
        # and never touches the caller's DataFrame.
        original_titles = original_titles.merge(
            fallback_regions, on='titleId', how='left', suffixes=('', '_fill')
        )

        # Prefer the row's own region; fall back to the looked-up one.
        original_titles['region'] = original_titles['region'].combine_first(
            original_titles['region_fill']
        )
        original_titles = original_titles.drop(columns=['region_fill'])

    # Drop titles that still have no region, then expose 'region' as 'country'.
    # Non-inplace calls are used throughout to avoid mutating a slice.
    countries = (
        original_titles
        .dropna(subset=['region'])
        .rename(columns={'region': 'country'})
    )

    return countries[['titleId', 'country']]
