# Imports

In [1]:
import os
from utilities import load_data, clean_data
from task1_functions import quality_of_movies_by_country
from task2_functions import total_votes_by_country, average_composite_score_by_country, weighted_average_composite_score_by_country

# Data loading

In [2]:
# Input directory holding the IMDb TSV dumps, plus the output directory
# (output_dir is only defined here; this cell does not use it)
data_dir = 'data_imdb'
output_dir = 'cleaned_data'

# Load then clean each table, one after another, in the order
# basics -> akas -> ratings; each file is named title.<table>.tsv
basics, akas, ratings = (
    clean_data(load_data(os.path.join(data_dir, f'title.{table}.tsv')))
    for table in ('basics', 'akas', 'ratings')
)

Loading data from: data_imdb\title.basics.tsv ...
Loading data from: data_imdb\title.akas.tsv ...
Loading data from: data_imdb\title.ratings.tsv ...


In [3]:
# Preview the cleaned basics table. Rendered through display() so this cell
# matches the akas and ratings preview cells, which also call display()
# rather than relying on the cell's trailing expression.
print("Sample data from basics:")
display(basics.head())

Sample data from basics:


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [4]:
# Preview the cleaned akas table: caption first, then its first five rows
print("Sample data from akas:")
display(akas.head(n=5))

Sample data from akas:


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita,,,original,,1
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita,US,,imdbDisplay,,0
3,tt0000001,4,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
4,tt0000001,5,Καρμενσίτα,GR,,imdbDisplay,,0


In [5]:
# Preview the cleaned ratings table: caption first, then its first five rows
print("Sample data from ratings:")
display(ratings.head(n=5))

Sample data from ratings:


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2058
1,tt0000002,5.7,276
2,tt0000003,6.5,2022
3,tt0000004,5.4,179
4,tt0000005,6.2,2787


# Task 1 - Quality of movies by country

In [6]:
# Sizes N of the "top N" lists to evaluate
top_orders = [10, 20, 50, 100]

# Analyze the quality of movies by country.
# Per the recorded output, the helper prints how many movies have no assigned
# country and returns two objects:
#   country_counts - dict keyed by N, each value a dict of country code -> count
#   movies_df      - merged movie table carrying `country` and `composite_score`
#                    columns; it is reused by the later cells
# NOTE(review): in the recorded output no top-N bucket contains 'US', and the
# first rows of movies_df list e.g. The Shawshank Redemption under 'IS' and
# The Dark Knight under 'ID'. The country assignment inside
# quality_of_movies_by_country looks suspect - verify before trusting these counts.
country_counts, movies_df = quality_of_movies_by_country(basics, ratings, akas, top_orders)

# Display how often each country appears in every top-N list
country_counts

There are 665 movies without an assigned country.


{10: {'GB': 2,
  'IS': 1,
  'ID': 1,
  'FI': 1,
  'DK': 1,
  'EE': 1,
  'AE': 1,
  'HK': 1,
  'NZ': 1},
 20: {'IN': 4,
  'GB': 3,
  'AE': 2,
  'ID': 2,
  'NZ': 2,
  'IS': 1,
  'FI': 1,
  'DK': 1,
  'HK': 1,
  'EE': 1,
  'EG': 1,
  'PH': 1},
 50: {'IN': 12,
  'GB': 5,
  'AE': 4,
  'NL': 4,
  'IL': 2,
  'IT': 2,
  'ID': 2,
  'EE': 2,
  'EG': 2,
  'NZ': 2,
  'PH': 2,
  'DK': 2,
  'ES': 2,
  'IS': 1,
  'FI': 1,
  'HK': 1,
  'CM': 1,
  'CZ': 1,
  'EC': 1,
  'FR': 1},
 100: {'IN': 24,
  'ID': 7,
  'AE': 7,
  'NL': 7,
  'GB': 6,
  'EG': 5,
  'IL': 4,
  'PH': 4,
  'IE': 4,
  'FR': 4,
  'DK': 3,
  'HK': 3,
  'ES': 3,
  'IT': 2,
  'NZ': 2,
  'EE': 2,
  'EC': 2,
  'FI': 1,
  'IS': 1,
  'CZ': 1,
  'CM': 1,
  'CO': 1,
  'KR': 1,
  'GR': 1,
  'AU': 1,
  'JP': 1,
  'SG': 1,
  'DE': 1}}

In [7]:
# Report the (rows, columns) size of the merged movie table
print(movies_df.shape)
# Show its first 100 rows; the recorded rendering elides the middle rows and
# columns with "...", so only the start and end of that slice are visible
movies_df.head(n=100)

(310812, 22)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,...,ordering,title,region,language,types,attributes,isOriginalTitle,titleId_y,country,composite_score
774848,tt0111161,movie,The Shawshank Redemption,The Shawshank Redemption,0,1994,,142,Drama,9.3,...,1,The Shawshank Redemption,,,original,,1,tt0111161,IS,871760.31
1268319,tt0468569,movie,The Dark Knight,The Dark Knight,0,2008,,152,"Action,Crime,Drama",9.0,...,1,The Dark Knight,,,original,,1,tt0468569,ID,866137.50
1568611,tt1375666,movie,Inception,Inception,0,2010,,148,"Action,Adventure,Sci-Fi",8.8,...,1,Inception,,,original,,1,tt1375666,GB,769596.16
876166,tt0137523,movie,Fight Club,Fight Club,0,1999,,139,Drama,8.8,...,1,Fight Club,,,original,,1,tt0137523,FI,701755.36
765487,tt0109830,movie,Forrest Gump,Forrest Gump,0,1994,,142,"Drama,Romance",8.8,...,1,Forrest Gump,,,original,,1,tt0109830,DK,681472.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427265,tt0066921,movie,A Clockwork Orange,A Clockwork Orange,0,1971,,136,"Crime,Sci-Fi",8.3,...,1,A Clockwork Orange,,,original,,1,tt0066921,NL,265386.71
1353298,tt1010048,movie,Slumdog Millionaire,Slumdog Millionaire,0,2008,,120,"Crime,Drama,Romance",8.0,...,1,Slumdog Millionaire,,,original,,1,tt1010048,IN,264066.20
1708298,tt1663202,movie,The Revenant,The Revenant,0,2015,,156,"Action,Adventure,Drama",8.0,...,1,The Revenant,,,original,,1,tt1663202,IN,263716.70
1450252,tt1160419,movie,Dune,Dune,0,2021,,155,"Action,Adventure,Drama",8.0,...,1,Dune,,,original,,1,tt1160419,EG,263394.50


# Task 2

In [8]:
# Total votes per country computed from movies_df. The recorded output is a
# Series named `numVotes`, indexed by `country`, listed from largest to smallest.
# NOTE(review): totals inherit whatever country assignment movies_df carries - confirm it first.
total_votes_by_country(movies_df)

country
IN    206215661
AU     94268154
GB     84405730
ID     58336390
NL     48650006
        ...    
BW           10
SC            9
GM            8
KM            8
LY            8
Name: numVotes, Length: 217, dtype: int64