# Imports

In [1]:
import pandas as pd
import os
from functions.utilities import load_data, clean_data
from functions.task1_functions import quality_of_movies_by_country
from functions.task2_functions import total_votes_by_country, average_composite_score_by_country, weighted_average_composite_score_by_country, filter_countries_with_reference, get_countries_and_clean_orders, calculate_gdp_per_population, rename_and_add_rank, compute_hegemony
from functions.task3_functions import prepare_movies_directors, rank_directors, custom_ranking

# Data loading

In [2]:
# Directory holding the raw IMDb TSV dumps, plus an output directory name.
data_dir = 'data_imdb'
output_dir = 'cleaned_data'  # NOTE(review): not referenced by any other visible cell - confirm it is still needed

# Load each IMDb table and pass it through clean_data, strictly in the order
# basics -> akas -> ratings (the cell output shows one "Loading data from"
# line per file).
basics, akas, ratings = (
    clean_data(load_data(os.path.join(data_dir, tsv_name)))
    for tsv_name in ('title.basics.tsv', 'title.akas.tsv', 'title.ratings.tsv')
)

Loading data from: data_imdb\title.basics.tsv ...
Loading data from: data_imdb\title.akas.tsv ...
Loading data from: data_imdb\title.ratings.tsv ...


In [3]:
# Preview the first rows of the cleaned title.basics table
# (one row per title: type, titles, years, runtime, genres).
print("Sample data from basics:")
basics.head()

Sample data from basics:


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [4]:
# Preview the first rows of the cleaned title.akas table
# (alternative titles per titleId, with region and language).
print("Sample data from akas:")
akas.head()

Sample data from akas:


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita,,,original,,1
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita,US,,imdbDisplay,,0
3,tt0000001,4,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
4,tt0000001,5,Καρμενσίτα,GR,,imdbDisplay,,0


In [5]:
# Preview the first rows of the cleaned title.ratings table
# (averageRating and numVotes per tconst).
print("Sample data from ratings:")
ratings.head()

Sample data from ratings:


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2058
1,tt0000002,5.7,276
2,tt0000003,6.5,2022
3,tt0000004,5.4,179
4,tt0000005,6.2,2787


# Task 1 - Quality of movies by country

In [6]:
# Cut-offs for the "top N" lists: the result below is keyed by 10, 20, 50 and 100.
top_orders = [10, 20, 50, 100]

# Analyze the quality of movies by country
# Returns the per-cut-off country counts and the merged movie table that the
# later tasks reuse.
# NOTE(review): in the movies_df preview further down, well-known US
# productions carry countries such as IS, ID, FI or EE while their `region`
# column is empty - the country assignment inside
# quality_of_movies_by_country looks suspect; confirm before trusting the
# per-country results.
country_counts, movies_df = quality_of_movies_by_country(basics, ratings, akas, top_orders)

# Display country appearances in top N sequences
country_counts

There are 665 movies without an assigned country.


{10: {'GB': 2,
  'IS': 1,
  'ID': 1,
  'FI': 1,
  'DK': 1,
  'EE': 1,
  'AE': 1,
  'HK': 1,
  'NZ': 1},
 20: {'IN': 4,
  'GB': 3,
  'AE': 2,
  'ID': 2,
  'NZ': 2,
  'IS': 1,
  'FI': 1,
  'DK': 1,
  'HK': 1,
  'EE': 1,
  'EG': 1,
  'PH': 1},
 50: {'IN': 12,
  'GB': 5,
  'AE': 4,
  'NL': 4,
  'IL': 2,
  'IT': 2,
  'ID': 2,
  'EE': 2,
  'EG': 2,
  'NZ': 2,
  'PH': 2,
  'DK': 2,
  'ES': 2,
  'IS': 1,
  'FI': 1,
  'HK': 1,
  'CM': 1,
  'CZ': 1,
  'EC': 1,
  'FR': 1},
 100: {'IN': 24,
  'ID': 7,
  'AE': 7,
  'NL': 7,
  'GB': 6,
  'EG': 5,
  'IL': 4,
  'PH': 4,
  'IE': 4,
  'FR': 4,
  'DK': 3,
  'HK': 3,
  'ES': 3,
  'IT': 2,
  'NZ': 2,
  'EE': 2,
  'EC': 2,
  'FI': 1,
  'IS': 1,
  'CZ': 1,
  'CM': 1,
  'CO': 1,
  'KR': 1,
  'GR': 1,
  'AU': 1,
  'JP': 1,
  'SG': 1,
  'DE': 1}}

In [7]:
# Size of the merged movie table, then its first 20 rows (in the captured
# output the rows appear in descending composite_score order).
print(movies_df.shape)
movies_df.head(20)

(310812, 22)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,...,ordering,title,region,language,types,attributes,isOriginalTitle,titleId_y,country,composite_score
774848,tt0111161,movie,The Shawshank Redemption,The Shawshank Redemption,0,1994,,142,Drama,9.3,...,1,The Shawshank Redemption,,,original,,1,tt0111161,IS,871760.31
1268319,tt0468569,movie,The Dark Knight,The Dark Knight,0,2008,,152,"Action,Crime,Drama",9.0,...,1,The Dark Knight,,,original,,1,tt0468569,ID,866137.5
1568611,tt1375666,movie,Inception,Inception,0,2010,,148,"Action,Adventure,Sci-Fi",8.8,...,1,Inception,,,original,,1,tt1375666,GB,769596.16
876166,tt0137523,movie,Fight Club,Fight Club,0,1999,,139,Drama,8.8,...,1,Fight Club,,,original,,1,tt0137523,FI,701755.36
765487,tt0109830,movie,Forrest Gump,Forrest Gump,0,1994,,142,"Drama,Romance",8.8,...,1,Forrest Gump,,,original,,1,tt0109830,DK,681472.06
772965,tt0110912,movie,Pulp Fiction,Pulp Fiction,0,1994,,154,"Crime,Drama",8.9,...,1,Pulp Fiction,,,original,,1,tt0110912,EE,670144.43
1310657,tt0816692,movie,Interstellar,Interstellar,0,2014,,169,"Adventure,Drama,Sci-Fi",8.7,...,1,Interstellar,,,original,,1,tt0816692,GB,635870.19
868149,tt0133093,movie,The Matrix,The Matrix,0,1999,,136,"Action,Sci-Fi",8.7,...,1,The Matrix,,,original,,1,tt0133093,AE,619520.79
444363,tt0068646,movie,The Godfather,The Godfather,0,1972,,175,"Crime,Drama",9.2,...,1,The Godfather,,,original,,1,tt0068646,HK,607480.04
838707,tt0120737,movie,The Lord of the Rings: The Fellowship of the Ring,The Lord of the Rings: The Fellowship of the Ring,0,2001,,178,"Action,Adventure,Drama",8.9,...,1,The Lord of the Rings: The Fellowship of the Ring,,,original,,1,tt0120737,NZ,605482.43


# Task 2 - "Cinematic Impact" Hegemony

In [8]:
# Build the three per-country impact tables from the merged movie frame.
# The helpers run in this order, one result per target name:
#   votes_df          - weak cinematic impact
#   avg_score_df      - strong cinematic impact
#   avg_wgt_score_df  - strong cinematic impact v2
votes_df, avg_score_df, avg_wgt_score_df = (
    impact_metric(movies_df)
    for impact_metric in (
        total_votes_by_country,
        average_composite_score_by_country,
        weighted_average_composite_score_by_country,
    )
)

In [9]:
# Country reference table; its 'alpha-2' and 'alpha-3' columns are used below
# to match the country codes found in the IMDb and World Bank data.
# keep_default_na=False together with na_values=[''] stops pandas from
# reading the literal alpha-2 code "NA" (Namibia) as a missing value, while
# empty cells are still parsed as NaN.
# TODO: document where country_codes_all.csv was obtained from.
country_codes_df = pd.read_csv(
    os.path.join('data_gdp_population', 'country_codes_all.csv'),
    keep_default_na=False,
    na_values=[''],
)

In [10]:
# Arguments shared by all three lookups: the reference table, then
# (presumably) the code column of the impact tables and the matching alpha-2
# column of the reference table.
code_lookup = (country_codes_df, 'country', 'alpha-2')

# Run every impact table through the country lookup. Only the first call's
# list of unmatched codes is kept for the report printed below.
votes_df, excluded_countries = get_countries_and_clean_orders(
    votes_df, *code_lookup, ['name', 'number of votes'])

avg_score_df, _ = get_countries_and_clean_orders(
    avg_score_df, *code_lookup, ['name', 'average composite score'])

avg_wgt_score_df, _ = get_countries_and_clean_orders(
    avg_wgt_score_df, *code_lookup, ['name', 'weighted average composite score'])

print(f"Country codes excluded from further analysis:\n{excluded_countries}")

Country codes excluded from further analysis:
['XWW', 'XWG', 'CSHH', 'XYU', 'DDDE', 'XEU', 'SUHH', 'YUCS', 'CSXX', 'XAS', 'XKV', 'AN', 'XKO', 'BUMM', 'VDVN', 'ZRCD', 'XSI', 'XPI']


In [11]:
# Show the per-country vote totals (columns 'name' and 'number of votes').
votes_df

Unnamed: 0,name,number of votes
0,India,206215661
1,Australia,94268154
2,United Kingdom of Great Britain and Northern I...,84405730
3,Indonesia,58336390
4,"Netherlands, Kingdom of the",48650006
...,...,...
212,Botswana,10
213,Seychelles,9
214,Gambia,8
215,Comoros,8


In [12]:
# Show the per-country weighted average composite score
# (columns 'name' and 'weighted average composite score').
avg_wgt_score_df

Unnamed: 0,name,weighted average composite score
0,Iceland,806889.141677
1,Estonia,527363.572450
2,Cameroon,382709.202256
3,New Zealand,241750.755855
4,Denmark,225768.037646
...,...,...
212,Solomon Islands,6.034706
213,Equatorial Guinea,5.813333
214,Comoros,5.410000
215,Seychelles,5.220000


In [13]:
# World Bank indicator exports:
#   GDP (NY.GDP.MKTP.CD):
#     https://data.worldbank.org/indicator/NY.GDP.MKTP.CD?end=2023&name_desc=false&skipRedirection=true&start=1960&view=chart
#   Population (SP.POP.TOTL): TODO add the source link
# A dedicated variable keeps the IMDb `data_dir` setting from being rebound
# to a different directory halfway through the notebook.
gdp_pop_data_dir = 'data_gdp_population'

# Load the data. header=2 is forwarded to load_data (presumably to skip the
# metadata lines at the top of the World Bank CSV export - confirm).
gdp_df = clean_data(load_data(os.path.join(gdp_pop_data_dir, 'API_NY.GDP.MKTP.CD_DS2_en_csv_v2_580250.csv'), header=2))
pop_df = clean_data(load_data(os.path.join(gdp_pop_data_dir, 'API_SP.POP.TOTL_DS2_en_csv_v2_580248.csv'), header=2))

Loading data from: data_gdp_population\API_NY.GDP.MKTP.CD_DS2_en_csv_v2_580250.csv ...
Loading data from: data_gdp_population\API_SP.POP.TOTL_DS2_en_csv_v2_580248.csv ...


In [14]:
# Restrict the GDP table to rows whose 'Country Code' matches an 'alpha-3'
# code in the reference table (presumably this drops the World Bank
# aggregate rows - confirm).
# NOTE(review): gdp_df is overwritten with its own filtered result, and the
# displayed result only has 'Country Name' and '2023' left, so re-running
# this cell on its own will likely fail.
gdp_df = filter_countries_with_reference(gdp_df, 'Country Code', country_codes_df, 'alpha-3')
gdp_df

Unnamed: 0,Country Name,2023
251,United States,2.736094e+13
40,China,1.779478e+13
55,Germany,4.456081e+12
119,Japan,4.212945e+12
109,India,3.549919e+12
...,...,...
239,Tonga,
254,"Venezuela, RB",
255,British Virgin Islands,
256,Virgin Islands (U.S.),


In [15]:
# Restrict the population table to rows whose 'Country Code' matches an
# 'alpha-3' code in the reference table.
# NOTE(review): pop_df is overwritten with its own filtered result, and the
# displayed result only has 'Country Name' and '2023' left, so re-running
# this cell on its own will likely fail.
pop_df = filter_countries_with_reference(pop_df, 'Country Code', country_codes_df, 'alpha-3')
pop_df

Unnamed: 0,Country Name,2023
109,India,1.428628e+09
40,China,1.410710e+09
251,United States,3.349149e+08
106,Indonesia,2.775341e+08
184,Pakistan,2.404857e+08
...,...,...
147,St. Martin (French part),3.207700e+04
255,British Virgin Islands,3.153800e+04
188,Palau,1.805800e+04
179,Nauru,1.278000e+04


In [16]:
# Combine the filtered GDP and population tables into one
# 'gdp_per_population' value per country.
# NOTE(review): in the displayed result the country names sit in a column
# called 'index', and countries without a GDP value show up as NaN rows at
# the bottom - confirm both are intended.
gdp_pop_df = calculate_gdp_per_population(gdp_df, pop_df)
gdp_pop_df

Unnamed: 0,index,gdp_per_population
70,Luxembourg,128259.402583
24,Ireland,103684.880802
19,Switzerland,99994.938020
30,Norway,87961.780614
29,Singapore,84734.255921
...,...,...
210,Tonga,
211,"Venezuela, RB",
212,British Virgin Islands,
213,Virgin Islands (U.S.),


In [17]:
# Give every table the same two-column labelling - 'country' plus one value
# label - and let rename_and_add_rank attach its rank (presumably the rank
# that compute_hegemony compares - confirm).
# Tables are processed top to bottom in the order listed.
(
    votes_df,
    avg_score_df,
    avg_wgt_score_df,
    gdp_df,
    pop_df,
    gdp_pop_df,
) = (
    rename_and_add_rank(table, ['country', value_label])
    for table, value_label in (
        (votes_df, 'number of votes'),
        (avg_score_df, 'average score'),
        (avg_wgt_score_df, 'weighted average score'),
        (gdp_df, 'gdp'),
        (pop_df, 'population'),
        (gdp_pop_df, 'gdp/population'),
    )
)

In [18]:
# Compare each economic ranking with its cinematic counterpart:
#   GDP                vs. average composite score
#   population         vs. total votes
#   GDP per population vs. weighted average composite score
# The last two arguments are short labels; the first pair appears in the
# printed heading as "Gdp / Avg_score".
# NOTE(review): the printed list starts at "Hegemony Score: 0" and counts
# upward - presumably the score is a rank difference, so position 1 would be
# the closest match between the two rankings, not the largest gap; confirm.
gdp_score_hegemony = compute_hegemony(gdp_df, avg_score_df, 'gdp', 'avg_score')
pop_votes_hegemony = compute_hegemony(pop_df, votes_df, 'pop', 'votes')
gdp_pop_wgt_score_hegemony = compute_hegemony(gdp_pop_df, avg_wgt_score_df, 'gdp_pop', 'wgt_score')



Gdp / Avg_score Hegemony Rankings:
1. Mongolia (Hegemony Score: 0)
2. Slovenia (Hegemony Score: 0)
3. Thailand (Hegemony Score: 1)
4. Albania (Hegemony Score: 1)
5. Marshall Islands (Hegemony Score: 1)
6. Mali (Hegemony Score: 2)
7. Grenada (Hegemony Score: 2)
8. Portugal (Hegemony Score: 3)
9. Myanmar (Hegemony Score: 3)
10. Bermuda (Hegemony Score: 3)
11. Sao Tome and Principe (Hegemony Score: 4)
12. Eritrea (Hegemony Score: 4)
13. Sweden (Hegemony Score: 4)
14. Cyprus (Hegemony Score: 5)
15. Antigua and Barbuda (Hegemony Score: 5)
16. Nepal (Hegemony Score: 6)
17. Azerbaijan (Hegemony Score: 6)
18. Norway (Hegemony Score: 6)
19. Belize (Hegemony Score: 7)
20. American Samoa (Hegemony Score: 7)
21. Belarus (Hegemony Score: 8)
22. Nicaragua (Hegemony Score: 9)
23. Romania (Hegemony Score: 10)
24. Guam (Hegemony Score: 10)
25. Armenia (Hegemony Score: 10)
26. Denmark (Hegemony Score: 10)
27. Bulgaria (Hegemony Score: 11)
28. Burkina Faso (Hegemony Score: 11)
29. Cabo Verde (Hegemony S

# Task 3 - Focus on Directors

In [19]:
# IMDb dump directory for the crew and name tables.
data_dir = 'data_imdb'

# Load and clean title.crew first, then name.basics.
crew, names = (
    clean_data(load_data(os.path.join(data_dir, tsv_name)))
    for tsv_name in ('title.crew.tsv', 'name.basics.tsv')
)

Loading data from: data_imdb\title.crew.tsv ...
Loading data from: data_imdb\name.basics.tsv ...


In [20]:
# Join the crew and name tables onto the scored movie table; the displayed
# result carries director columns (directors, nconst, primaryName, ...)
# alongside the movie columns and composite_score.
movies_directors_df = prepare_movies_directors(crew, names, movies_df)
movies_directors_df

Unnamed: 0,tconst,directors,writers,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,titleType,...,ordering,title,region,language,types,attributes,isOriginalTitle,titleId_y,country,composite_score
0,tt0111161,nm0001104,"nm0000175,nm0001104",nm0001104,Frank Darabont,1959,,"writer,producer,director","tt0120689,tt0111161,tt0884328,tt1520211",movie,...,1,The Shawshank Redemption,,,original,,1,tt0111161,IS,871760.31
1,tt0468569,nm0634240,"nm0634300,nm0634240,nm0275286,nm0004170",nm0634240,Christopher Nolan,1970,,"writer,producer,director","tt6723592,tt0816692,tt1375666,tt0482571",movie,...,1,The Dark Knight,,,original,,1,tt0468569,ID,866137.50
2,tt1375666,nm0634240,nm0634240,nm0634240,Christopher Nolan,1970,,"writer,producer,director","tt6723592,tt0816692,tt1375666,tt0482571",movie,...,1,Inception,,,original,,1,tt1375666,GB,769596.16
3,tt0137523,nm0000399,"nm0657333,nm0880243",nm0000399,David Fincher,1962,,"producer,director,writer","tt0114369,tt2267998,tt0443706,tt1285016",movie,...,1,Fight Club,,,original,,1,tt0137523,FI,701755.36
4,tt0109830,nm0000709,"nm0343165,nm0744839",nm0000709,Robert Zemeckis,1952,,"producer,writer,director","tt0088763,tt0109830,tt0118884,tt1907668",movie,...,1,Forrest Gump,,,original,,1,tt0109830,DK,681472.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310807,tt31156932,nm15794661,nm15794661,nm15794661,Mohammad Hossein Haghighat,,,"director,writer",tt31156932,movie,...,1,Parvaze 175,,,original,,1,tt31156932,IR,2.50
310808,tt30798708,nm15690913,nm15690913,nm15690913,Mohammad Mehdi Choopani,,,"director,writer",tt30798708,movie,...,1,Arezoohayat Ra Be Khak Naspar,,,original,,1,tt30798708,IR,2.20
310809,tt27833215,nm14534816,nm14534816,nm14534816,Rein Alpajora,2009,,"actor,writer,editorial_department","tt26629920,tt26660659,tt31797757,tt27997441",movie,...,1,The Doll: Creation,,,original,,1,tt27833215,PH,2.20
310810,tt30625407,nm14782664,,nm14782664,Seyyed Amin Kazemzadeh,,,"director,actor","tt30625407,tt32167147,tt27580522",movie,...,1,Borde Sagi,,,original,,1,tt30625407,IR,2.20


In [35]:
# Rank directors by the mean composite_score of their movies.
# NOTE(review): grouping on 'primaryName' may merge different people who
# share a name; the frame also has 'nconst' - confirm which key is intended.
rank_directors(movies_directors_df, 'primaryName', 'composite_score', 'mean')

Unnamed: 0,primaryName,aggregated_score,total_movies,rank
0,Christopher Nolan,407315.641667,12,1.0
1,Frank Darabont,354156.782500,4,2.0
2,John Lasseter,322166.810000,1,3.0
3,Quentin Tarantino,271842.562500,12,4.0
4,Lee Unkrich,268908.110000,1,5.0
...,...,...,...,...
104506,Mohammad Hossein Haghighat,2.500000,1,104507.0
104507,Abdolreza Nematollahi,2.500000,1,104507.0
104508,Mohammad Mehdi Choopani,2.200000,1,104509.0
104509,Rein Alpajora,2.200000,1,104509.0


In [36]:
# Rank directors by the summed composite_score of their movies.
# NOTE(review): grouping on 'primaryName' may merge different people who
# share a name; the frame also has 'nconst' - confirm which key is intended.
rank_directors(movies_directors_df, 'primaryName', 'composite_score', 'sum')

Unnamed: 0,primaryName,aggregated_score,total_movies,rank
0,Christopher Nolan,4887787.70,12,1.0
1,Steven Spielberg,4371850.41,34,2.0
2,Quentin Tarantino,3262110.75,12,3.0
3,Martin Scorsese,3193269.69,36,4.0
4,David Fincher,2731575.10,12,5.0
...,...,...,...,...
104506,Abdolreza Nematollahi,2.50,1,104507.0
104507,Mohammad Hossein Haghighat,2.50,1,104507.0
104508,Mohammad Mehdi Choopani,2.20,1,104509.0
104509,Seyyed Amin Kazemzadeh,2.20,1,104509.0


In [59]:
# Rank directors with the custom score built from good/bad movie thresholds.
# NOTE(review): good_threshold=7.0 and bad_threshold=3.0 look like values on
# the 0-10 averageRating scale, while the score column passed here is
# 'composite_score' (values up to ~870,000 in this data) - confirm the
# thresholds are applied to the intended column.
custom_ranking(movies_directors_df, 'primaryName', 'composite_score', good_threshold=7.0, bad_threshold=3.0)

Unnamed: 0,primaryName,custom_score,total_movies,rank
0,Christopher Nolan,63.525089,12,1.0
1,Frank Darabont,20.122436,4,2.0
2,Quentin Tarantino,18.051022,12,3.0
3,David Fincher,9.358073,12,4.0
4,John Lasseter,1.391147,1,5.0
...,...,...,...,...
104506,Gilberto Martínez Solares,-419.969170,140,104507.0
104507,William Beaudine,-422.856006,141,104508.0
104508,Michael Curtiz,-461.841225,155,104509.0
104509,Jesús Franco,-464.695353,155,104510.0
