In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the IMDb "title basics" table (tab-separated).
df = pd.read_csv("./imdb_data/title_basics/data.tsv", sep="\t")
print("overall: ", df.shape[0])

# Keep only titles of type "movie"; everything else (e.g. TV shows) is discarded.
df = df.loc[df["titleType"].eq("movie")]
print("Number of movies:\t", df.shape[0])

# Load the IMDb "title ratings" table (tab-separated).
ratings = pd.read_csv("./imdb_data/title_ratings/data.tsv", sep="\t")

# Cast the join key to str on both sides before merging.
df["tconst"] = df["tconst"].astype(str)
ratings["tconst"] = ratings["tconst"].astype(str)

# Inner join on tconst: movies without a ratings row are dropped.
df = pd.merge(df, ratings, how="inner", on="tconst")
print("Number of movies with rating:\t", df.shape[0])

# The ratings table is no longer needed; release it.
del ratings

# Keep only movies with at least 100 votes.
df = df.loc[df["numVotes"].ge(100)]
print("Number of movies at least 100 ratings:\t", len(df.index))

# Read in our scraped data
df_scrape = pd.read_csv("./tconst_scraped_data.csv")
print("Number scraped movies:\t", len(df_scrape))

## Hard Coding

# Change movie id "tt11905872" in the basic DF to the new id "tt4131756".
# These are the same movie; the basic IMDb data set still has the old tconst.
df["tconst"] = df["tconst"].replace(["tt11905872"], "tt4131756")

# Drop movie "tt7368158" because it is no longer available (404 error).
df = df[df["tconst"] != "tt7368158"]
print("Number of movies at least 100 ratings after dropping:\t",df.shape[0])

# Movie "tt1027755" does not have a start year but was released in 2012.
# Select the row by its id and the column by name instead of by hard-coded
# positions, so the patch cannot land on a different movie/column when the
# underlying data set (and therefore the row order) changes.
df.loc[df["tconst"] == "tt1027755", "startYear"] = "2012"


# Cast the join key to str on both frames so the merge compares like with like.
df["tconst"] = df["tconst"].astype(str)
df_scrape["tconst"] = df_scrape["tconst"].astype(str)

# Inner join of the movie frame with the scraped content on tconst.
df = pd.merge(df, df_scrape, how="inner", on="tconst")
print("Number of movies after merge:\t", len(df.index))

# The scraped frame is no longer needed; release it.
del df_scrape

# Load the IMDb "title crew" table (tab-separated).
crew = pd.read_csv("./imdb_data/title_crew/data.tsv", sep="\t")

# Cast the join key to str on both sides before merging.
df["tconst"] = df["tconst"].astype(str)
crew["tconst"] = crew["tconst"].astype(str)

# Inner join of movies and crew on tconst (movies without a crew row are dropped).
df = pd.merge(df, crew, how="inner", on="tconst")
print("Number of movies after crew merge:\t", df.shape[0])

# The crew table is no longer needed; release it.
del crew

# Order the final frame by tconst.
df = df.sort_values(by="tconst")


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


overall:  8598896
Number of movies:	 598851
Number of movies with rating:	 273557
Number of movies at least 100 ratings:	 110156
Number scraped movies:	 110155


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Number of movies at least 100 ratings after dropping:	 110155
Number of movies after merge:	 110155
Number of movies after crew merge:	 110155


In [66]:
import re
from collections import Counter

# Raw budget strings, e.g. "$175,000,000 (estimated)"; missing values are NaN.
budget = df["Budget"].to_numpy()

# Leading run of characters that are neither digits, commas nor non-breaking
# spaces (\xa0) -- i.e. the currency symbol/code in front of the amount.
# (The previous pattern also had literal "{" and "}" inside the character
# class, which excluded braces rather than grouping anything.)
currency_prefix = re.compile(r'[^\d,\xa0]+')

# Currencies counted in `counter`.
major_currencies = ["$", "£", "€", "CA$", "₹", "A$"]

currencies = []
for entry in budget:
    # Skip movies without budget information.
    if pd.isna(entry):
        continue

    found = currency_prefix.match(entry)
    # Skip entries that do not start with a currency prefix instead of
    # failing with an IndexError or recording unrelated trailing text.
    if found is None:
        continue

    currencies.append(found.group())

# Number of budget entries given in one of the major currencies.
currency_counts = Counter(currencies)
counter = sum(currency_counts[symbol] for symbol in major_currencies)

print(currency_counts)
print(counter)

# data for ["$", "€", "CA$", "£", "A$", ]

    

Counter({'$': 19112, '€': 3145, '₹': 1198, '£': 955, 'CA$': 787, 'A$': 320, 'SEK': 186, 'NOK': 162, 'R$': 149, 'FRF': 144, 'DEM': 139, 'FIM': 122, 'DKK': 122, 'RUR': 122, 'EGP': 96, 'IRR': 84, 'CN¥': 81, 'TRL': 81, 'HUF': 81, 'PLN': 76, '¥': 59, 'HK$': 51, 'CZK': 45, '₩': 44, 'ITL': 42, 'MX$': 42, 'ESP': 41, 'NZ$': 40, 'BDT': 37, 'MYR': 37, 'NLG': 36, 'CHF': 33, 'SGD': 33, 'IDR': 33, 'THB': 29, 'ARS': 24, 'ZAR': 22, 'ROL': 21, 'UAH': 20, 'ISK': 19, 'NT$': 19, 'PKR': 18, 'PTE': 15, 'DOP': 14, '₱': 13, '₪': 11, 'HRK': 8, 'AZM': 8, 'BEF': 7, 'NPR': 7, 'BGL': 7, 'LKR': 7, 'EEK': 6, 'RON': 6, 'COP': 5, '₫': 5, 'CLP': 4, 'VEB': 4, 'LVL': 4, 'NGN': 4, 'GEL': 4, 'SIT': 3, 'YUM': 3, 'LTL': 3, 'MNT': 3, 'ALL': 2, 'SAR': 2, 'AMD': 2, 'AED': 2, 'XAU': 1, 'PYG': 1, 'IEP': 1, 'JMD': 1, 'ATS': 1, 'BND': 1, 'GRD': 1, 'SKK': 1, 'MTL': 1, 'IQD': 1, 'TTD': 1, 'JOD': 1, 'ZWD': 1, 'MVR': 1})
25517


In [61]:
from datetime import date  # datetime works too

from currency_converter import CurrencyConverter

# Converter that falls back to another date when no rate exists for the requested one.
c = CurrencyConverter(fallback_on_wrong_date=True)

# Convert 100 USD at the rate of 1 March 1999; EUR is the library's default target.
c.convert(amount=100, currency='USD', new_currency='EUR', date=date(1999, 3, 1))


91.02494083378846

In [37]:
# Sorted array of the distinct currency prefixes collected from the budgets.
unique_currencies = np.unique(currencies)
unique_currencies

array(['$', 'A$', 'AED', 'ALL', 'AMD', 'ARS', 'ATS', 'AZM', 'BDT', 'BEF',
       'BGL', 'BND', 'CA$', 'CHF', 'CLP', 'CN¥', 'COP', 'CZK', 'DEM',
       'DKK', 'DOP', 'EEK', 'EGP', 'ESP', 'FIM', 'FRF', 'GEL', 'GRD',
       'HK$', 'HRK', 'HUF', 'IDR', 'IEP', 'IQD', 'IRR', 'ISK', 'ITL',
       'JMD', 'JOD', 'LKR', 'LTL', 'LVL', 'MNT', 'MTL', 'MVR', 'MX$',
       'MYR', 'NGN', 'NLG', 'NOK', 'NPR', 'NT$', 'NZ$', 'PKR', 'PLN',
       'PTE', 'PYG', 'R$', 'ROL', 'RON', 'RUR', 'SAR', 'SEK', 'SGD',
       'SIT', 'SKK', 'THB', 'TRL', 'TTD', 'UAH', 'VEB', 'XAU', 'YUM',
       'ZAR', 'ZWD', '£', '¥', '₩', '₪', '₫', '€', '₱', '₹'], dtype='<U3')

In [17]:
# Render every row whose primaryTitle is exactly "Up" as a full HTML table.
# NOTE(review): neither `df_budget` nor `HTML` is defined in the cells visible
# in this notebook -- confirm where they come from before a fresh-kernel run.
up_rows = df_budget.loc[df_budget["primaryTitle"] == "Up"]
display(HTML(up_rows.to_html()))

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,Budget,Gross US & Canada,Opening weekend US & Canada,Gross worldwide,Rating,User reviews,Critic reviews,directors,writers
62903,tt1049413,movie,Up,Up,0,2009,\N,96,"Adventure,Animation,Comedy",8.2,993395,"$175,000,000 (estimated)","$293,004,164","$68,108,790","$735,099,102",PG,1K,408.0,"nm0230032,nm0677037","nm0230032,nm0677037,nm0565336"


In [33]:
# Debug check: print the non-breaking space (U+00A0), the same character the
# budget regex excludes from currency prefixes. It renders as a blank line.
print("\xa0")

 


In [4]:
# ours 797_615_189.873417
# infl 797_701_090.78

In [4]:
# Write the merged movie frame to disk without the index column.
output_path = "raw_data.csv"
df.to_csv(output_path, index=False)

In [5]:
# Show the `currencies` attribute of the CurrencyConverter instance `c`
# (presumably the supported currency codes -- confirm against the library docs).
# NOTE(review): `c` must already exist in the kernel; the recorded output of this
# cell is a NameError, so it was run in a session where `c` had not been created.
c.currencies

NameError: name 'c' is not defined