In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

from dvc.api import make_checkpoint

# Display full outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings("ignore")

### Data Transformation

In [19]:
# Read film data
df_film = spark.read.parquet("/project/Individual/parquet_files/origin.parquet").toPandas()
make_checkpoint()

In [20]:
# Order and Index films by place
df_film = df_film.sort_values(by='place')
df_film = df_film.reset_index(drop=True)
make_checkpoint()

In [21]:
# A glance at the duration column
df_film['duration']

0      2 hr 22 min
1      2 hr 55 min
2      2 hr 32 min
3      3 hr 22 min
4      1 hr 36 min
          ...     
245    1 hr 30 min
246    2 hr 26 min
247    1 hr 24 min
248    1 hr 58 min
249     3 hr 1 min
Name: duration, Length: 250, dtype: object

In [22]:
# Change the format of unique running times
# Such as less than 1 hour (no 'hr') or exactly whole hours (no 'min')
less_1hr = df_film[df_film['duration'].str.contains('hr')== False]
less_1hr

Unnamed: 0,place,title,year,director,cast,rating,genre,duration,domestic_k,international_k,worldwide_k,distributor
189,190,Sherlock Jr,1924,Buster Keaton,"Buster Keaton, Kathryn McGuire",8.0703,Action,45 min,0.0,0.0,0.0,0


In [23]:
whole_hrs = df_film[df_film['duration'].str.contains('min')== False]
whole_hrs

Unnamed: 0,place,title,year,director,cast,rating,genre,duration,domestic_k,international_k,worldwide_k,distributor
70,71,Oldeuboi,2003,Park Chan-wook,"Choi Min-sik, Yoo Ji-Tae",8.3201,Action,2 hr,707.481,14487.112,15194.593,Tartan
132,133,The Wolf of Wall Street,2013,Martin Scorsese,"Leonardo DiCaprio, Jonah Hill",8.1702,Biography,3 hr,116900.694,275100.0,392000.694,Paramount Pictures
201,202,Mad Max: Fury Road,2015,George Miller,"Tom Hardy, Charlize Theron",8.0611,Action,2 hr,154109.06,221600.41,375709.47,Warner Bros.
209,210,Platoon,1986,Oliver Stone,"Charlie Sheen, Tom Berenger",8.0478,Drama,2 hr,138530.565,15.067,138545.632,Orion Pictures
210,211,Rocky,1976,John G. Avildsen,"Sylvester Stallone, Talia Shire",8.0471,Drama,2 hr,117235.147,15.255,117250.402,United Artists


In [24]:
# Mannually fix the format
df_film.at[189, 'duration'] = '0 hr 45 min'
df_film.at[70, 'duration'] = '2 hr 0 min'
df_film.at[132, 'duration'] = '3 hr 0 min'
df_film.at[201, 'duration'] = '2 hr 0 min'
df_film.at[209, 'duration'] = '2 hr 0 min'
df_film.at[210, 'duration'] = '2 hr 0 min'
make_checkpoint()

In [25]:
# Clean the duration column for transformation
df_film['duration'] = df_film['duration'].str.replace(' ', '')
df_film['duration'] = df_film['duration'].str.replace('min', '')
df_film['duration'] = df_film['duration'].str.replace('hr', ':')
make_checkpoint()

In [26]:
df_film['duration']

0      2:22
1      2:55
2      2:32
3      3:22
4      1:36
       ... 
245    1:30
246    2:26
247    1:24
248    1:58
249     3:1
Name: duration, Length: 250, dtype: object

In [27]:
# Convert duration into minutes
df_film['duration'] = df_film['duration'].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))
make_checkpoint()

In [28]:
# Calculate the percentage of domestic and international box office income
df_film['dom_pct'] = df_film['domestic_k']/df_film['worldwide_k']
df_film['int_pct'] = df_film['international_k']/df_film['worldwide_k']
df_film['dom_pct'] = df_film['dom_pct'].round(decimals = 4)
df_film['int_pct'] = df_film['int_pct'].round(decimals = 4)
make_checkpoint()

In [29]:
# Check the realistic feasibility
df_film['check'] = df_film['dom_pct'] + df_film['int_pct']
df_film['check'] = df_film['check'].fillna(0)
make_checkpoint()

# Find the rows that do not make sense in real life
df_film.loc[(df_film['check'] != 1.0000) & (df_film['check'] != 0)]

Unnamed: 0,place,title,year,director,cast,rating,genre,duration,domestic_k,international_k,worldwide_k,distributor,dom_pct,int_pct,check
14,15,The Empire Strikes Back,1980,Irvin Kershner,"Mark Hamill, Harrison Ford",8.7019,Action,124,292753.96,190685.234,538375.067,Twentieth Century Fox,0.5438,0.3542,0.898
26,27,Star Wars,1977,George Lucas,"Mark Hamill, Harrison Ford",8.5551,Action,121,460998.507,195751.992,775398.007,Twentieth Century Fox,0.5945,0.2525,0.847
28,29,Terminator 2: Judgment Day,1991,James Cameron,"Arnold Schwarzenegger, Linda Hamilton",8.5345,Action,137,205881.154,312106.698,520881.154,TriStar Pictures,0.3953,0.5992,0.9945
38,39,The Usual Suspects,1995,Bryan Singer,"Kevin Spacey, Gabriel Byrne",8.478,Crime,106,23341.568,23341.568,645.363,Gramercy Pictures (I),36.1681,36.1681,72.3362
47,48,Hotaru no haka,1988,Isao Takahata,"Tsutomu Tatsumi, Ayano Shiraishi",8.4461,Animation,89,516.962,516.962,158.101,Fathom Events,3.2698,3.2698,6.5396
74,75,Das Boot,1981,Wolfgang Petersen,"Jürgen Prochnow, Herbert Grönemeyer",8.3123,Drama,149,11487.676,11487.676,26.994,Columbia Pictures,425.5641,425.5641,851.1282
85,86,Star Wars: Episode VI - Return of the Jedi,1983,Richard Marquand,"Mark Hamill, Harrison Ford",8.2687,Action,131,309306.177,122009.457,475106.177,Twentieth Century Fox,0.651,0.2568,0.9078
102,103,Full Metal Jacket,1987,Stanley Kubrick,"Matthew Modine, R. Lee Ermey",8.2407,Drama,116,46357.676,46357.676,2217.307,Warner Bros.,20.9072,20.9072,41.8144
158,159,"Lock, Stock and Two Smoking Barrels",1998,Guy Ritchie,"Jason Flemyng, Dexter Fletcher",8.1244,Action,107,3753.929,3753.929,143.321,Gramercy Pictures (I),26.1925,26.1925,52.385
162,163,The Bridge on the River Kwai,1957,David Lean,"William Holden, Alec Guinness",8.1157,Adventure,161,27200.0,27200.0,3000.0,Columbia Pictures,9.0667,9.0667,18.1334


In [30]:
# Mannually fix the films that do not have international incomes
df_film.at[38, 'international_k'] = 0.000
df_film.at[38, 'worldwide_k'] = 23341.568
df_film.at[47, 'international_k'] = 0.000
df_film.at[47, 'worldwide_k'] = 516.962
df_film.at[74, 'international_k'] = 0.000
df_film.at[74, 'worldwide_k'] = 11487.676
df_film.at[102, 'international_k'] = 0.000
df_film.at[102, 'worldwide_k'] = 46357.676
df_film.at[158, 'international_k'] = 0.000
df_film.at[158, 'worldwide_k'] = 3753.929
df_film.at[162, 'international_k'] = 0.000
df_film.at[162, 'worldwide_k'] = 27200.000
df_film.at[177, 'international_k'] = 0.000
df_film.at[177, 'worldwide_k'] = 933.933
df_film.at[187, 'international_k'] = 0.000
df_film.at[187, 'worldwide_k'] = 144.738
df_film.at[204, 'international_k'] = 0.000
df_film.at[204, 'worldwide_k'] = 21.877
df_film.at[213, 'international_k'] = 0.000
df_film.at[213, 'worldwide_k'] = 52287.414
df_film.at[244, 'international_k'] = 0.000
df_film.at[244, 'worldwide_k'] = 52767.889
make_checkpoint()

In [31]:
# Find other missing box office incomes data
df_film.loc[(df_film['domestic_k'] == 0) & (df_film['international_k'] == 0) & (df_film['worldwide_k'] == 0)]

Unnamed: 0,place,title,year,director,cast,rating,genre,duration,domestic_k,international_k,worldwide_k,distributor,dom_pct,int_pct,check
4,5,12 Angry Men,1957,Sidney Lumet,"Henry Fonda, Lee J. Cobb",8.9464,Crime,96,0.0,0.0,0.0,United Artists,,,0.0
46,47,Seppuku,1962,Masaki Kobayashi,"Tatsuya Nakadai, Akira Ishihama",8.4485,Action,133,0.0,0.0,0.0,0,,,0.0
59,60,Paths of Glory,1957,Stanley Kubrick,"Kirk Douglas, Ralph Meeker",8.3712,Drama,88,0.0,0.0,0.0,United Artists,,,0.0
61,62,The Great Dictator,1940,Charles Chaplin,"Charles Chaplin, Paulette Goddard",8.3668,Comedy,125,0.0,0.0,0.0,United Artists,,,0.0
62,63,Witness for the Prosecution,1957,Billy Wilder,"Tyrone Power, Marlene Dietrich",8.3558,Crime,116,0.0,0.0,0.0,United Artists,,,0.0
91,92,Tengoku to jigoku,1963,Akira Kurosawa,"Toshirô Mifune, Yutaka Sada",8.2617,Crime,143,0.0,0.0,0.0,Janus Films,,,0.0
95,96,M - Eine Stadt sucht einen Mörder,1931,Fritz Lang,"Peter Lorre, Ellen Widmann",8.2559,Crime,117,0.0,0.0,0.0,0,,,0.0
103,104,Double Indemnity,1944,Billy Wilder,"Fred MacMurray, Barbara Stanwyck",8.2383,Crime,107,0.0,0.0,0.0,Paramount Pictures,,,0.0
108,109,The Sting,1973,George Roy Hill,"Paul Newman, Robert Redford",8.2292,Comedy,129,0.0,0.0,0.0,Universal Pictures,,,0.0
110,111,Hamilton,2020,Thomas Kail,"Lin-Manuel Miranda, Phillipa Soo",8.2275,Biography,160,0.0,0.0,0.0,Walt Disney Studios Motion Pictures,,,0.0


In [32]:
# Fill in missing values mannully
df_film.at[4, 'international_k'] = 0.955
df_film.at[4, 'worldwide_k'] = 0.955
df_film.at[46, 'international_k'] = 15.222
df_film.at[46, 'worldwide_k'] = 15.222
df_film.at[59, 'international_k'] = 5.252
df_film.at[59, 'worldwide_k'] = 5.252
df_film.at[61, 'international_k'] = 970.214
df_film.at[61, 'worldwide_k'] = 970.214
df_film.at[62, 'international_k'] = 7.693
df_film.at[62, 'worldwide_k'] = 7.693
df_film.at[91, 'domestic_k'] = 46.808
df_film.at[91, 'worldwide_k'] = 46.808
df_film.at[95, 'international_k'] = 90.556
df_film.at[95, 'worldwide_k'] = 90.556
df_film.at[103, 'international_k'] = 14.190
df_film.at[103, 'worldwide_k'] = 14.190
df_film.at[108, 'domestic_k'] = 156000.000
df_film.at[108, 'worldwide_k'] = 156000.000
df_film.at[124, 'domestic_k'] = 15000.000
df_film.at[124, 'worldwide_k'] = 15000.000
df_film.at[127, 'international_k'] = 41.960
df_film.at[127, 'worldwide_k'] = 41.960
df_film.at[128, 'international_k'] = 195.088
df_film.at[128, 'worldwide_k'] = 195.088
df_film.at[133, 'international_k'] = 12.180
df_film.at[133, 'worldwide_k'] = 12.180
df_film.at[144, 'domestic_k'] = 5014.000
df_film.at[144, 'worldwide_k'] = 5014.000
df_film.at[145, 'domestic_k'] = 46.808
df_film.at[145, 'worldwide_k'] = 46.808
df_film.at[146, 'international_k'] = 228.178
df_film.at[146, 'worldwide_k'] = 228.178
df_film.at[172, 'international_k'] = 26.916
df_film.at[172, 'worldwide_k'] = 26.916
df_film.at[181, 'international_k'] = 14.524
df_film.at[181, 'worldwide_k'] = 14.524
df_film.at[190, 'international_k'] = 1.098
df_film.at[190, 'worldwide_k'] = 1.098
df_film.at[192, 'international_k'] = 198.992
df_film.at[192, 'worldwide_k'] = 198.992
df_film.at[194, 'international_k'] = 286.085
df_film.at[194, 'worldwide_k'] = 286.085
df_film.at[202, 'international_k'] = 1740.429
df_film.at[202, 'worldwide_k'] = 1740.429
df_film.at[207, 'international_k'] = 40.468
df_film.at[207, 'worldwide_k'] = 40.468
df_film.at[229, 'international_k'] = 72.275
df_film.at[229, 'worldwide_k'] = 72.275
df_film.at[233, 'international_k'] = 46749.646
df_film.at[233, 'worldwide_k'] = 46749.646
df_film.at[234, 'international_k'] = 14.480
df_film.at[234, 'worldwide_k'] = 14.480
df_film.at[236, 'international_k'] = 18612.999
df_film.at[236, 'worldwide_k'] = 18612.999
df_film.at[237, 'domestic_k'] = 35.566
df_film.at[237, 'worldwide_k'] = 35.566

make_checkpoint()

In [33]:
# Check again
df_film['dom_pct'] = df_film['domestic_k']/df_film['worldwide_k']
df_film['int_pct'] = df_film['international_k']/df_film['worldwide_k']
df_film['dom_pct'] = df_film['dom_pct'].round(decimals = 4)
df_film['int_pct'] = df_film['int_pct'].round(decimals = 4)
df_film['check'] = df_film['dom_pct'] + df_film['int_pct']
df_film['check'] = df_film['check'].fillna(0)
make_checkpoint()
df_film['check'].value_counts()

1.0000    234
0.0000     10
0.8980      1
0.8470      1
0.8362      1
0.9531      1
0.9945      1
0.9078      1
Name: check, dtype: int64

In [34]:
# Drop the check column
df_film = df_film.drop('check', axis = 1)
make_checkpoint()

In [35]:
# Create place by genre column
df_film['place_bygenre'] = df_film.groupby('genre')['place'].rank(ascending=False)
make_checkpoint()

In [36]:
# Create duration rank column
df_film['duration_rank'] = df_film['duration'].rank(ascending=False)
make_checkpoint()

In [37]:
# Create duration by genre column
df_film['duration_rank_bygenre'] = df_film.groupby('genre')['duration'].rank(ascending=False)
make_checkpoint()

In [38]:
# Create income rank column
df_film['income_rank'] = df_film['worldwide_k'].rank(ascending=False)
make_checkpoint()

In [39]:
# Create income rank by genre column
df_film['income_rank_bygenre'] = df_film.groupby('genre')['worldwide_k'].rank(ascending=False)
make_checkpoint()

In [40]:
# Calculate the difference between rating rank (place) and income rank
df_film['rank_diff'] = df_film['income_rank'] - df_film['place']
make_checkpoint()

In [41]:
# Check for missing values
df_film.isnull().sum()

place                     0
title                     0
year                      0
director                  0
cast                      0
rating                    0
genre                     0
duration                  0
domestic_k                0
international_k           0
worldwide_k               0
distributor               0
dom_pct                  10
int_pct                  10
place_bygenre             0
duration_rank             0
duration_rank_bygenre     0
income_rank               0
income_rank_bygenre       0
rank_diff                 0
dtype: int64

In [42]:
# Fill in NaN with 0
df_film.fillna(0, inplace=True)
make_checkpoint()

In [43]:
# Check again
df_film.isnull().sum()

place                    0
title                    0
year                     0
director                 0
cast                     0
rating                   0
genre                    0
duration                 0
domestic_k               0
international_k          0
worldwide_k              0
distributor              0
dom_pct                  0
int_pct                  0
place_bygenre            0
duration_rank            0
duration_rank_bygenre    0
income_rank              0
income_rank_bygenre      0
rank_diff                0
dtype: int64

In [44]:
# Adjust the order of the columns
df_film = df_film[['title', 'year', 'director', 'cast', 'distributor', 'genre', 'rating', 'place', 'place_bygenre', 
                   'duration', 'duration_rank', 'duration_rank_bygenre', 'domestic_k', 'international_k', 'worldwide_k', 
                   'dom_pct', 'int_pct', 'income_rank', 'income_rank_bygenre', 'rank_diff']]
make_checkpoint()

In [45]:
# Final version of film data frame
df_film

Unnamed: 0,title,year,director,cast,distributor,genre,rating,place,place_bygenre,duration,duration_rank,duration_rank_bygenre,domestic_k,international_k,worldwide_k,dom_pct,int_pct,income_rank,income_rank_bygenre,rank_diff
0,The Shawshank Redemption,1994,Frank Darabont,"Tim Robbins, Morgan Freeman",Columbia Pictures,Drama,9.2335,1,68.0,142,70.5,20.0,28767.189,117.315,28884.504,0.9959,0.0041,150.0,32.0,149.0
1,The Godfather,1972,Francis Ford Coppola,"Marlon Brando, Al Pacino",Paramount Pictures,Crime,9.1554,2,34.0,175,22.0,5.0,136381.073,113960.743,250341.816,0.5448,0.4552,72.0,6.0,70.0
2,The Dark Knight,2008,Christopher Nolan,"Christian Bale, Heath Ledger",Warner Bros.,Action,8.9842,3,48.0,152,50.5,12.5,534987.076,471115.201,1006102.277,0.5317,0.4683,11.0,7.0,8.0
3,The Godfather: Part II,1974,Francis Ford Coppola,"Al Pacino, Robert De Niro",Paramount Pictures,Crime,8.9837,4,33.0,202,6.0,2.0,47834.595,126.415,47961.010,0.9974,0.0026,131.0,14.0,127.0
4,12 Angry Men,1957,Sidney Lumet,"Henry Fonda, Lee J. Cobb",United Artists,Crime,8.9464,5,32.0,96,220.5,33.0,0.000,0.955,0.955,0.0000,1.0000,240.0,32.0,235.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,Aladdin,1992,Ron Clements,"Scott Weinger, Robin Williams",Walt Disney Studios Motion Pictures,Adventure,8.0070,246,2.0,90,233.0,34.0,217350.219,286700.000,504050.219,0.4312,0.5688,32.0,12.0,-214.0
246,The Help,2011,Tate Taylor,"Emma Stone, Viola Davis",Walt Disney Studios Motion Pictures,Drama,8.0049,247,1.0,146,62.5,15.5,169708.112,46931.000,216639.112,0.7834,0.2166,79.0,11.0,-168.0
247,Beauty and the Beast,1991,Gary Trousdale,"Paige O'Hara, Robby Benson",Walt Disney Studios Motion Pictures,Animation,8.0045,248,1.0,84,245.0,5.0,218967.620,186043.788,424967.620,0.5153,0.4378,43.0,1.0,-205.0
248,Du rififi chez les hommes,1955,Jules Dassin,"Jean Servais, Carl Möhner",0,Crime,8.0024,249,1.0,118,157.5,20.5,517.975,3.367,521.342,0.9935,0.0065,204.0,23.0,-45.0


In [46]:
# Convert the data frame into spark data frame
df_film_spark = spark.createDataFrame(df_film)

In [47]:
df_film_spark.printSchema()

root
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- distributor: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- place: long (nullable = true)
 |-- place_bygenre: double (nullable = true)
 |-- duration: long (nullable = true)
 |-- duration_rank: double (nullable = true)
 |-- duration_rank_bygenre: double (nullable = true)
 |-- domestic_k: double (nullable = true)
 |-- international_k: double (nullable = true)
 |-- worldwide_k: double (nullable = true)
 |-- dom_pct: double (nullable = true)
 |-- int_pct: double (nullable = true)
 |-- income_rank: double (nullable = true)
 |-- income_rank_bygenre: double (nullable = true)
 |-- rank_diff: double (nullable = true)



In [48]:
# Convert the data frame into parquet format
df_film_spark.write.parquet("/project/Individual/parquet_files/film.parquet", mode = 'overwrite')
make_checkpoint()