In [2]:
# Import libraries
import pandas as pd
import numpy as np

In [4]:
# Read the IMDB top-1000 dataset from disk and preview its first five rows
csv_path = 'imdb_top_1000.csv'
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [6]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    1000 non-null   object 
 1   Series_Title   1000 non-null   object 
 2   Released_Year  1000 non-null   object 
 3   Certificate    899 non-null    object 
 4   Runtime        1000 non-null   object 
 5   Genre          1000 non-null   object 
 6   IMDB_Rating    1000 non-null   float64
 7   Overview       1000 non-null   object 
 8   Meta_score     843 non-null    float64
 9   Director       1000 non-null   object 
 10  Star1          1000 non-null   object 
 11  Star2          1000 non-null   object 
 12  Star3          1000 non-null   object 
 13  Star4          1000 non-null   object 
 14  No_of_Votes    1000 non-null   int64  
 15  Gross          831 non-null    object 
dtypes: float64(2), int64(1), object(13)
memory usage: 125.1+ KB


In [8]:
# Count the missing entries in each column
df.isna().sum()

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

In [10]:
# Fill the gaps column by column.
# Certificate: label titles without a certificate as 'Unrated'.
df['Certificate'] = df['Certificate'].fillna('Unrated')

# Meta_score: substitute the most frequent score for missing ones.
most_common_meta_score = df['Meta_score'].mode()[0]
df['Meta_score'] = df['Meta_score'].fillna(most_common_meta_score)

# Gross: strip the thousands separators, treat missing values as 0, store as float.
gross_without_commas = df['Gross'].str.replace(',', '')
df['Gross'] = gross_without_commas.fillna(0).astype(float)

# Total number of nulls left in the whole frame
df.isnull().sum().sum()

0

In [12]:
# Change release year to numeric; entries that cannot be parsed become NaN
df['Released_Year'] = pd.to_numeric(df['Released_Year'], errors='coerce')
# Drop the unparseable rows. The explicit .copy() makes `df` an independent
# frame, so the column assignments below do not trigger SettingWithCopyWarning.
df = df.dropna(subset=['Released_Year']).copy()
df['Released_Year'] = df['Released_Year'].astype(int)

# Change runtime to numeric: keep the first run of digits (e.g. '142 min' -> 142.0).
# expand=False returns a Series rather than a one-column DataFrame.
df['Runtime'] = df['Runtime'].str.extract(r'(\d+)', expand=False).astype(float)

# Change gross to numeric and express it in millions.
# NOTE: this rescales the column in place, so run this cell only once per
# kernel session; re-running it would divide by 1e6 again.
df['Gross'] = pd.to_numeric(df['Gross'], errors='coerce') / 1e6

In [14]:
# Put the unit of measure into the column names, then re-check the schema
column_units = {
    'Runtime': 'Runtime (min)',
    'Gross': 'Gross (M)',
}
df = df.rename(columns=column_units)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 999 entries, 0 to 999
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Poster_Link    999 non-null    object 
 1   Series_Title   999 non-null    object 
 2   Released_Year  999 non-null    int64  
 3   Certificate    999 non-null    object 
 4   Runtime (min)  999 non-null    float64
 5   Genre          999 non-null    object 
 6   IMDB_Rating    999 non-null    float64
 7   Overview       999 non-null    object 
 8   Meta_score     999 non-null    float64
 9   Director       999 non-null    object 
 10  Star1          999 non-null    object 
 11  Star2          999 non-null    object 
 12  Star3          999 non-null    object 
 13  Star4          999 non-null    object 
 14  No_of_Votes    999 non-null    int64  
 15  Gross (M)      999 non-null    float64
dtypes: float64(4), int64(2), object(10)
memory usage: 132.7+ KB


In [18]:
# Build a long-format frame with one row per (movie, genre) pair
df['Genre'] = df['Genre'].astype(str)

# 'Crime, Drama' -> ['Crime', ' Drama']; explode then gives each genre its own row
genre_lists = df['Genre'].str.split(',')
genre_content = df.assign(genre=genre_lists).explode('genre')

# Remove the whitespace left over from the ', ' separators
genre_content['genre'] = genre_content['genre'].str.strip()

# Distinct genres present in the dataset
genre_content['genre'].unique()

array(['Drama', 'Crime', 'Action', 'Adventure', 'Biography', 'History',
       'Sci-Fi', 'Romance', 'Western', 'Fantasy', 'Comedy', 'Thriller',
       'Animation', 'Family', 'War', 'Mystery', 'Music', 'Horror',
       'Musical', 'Film-Noir', 'Sport'], dtype=object)

In [20]:
# Display the cleaned frame (pandas elides the middle rows in the rendered output).
# The previous statement `df = drop` referenced an undefined name and raised
# NameError on a fresh kernel instead of showing the frame.
df

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime (min),Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross (M)
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142.0,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28.341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175.0,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134.966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152.0,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534.858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202.0,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57.300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96.0,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4.360000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,https://m.media-amazon.com/images/M/MV5BNGEwMT...,Breakfast at Tiffany's,1961,A,115.0,"Comedy, Drama, Romance",7.6,A young New York socialite becomes interested ...,76.0,Blake Edwards,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen,166544,0.000000
996,https://m.media-amazon.com/images/M/MV5BODk3Yj...,Giant,1956,G,201.0,"Drama, Western",7.6,Sprawling epic covering the life of a Texas ca...,84.0,George Stevens,Elizabeth Taylor,Rock Hudson,James Dean,Carroll Baker,34075,0.000000
997,https://m.media-amazon.com/images/M/MV5BM2U3Yz...,From Here to Eternity,1953,Passed,118.0,"Drama, Romance, War",7.6,"In Hawaii in 1941, a private is cruelly punish...",85.0,Fred Zinnemann,Burt Lancaster,Montgomery Clift,Deborah Kerr,Donna Reed,43374,30.500000
998,https://m.media-amazon.com/images/M/MV5BZTBmMj...,Lifeboat,1944,Unrated,97.0,"Drama, War",7.6,Several survivors of a torpedoed merchant ship...,78.0,Alfred Hitchcock,Tallulah Bankhead,John Hodiak,Walter Slezak,William Bendix,26471,0.000000


In [31]:
# Total gross (in millions) per director, highest first
directors = df.groupby('Director')['Gross (M)'].sum().reset_index()
directors_sorted = directors.sort_values(by='Gross (M)', ascending=False)

# Competition ranking: tied directors share the best rank of their group.
# method='min' is required because the default ('average') yields fractional
# ranks for ties, which .astype(int) would silently truncate to a wrong rank.
directors_sorted['rank'] = (
    directors_sorted['Gross (M)'].rank(method='min', ascending=False).astype(int)
)
directors_sorted.set_index('rank').head(10)

Unnamed: 0_level_0,Director,Gross (M)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Steven Spielberg,2478.133165
2,Anthony Russo,2205.039403
3,Christopher Nolan,1937.454106
4,James Cameron,1748.236602
5,Peter Jackson,1597.312443
6,J.J. Abrams,1423.170905
7,Brad Bird,1099.627795
8,Robert Zemeckis,1049.446456
9,David Yates,978.953721
10,Pete Docter,939.382131


In [35]:
def _gross_by_star(movies, star_column, gross_column):
    """Sum 'Gross (M)' per actor for a single billing column.

    Parameters
    ----------
    movies : pandas.DataFrame
        Frame containing ``star_column`` and 'Gross (M)'.
    star_column : str
        Name of the billing column to group by (e.g. 'Star1').
    gross_column : str
        Name to give the summed gross column in the result.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'Actor' and ``gross_column``.
    """
    return (
        movies.groupby(star_column)['Gross (M)']
        .sum()
        .reset_index()
        .rename(columns={star_column: 'Actor', 'Gross (M)': gross_column})
    )


actors_1 = _gross_by_star(df, 'Star1', 'Gross_1')
actors_2 = _gross_by_star(df, 'Star2', 'Gross_2')
actors_3 = _gross_by_star(df, 'Star3', 'Gross_3')
actors_4 = _gross_by_star(df, 'Star4', 'Gross_4')

# Merge all datasets on 'Actor' and use 'outer' join to keep all actors.
# The loop variable must NOT be called `df`: that would rebind the main movie
# frame to `actors_4` and break this cell (and any later one) on re-run.
actors_merged = actors_1
for star_totals in [actors_2, actors_3, actors_4]:
    actors_merged = pd.merge(actors_merged, star_totals, on='Actor', how='outer')

# Add up the per-billing totals; sum(axis=1) skips the NaNs the outer join
# leaves for actors missing from a billing column.
gross_columns = ['Gross_1', 'Gross_2', 'Gross_3', 'Gross_4']
actors_merged['Gross (M)'] = actors_merged[gross_columns].sum(axis=1)
actors_merged = actors_merged.drop(columns=gross_columns)

actors_sorted = actors_merged.sort_values(by='Gross (M)', ascending=False)
# Competition ranking: method='min' gives tied actors the same integer rank
# instead of truncating the default fractional 'average' rank.
actors_sorted['rank'] = (
    actors_sorted['Gross (M)'].rank(method='min', ascending=False).astype(int)
)
actors_sorted.set_index('rank').head(10)

Unnamed: 0_level_0,Actor,Gross (M)
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Robert Downey Jr.,3129.073242
2,Tom Hanks,2729.727051
3,Chris Evans,2339.664431
4,Joe Russo,2205.039403
5,Mark Ruffalo,2058.395565
6,Leonardo DiCaprio,2049.2974
7,Ian McKellen,1869.868915
8,Rupert Grint,1835.901034
8,Daniel Radcliffe,1835.901034
10,Matt Damon,1728.541599
