## Oscar Awards Best Picture

In [1]:
#importing libraries
import pandas as pd
import numpy as np
import string
import re

In [2]:
# Read the Oscar awards spreadsheet into a DataFrame.
oscar_awards_path = 'data/the_oscar_award_kaggle.xlsx'
oscar_df_1 = pd.read_excel(oscar_awards_path)

In [3]:
# First look at the raw awards table: sample rows, dimensions and column types.
oscar_df_1.head(5)
oscar_df_1.shape
oscar_df_1.dtypes
# Only this final expression is rendered by the cell: missing values per column.
oscar_df_1.isnull().sum()

year_film          0
year_ceremony      0
ceremony           0
category           0
name               0
film             304
winner             0
dtype: int64

In [4]:
# Collect every row that has at least one missing value.
has_missing_value = oscar_df_1.isna().any(axis=1)
missing_films = oscar_df_1[has_missing_value]

# Count those rows per award category. None of the Best Picture categories
# show up, so the rows do not need to be removed at this point.
missing_films['category'].value_counts()

HONORARY AWARD                          124
SPECIAL AWARD                            56
IRVING G. THALBERG MEMORIAL AWARD        45
JEAN HERSHOLT HUMANITARIAN AWARD         39
ASSISTANT DIRECTOR                       18
SOUND RECORDING                           8
HONORARY FOREIGN LANGUAGE FILM AWARD      5
SPECIAL ACHIEVEMENT AWARD                 3
ENGINEERING EFFECTS                       2
SPECIAL FOREIGN LANGUAGE FILM AWARD       2
WRITING (Title Writing)                   2
Name: category, dtype: int64

In [6]:
# Number of rows recorded for each award category, most frequent first.
oscar_df_1.category.value_counts()

DIRECTING                                    449
FILM EDITING                                 430
ACTOR IN A SUPPORTING ROLE                   420
ACTRESS IN A SUPPORTING ROLE                 420
DOCUMENTARY (Short Subject)                  368
                                            ... 
WRITING (Title Writing)                        3
DIRECTING (Comedy Picture)                     2
SPECIAL FOREIGN LANGUAGE FILM AWARD            2
SPECIAL ACHIEVEMENT AWARD (Sound Editing)      1
SPECIAL ACHIEVEMENT AWARD (Sound Effects)      1
Name: category, Length: 111, dtype: int64

## 2. Creating a dataframe with best picture category

In [8]:
# Names the Best Picture award is recorded under in the data.
cats = ['OUTSTANDING PICTURE','OUTSTANDING PRODUCTION','OUTSTANDING MOTION PICTURE','BEST MOTION PICTURE','BEST PICTURE']

# Keep only the nominations in those categories. The explicit .copy() makes
# `pictures` an independent frame, so the column assignments made on it later
# do not raise SettingWithCopyWarning and can never write through to
# `oscar_df_1`.
pictures = oscar_df_1.loc[oscar_df_1['category'].isin(cats)].copy()

# Sanity check: rows with any missing value (the result is empty).
pictures[pictures.isna().any(axis=1)]

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner


In [10]:
# Display the filtered Best Picture nominations (pandas truncates the middle rows).
pictures

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
19,1928,1928,1,OUTSTANDING PICTURE,The Caddo Company,The Racket,False
20,1927,1928,1,OUTSTANDING PICTURE,Fox,7th Heaven,False
21,1927,1928,1,OUTSTANDING PICTURE,Paramount Famous Lasky,Wings,True
62,1929,1929,2,OUTSTANDING PICTURE,Feature Productions,Alibi,False
63,1928,1929,2,OUTSTANDING PICTURE,Fox,In Old Arizona,False
...,...,...,...,...,...,...,...
10346,2019,2020,92,BEST PICTURE,"Amy Pascal, Producer",Little Women,False
10347,2019,2020,92,BEST PICTURE,"Noah Baumbach and David Heyman, Producers",Marriage Story,False
10348,2019,2020,92,BEST PICTURE,"Sam Mendes, Pippa Harris, Jayne-Ann Tenggren a...",1917;,False
10349,2019,2020,92,BEST PICTURE,"David Heyman, Shannon McIntosh and Quentin Tar...",Once upon a Time… in Hollywood,False


## 3. Merging Oscar Data With IMDb Movies details

In [16]:
# Load the IMDb movie details (more than 80,000 titles) from the Excel file.
imdb_movies_path = 'data/IMDB movies extensive dataset kaggle/IMDb_movies.xlsx'
IMDB_movies = pd.read_excel(imdb_movies_path)

In [17]:
# First look at the IMDb table: 81,273 movies described by 22 columns.
IMDB_movies.shape
IMDB_movies.head(5)

# Only this final expression is rendered by the cell: the column names.
IMDB_movies.columns

Index(['imdb_title_id', 'title', 'original_title', 'year', 'date_published',
       'genre', 'duration', 'country', 'language', 'director', 'writer',
       'production_company', 'actors', 'description', 'avg_vote', 'votes',
       'budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics'],
      dtype='object')

In [19]:
# Columns considered unnecessary for this analysis.
# 'worlwide_gross_income' is spelled exactly as it appears in the dataset.
unneeded_columns = [
    'original_title',
    'date_published',
    'avg_vote',
    'votes',
    'budget',
    'usa_gross_income',
    'worlwide_gross_income',
    'metascore',
    'reviews_from_users',
    'reviews_from_critics',
]

IMDB_movies = IMDB_movies.drop(columns=unneeded_columns)

In [21]:
# Check the table after dropping columns; only the final expression
# (the shape) is rendered by the cell.
IMDB_movies.head()
IMDB_movies.shape 

(81273, 12)

In [22]:
# Normalise the join keys of both tables the same way so that the merge is not
# defeated by formatting differences:
#   1. upper-case the title (different capitalisation),
#   2. put exactly one blank after every comma. A plain replace of ',' by ', '
#      would turn an existing ', ' into ',  ' (two blanks), so 'A,B' and
#      'A, B' would still differ; the regex collapses both to 'A, B',
#   3. strip leading/trailing blanks.
IMDB_movies['title'] = (
    IMDB_movies['title']
    .str.upper()
    .str.replace(r',\s*', ', ', regex=True)
    .str.strip()
)
pictures['film'] = (
    pictures['film']
    .str.upper()
    .str.replace(r',\s*', ', ', regex=True)
    .str.strip()
)

Most of the column types look alright. This table will be merged with the pictures table using a left join, so that only the information for the selected Oscar films is added.

In [25]:
# Left join on title + release year: every nominee is kept, and the IMDb
# columns stay empty for nominees without a matching IMDb entry.
pictures_combined = pictures.merge(
    IMDB_movies,
    how='left',
    left_on=['film', 'year_film'],
    right_on=['title', 'year'],
)

In [26]:
# Both nominated versions of "Little Women". The 2019 row was matched to the
# wrong entry of the IMDb Kaggle dataset: the Oscar nominee was Greta Gerwig's
# film. It is corrected manually below.
is_little_women = pictures_combined['film'].eq('LITTLE WOMEN')
pictures_combined[is_little_women]

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,imdb_title_id,title,year,genre,duration,country,language,director,writer,production_company,actors,description
31,1933,1933,6,OUTSTANDING PRODUCTION,RKO Radio,LITTLE WOMEN,False,tt0024264,LITTLE WOMEN,1933.0,"Drama, Family, Romance",115.0,USA,"English, German",George Cukor,"Louisa May Alcott, Sarah Y. Mason",RKO Radio Pictures,"Katharine Hepburn, Joan Bennett, Paul Lukas, E...",A chronicle of the lives of a group of sisters...
558,2019,2020,92,BEST PICTURE,"Amy Pascal, Producer",LITTLE WOMEN,False,tt6495094,LITTLE WOMEN,2019.0,"Drama, Family",112.0,USA,English,Clare Niederpruem,"Louisa May Alcott, Clare Niederpruem",Main Dog Productions,"Sarah Davenport, Lea Thompson, Allie Jennings,...",A modern retelling of Louisa May Alcott's clas...


In [29]:
# The merge attached the 2019 nominee to a different 2019 film with the same
# title (the Clare Niederpruem version). Replace the IMDb fields with those of
# the nominated film, directed by Greta Gerwig.

# Locate the row by content instead of a hard-coded row label, and make sure
# exactly one row is targeted.
is_little_women_2019 = (
    (pictures_combined['film'] == 'LITTLE WOMEN')
    & (pictures_combined['year_film'] == 2019)
)
assert is_little_women_2019.sum() == 1, 'expected exactly one 2019 LITTLE WOMEN row'

# Values use the same plain "A, B, C" format as the rows that came from IMDb.
little_women_details = {
    'imdb_title_id': 'tt3281548',
    'director': 'Greta Gerwig',
    'writer': 'Greta Gerwig, Louisa May Alcott',
    'production_company': 'Columbia Pictures',
    'actors': 'Saoirse Ronan, Emma Watson, Florence Pugh, Eliza Scanlen, Laura Dern, Meryl Streep',
    'genre': 'Drama, Romance',
    'duration': 135,
    # The merged description belongs to the other film; clear it rather than
    # keep a wrong value.
    'description': np.nan,
}
for column, value in little_women_details.items():
    pictures_combined.loc[is_little_women_2019, column] = value

In [32]:
# Nominees for which the merge found no IMDb entry (their `title` is empty).
pictures_combined.loc[pictures_combined['title'].isna()]

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,imdb_title_id,title,year,genre,duration,country,language,director,writer,production_company,actors,description
7,1928,1929,2,OUTSTANDING PICTURE,Paramount Famous Lasky,THE PATRIOT,False,,,,,,,,,,,,
14,1931,1931,4,OUTSTANDING PRODUCTION,Fox,EAST LYNNE,False,,,,,,,,,,,,
47,1934,1935,7,OUTSTANDING PRODUCTION,Jesse L. Lasky (production company),THE WHITE PARADE,False,,,,,,,,,,,,
560,2019,2020,92,BEST PICTURE,"Sam Mendes, Pippa Harris, Jayne-Ann Tenggren a...",1917;,False,,,,,,,,,,,,
561,2019,2020,92,BEST PICTURE,"David Heyman, Shannon McIntosh and Quentin Tar...",ONCE UPON A TIME… IN HOLLYWOOD,False,,,,,,,,,,,,
562,2019,2020,92,BEST PICTURE,"Kwak Sin Ae and Bong Joon Ho, Producers",PARASITE,True,,,,,,,,,,,,


In [41]:
# Six nominees were not matched by the merge (they are missing from the IMDb
# dataset), so their main information is entered manually. The keys are the
# row labels of `pictures_combined`. Genres use the same "A, B" format as the
# values that came from IMDb.
manual_details = {
    7: {    # THE PATRIOT (1928)
        'imdb_title_id': 'tt0019257',
        'genre': 'Drama, History, Thriller',
        'director': 'Ernst Lubitsch',
        'duration': 113,
    },
    14: {   # EAST LYNNE (1931)
        'imdb_title_id': 'tt0021826',
        'genre': 'Drama, Romance',
        'director': 'Frank Lloyd',
        'duration': 102,
    },
    47: {   # THE WHITE PARADE (1934)
        'imdb_title_id': 'tt0025986',
        'genre': 'Drama',
        'director': 'Irving Cummings',
        'duration': 80,
    },
    560: {  # 1917 (2019)
        'imdb_title_id': 'tt8579674',
        'genre': 'Drama, War',
        'director': 'Sam Mendes',
        'duration': 119,
    },
    561: {  # ONCE UPON A TIME... IN HOLLYWOOD (2019)
        'imdb_title_id': 'tt7131622',
        'genre': 'Comedy, Drama',
        'director': 'Quentin Tarantino',
        'duration': 161,
    },
    562: {  # PARASITE (2019)
        'imdb_title_id': 'tt6751668',
        'genre': 'Comedy, Drama, Thriller',
        'director': 'Bong Joon Ho',
        'duration': 132,
    },
}

# Guard against stale row labels: every targeted row must exist and must be one
# of the unmatched rows (empty `title`). Without this check, `.at` on a wrong
# label would silently overwrite another film or append a new row.
assert pictures_combined.loc[list(manual_details), 'title'].isna().all(), \
    'manual_details targets a row that already has IMDb data'

for row_label, details in manual_details.items():
    for column, value in details.items():
        pictures_combined.at[row_label, column] = value

In [44]:
# Some movies have two directors. Put their names in two columns for later
# checking.
#
# The pattern always yields exactly two columns:
#   first_director  - text before the first comma, surrounding blanks removed
#   second_director - everything after the first comma, surrounding blanks
#                     removed; missing when only one director is listed
# A plain split on ',' would leave a leading blank in the second name and
# would fail with "Columns must be same length as key" as soon as one film
# lists three or more directors.
director_pattern = r'^\s*(?P<first_director>[^,]+?)\s*(?:,\s*(?P<second_director>.+?)\s*)?$'
director_names = pictures_combined['director'].str.extract(director_pattern)
pictures_combined[['first_director', 'second_director']] = director_names

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,imdb_title_id,title,year,...,duration,country,language,director,writer,production_company,actors,description,first_director,second_director
0,1928,1928,1,OUTSTANDING PICTURE,The Caddo Company,THE RACKET,False,tt0019304,THE RACKET,1928.0,...,84.0,USA,English,Lewis Milestone,"Bartlett Cormack, Bartlett Cormack",The Caddo Company,"Thomas Meighan, Louis Wolheim, Marie Prevost, ...",An honest police captain vows to bring down a ...,Lewis Milestone,
1,1927,1928,1,OUTSTANDING PICTURE,Fox,7TH HEAVEN,False,tt0018379,7TH HEAVEN,1927.0,...,110.0,USA,English,Frank Borzage,"Austin Strong, Benjamin Glazer",Frank Borzage Production,"Janet Gaynor, Charles Farrell, Albert Gran, Da...","A street cleaner saves a young woman's life, a...",Frank Borzage,
2,1927,1928,1,OUTSTANDING PICTURE,Paramount Famous Lasky,WINGS,True,tt0018578,WINGS,1927.0,...,144.0,USA,English,"William A. Wellman, Harry d'Abbadie d'Arrast","John Monk Saunders, Hope Loring",Paramount Famous Lasky Corporation,"Clara Bow, Charles 'Buddy' Rogers, Richard Arl...","Two young men, one rich, one middle class, who...",William A. Wellman,Harry d'Abbadie d'Arrast
3,1929,1929,2,OUTSTANDING PICTURE,Feature Productions,ALIBI,False,tt0019630,ALIBI,1929.0,...,91.0,USA,English,Roland West,"Roland West, C. Gardner Sullivan",Feature Productions,"Chester Morris, Harry Stubbs, Mae Busch, Elean...","Chick Williams, a prohibition gangster, rejoin...",Roland West,
4,1928,1929,2,OUTSTANDING PICTURE,Fox,IN OLD ARIZONA,False,tt0020018,IN OLD ARIZONA,1928.0,...,95.0,USA,"English, Spanish, Italian",Irving Cummings,"O. Henry, Tom Barry",Fox Film Corporation,"Warner Baxter, Edmund Lowe, Dorothy Burgess","A charming, happy-go-lucky bandit in old Arizo...",Irving Cummings,


In [64]:
# Save the merged table for the next steps; the row index is not written.
combined_output_path = 'data/pictures_combined.csv'
pictures_combined.to_csv(combined_output_path, index=False)