In [1]:
# dependencies
import pandas as pd

In [2]:
# Read in the Golden Globe data into a dataframe
golden_globe_data = pd.read_csv('Raw Data/goldenglobes.csv')

golden_globe_data.head(10)

Unnamed: 0,title,year,status,award
0,Succession,2024,Winner,Best Television Series - Drama
1,Mission: Impossible - Dead Reckoning Part 1,2024,Nominee,Cinematic and Box Office Achievement
2,"Last of Us, The",2024,Nominee,Best Television Series - Drama
3,Taylor Swift: The Eras Tour,2024,Nominee,Cinematic and Box Office Achievement
4,John Wick: Chapter 4,2024,Nominee,Cinematic and Box Office Achievement
5,The Super Mario Bros. Movie,2024,Nominee,Cinematic and Box Office Achievement
6,Guardians of the Galaxy Vol. 3,2024,Nominee,Cinematic and Box Office Achievement
7,Spider-Man: Across the Spider-Verse,2024,Nominee,Cinematic and Box Office Achievement
8,Oppenheimer,2024,Nominee,Cinematic and Box Office Achievement
9,Barbie,2024,Winner,Cinematic and Box Office Achievement


In [3]:
# Check for null values
golden_globe_data.info()

# Find rows with null values
null_rows = golden_globe_data[golden_globe_data.isnull().any(axis=1)]
null_rows.head(10)

# Remove null values as they are not related to movie awards
golden_globe_data = golden_globe_data.dropna()

golden_globe_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8690 entries, 0 to 8689
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   8660 non-null   object
 1   year    8690 non-null   int64 
 2   status  8690 non-null   object
 3   award   8690 non-null   object
dtypes: int64(1), object(3)
memory usage: 271.7+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 8660 entries, 0 to 8689
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   8660 non-null   object
 1   year    8660 non-null   int64 
 2   status  8660 non-null   object
 3   award   8660 non-null   object
dtypes: int64(1), object(3)
memory usage: 338.3+ KB


In [4]:
# Remove awards that contain "Television" in the string
## We are only looking at the movie golden globe awards
golden_globe_data = golden_globe_data[~golden_globe_data['award'].str.contains("Television", na=False)]

golden_globe_data.head(25)

Unnamed: 0,title,year,status,award
1,Mission: Impossible - Dead Reckoning Part 1,2024,Nominee,Cinematic and Box Office Achievement
3,Taylor Swift: The Eras Tour,2024,Nominee,Cinematic and Box Office Achievement
4,John Wick: Chapter 4,2024,Nominee,Cinematic and Box Office Achievement
5,The Super Mario Bros. Movie,2024,Nominee,Cinematic and Box Office Achievement
6,Guardians of the Galaxy Vol. 3,2024,Nominee,Cinematic and Box Office Achievement
7,Spider-Man: Across the Spider-Verse,2024,Nominee,Cinematic and Box Office Achievement
8,Oppenheimer,2024,Nominee,Cinematic and Box Office Achievement
9,Barbie,2024,Winner,Cinematic and Box Office Achievement
11,Mission: Impossible - Dead Reckoning Part 1,2024,Nominee,Cinematic and Box Office Achievement
13,Taylor Swift: The Eras Tour,2024,Nominee,Cinematic and Box Office Achievement


In [5]:
# Check that all awards relate to movies
golden_globe_data['award'].unique()

array(['Cinematic and Box Office Achievement',
       'Best Screenplay - Motion Picture',
       'Best Director - Motion Picture',
       'Best Performance by a Male Actor in a Supporting Role in any Motion Picture',
       'Best Motion Picture – Non-English Language',
       'Best Motion Picture - Animated',
       'Best Performance by a Male Actor in a Motion Picture – Musical or Comedy',
       'Best Performance by a Female Actor in a Motion Picture – Musical or Comedy',
       'Best Motion Picture - Musical or Comedy',
       'Best Performance by a Male Actor in a Motion Picture – Drama',
       'Best Performance by a Female Actor in a Motion Picture – Drama',
       'Best Original Song - Motion Picture',
       'Best Original Score - Motion Picture',
       'Best Motion Picture - Drama',
       'Best Performance by an Actress in a Motion Picture - Drama',
       'Best Performance by an Actor in a Motion Picture - Drama',
       'Best Performance by an Actress in a Motion Picture -

In [6]:
# How many rows of data left
golden_globe_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5418 entries, 1 to 8689
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   5418 non-null   object
 1   year    5418 non-null   int64 
 2   status  5418 non-null   object
 3   award   5418 non-null   object
dtypes: int64(1), object(3)
memory usage: 211.6+ KB


In [8]:
winners = golden_globe_data[golden_globe_data['status'] == 'Winner']
winners

Unnamed: 0,title,year,status,award
9,Barbie,2024,Winner,Cinematic and Box Office Achievement
19,Barbie,2024,Winner,Cinematic and Box Office Achievement
23,"Justine Triet, Arthur Harari",2024,Winner,Best Screenplay - Motion Picture
33,"Justine Triet, Arthur Harari",2024,Winner,Best Screenplay - Motion Picture
43,"Justine Triet, Arthur Harari",2024,Winner,Best Screenplay - Motion Picture
...,...,...,...,...
8685,Billy Wilder,1946,Winner,Best Director - Motion Picture
8686,Angela Lansbury,1946,Winner,Best Performance by an Actress in a Supporting...
8687,J. Carroll Naish,1946,Winner,Best Performance by an Actor in a Supporting R...
8688,Ingrid Bergman,1946,Winner,Actress In A Leading Role


In [9]:
#update column name to match others in dataframes to import into SQL

winners.rename(columns={'title':'Movie Name'}, inplace=True)

winners

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  winners.rename(columns={'title':'Movie Name'}, inplace=True)


Unnamed: 0,Movie Name,year,status,award
9,Barbie,2024,Winner,Cinematic and Box Office Achievement
19,Barbie,2024,Winner,Cinematic and Box Office Achievement
23,"Justine Triet, Arthur Harari",2024,Winner,Best Screenplay - Motion Picture
33,"Justine Triet, Arthur Harari",2024,Winner,Best Screenplay - Motion Picture
43,"Justine Triet, Arthur Harari",2024,Winner,Best Screenplay - Motion Picture
...,...,...,...,...
8685,Billy Wilder,1946,Winner,Best Director - Motion Picture
8686,Angela Lansbury,1946,Winner,Best Performance by an Actress in a Supporting...
8687,J. Carroll Naish,1946,Winner,Best Performance by an Actor in a Supporting R...
8688,Ingrid Bergman,1946,Winner,Actress In A Leading Role


In [11]:
#save file to import into Postgres as csv file
winners.to_csv("./Clean Data/golden_globes_winners.csv")