In [21]:
import pandas as pd
import re

In [22]:
# Convert the csv into a Data Frame

df_bafta = pd.read_csv("../data/bafta_films.csv")

In [23]:
# Check for any duplicate

df_bafta.duplicated().sum()

1

In [24]:
# Drop the duplicates

df_bafta = df_bafta.drop_duplicates()
df_bafta.duplicated().sum()

0

In [25]:
# Check for any null value

df_bafta.isnull().sum()

year          0
category      0
nominee       0
workers     773
winner        0
dtype: int64

In [26]:
# Standarizing the column names
# Replace the booleans with Nominee and Winner

df_bafta["winner"] = df_bafta['winner'].replace(False, "Nominee").replace(True, "Winner") 

# Rename the columns with the name of the award

df_bafta.rename(columns={"year":"year_ceremony","nominee":"name","workers":"film",'winner':"bafta"}, inplace=True)

In [27]:
# Clean the category names, and replace them

category = {cat : cat.strip().replace("Film | ", "") for cat in df_bafta["category"]}
df_bafta["category"].replace(category, inplace = True)

In [28]:
# More cleaning
# Drops all the year info contained in category by applying a lambda with a regex pattern that takes off all the "_in_{4 numeric values}"

df_bafta["category"] = df_bafta["category"].apply(lambda cat: re.sub(r"\sin\s\d{4}", "", cat)) 

In [29]:
# Filter the years from 1990 to 2022 to make our analysis

df_bafta_filter = df_bafta[df_bafta["year_ceremony"].between(1990, 2022, inclusive=True)]

  df_bafta_filter = df_bafta[df_bafta["year_ceremony"].between(1990, 2022, inclusive=True)]


In [30]:
# Replace the old category names with the new oficial given ones

df_bafta_filter["category"] = df_bafta_filter['category'].replace("Adapted Screenplay","Writing (Adapted Screenplay)")
df_bafta_filter["category"] = df_bafta_filter['category'].replace("Film","Best Picture")
df_bafta_filter["category"] = df_bafta_filter['category'].replace("Film Not in the English Language","International Feature Film")
df_bafta_filter["category"] = df_bafta_filter['category'].replace("Original Screenplay","Writing (Original Screenplay)")
df_bafta_filter["category"] = df_bafta_filter['category'].replace("Director","Directing")
df_bafta_filter["category"] = df_bafta_filter['category'].replace("Achievement in Direction","Directing")
df_bafta_filter["category"] = df_bafta_filter['category'].replace("Leading Actor","Actor In A Leading Role")
df_bafta_filter["category"] = df_bafta_filter['category'].replace("Leading Actress","Actress in a Leading Role")
df_bafta_filter["category"] = df_bafta_filter['category'].replace("Best Film","Best Picture")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bafta_filter["category"] = df_bafta_filter['category'].replace("Adapted Screenplay","Writing (Adapted Screenplay)")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bafta_filter["category"] = df_bafta_filter['category'].replace("Film","Best Picture")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [31]:
# Filter, by choosing the categories I choose for my anaylisis

filter = ['Actor in a Leading Role', 'Actor in a Supporting Role', 'Actress in a Leading Role', 'Actress in a Supporting Role',
'Writing (Adapted Screenplay)','Writing (Original Screenplay)', 'Directing', 'Best Picture']

df_bafta_filter = df_bafta_filter.loc[df_bafta_filter['category'].isin(filter)]

In [32]:
# List of misplaced names to swap columns with film and viceversa

categories = ["Writing (Adapted Screenplay)", "Writing (Original Screenplay)", "Best Picture"]

# Iterate through the categories

for category in categories:

    # Store the values of the 'film' column in a temporary variable

    temp = df_bafta_filter.loc[df_bafta_filter["category"] == category, "film"]

    # assign the values of the 'names' column to the 'film' column

    df_bafta_filter.loc[df_bafta_filter["category"] == category, "film"] = df_bafta_filter.loc[df_bafta_filter["category"] == category, "name"]

    # assign the values of the temporary variable (previous values of 'film' column) to the 'names' column
    
    df_bafta_filter.loc[df_bafta_filter["category"] == category, "name"] = temp

In [33]:
# Locate the nans, and non numeric values with .loc and replace them with their correct names

position = [3845, 3846, 3847, 3848, 3849]
incorrect_names = ["The Shape of Water", "Blade Runner 2049", "Call Me by Your Name", "Dunkirk", "Three Billboards Outside Ebbing, Missouri"]
correct_names = ["Guillermo del Toro", "Denis Villeneuve", "Luca Guadagnino", "Christopher Nolan", "Martin McDonagh"] 

# Use the .loc indexer to select the rows at the specified positions

df_bafta_filter.loc[position, "film"] = incorrect_names
df_bafta_filter.loc[position, "name"] = correct_names

In [34]:
# Convert to csv and store it in data folder

df_bafta_filter.to_csv("../data/BAFTA_clean.csv")