In [10]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path

# File to Load (Remember to Change These)
oscars_file = Path("Resources/oscars.csv")
oscar_winners_file = Path("Resources/Oscar Winners - Director.csv")

# Read CSV files and store into Pandas DataFrames
oscars = pd.read_csv(oscars_file)
oscar_winners = pd.read_csv(oscar_winners_file)


In [2]:
# Drop the "year" column
oscar_winners = oscar_winners.drop(columns=["index"])

In [3]:
# Drop the "year" column
oscars = oscars.drop(columns=["oscar_no"])

In [4]:
oscars = oscars.rename(columns={"oscar_yr": "year"})
oscar_winners = oscar_winners.rename(columns={'Nomination/Winner': 'nomination', 'Director(s)': 'director', 'Race': 'race'})

In [5]:
print("\nFirst few rows of oscars DataFrame:")
print(oscars.head())

print("\nFirst few rows of oscar_winners DataFrame:")
print(oscar_winners.head())


First few rows of oscars DataFrame:
   year         award            name                       movie  age  \
0  1929  Best actress    Janet Gaynor                  7th Heaven   22   
1  1930  Best actress   Mary Pickford                    Coquette   37   
2  1931  Best actress   Norma Shearer                The Divorcee   28   
3  1932  Best actress  Marie Dressler                Min and Bill   63   
4  1933  Best actress     Helen Hayes  The Sin of Madelon Claudet   32   

        birth_pl  birth_date  birth_mo  birth_d  birth_y  
0   Pennsylvania  1906-10-06        10        6     1906  
1         Canada  1892-04-08         4        8     1892  
2         Canada  1902-08-10         8       10     1902  
3         Canada  1868-11-09        11        9     1868  
4  Washington DC  1900-10-10        10       10     1900  

First few rows of oscar_winners DataFrame:
     Year Gender   race                          director  \
0  1928.0   Male  White  Frank Borzage (Dramatic Picture)  

In [6]:
oscar_winners = oscar_winners.rename(columns={"Year": "year"})

# Merge the DataFrames based on the "Year" column
merged_data = pd.merge(oscar_winners, oscars, how="inner", on="year")

# Display the first few rows of the merged DataFrame
print("\nFirst few rows of the merged DataFrame:")
print(merged_data.head())

merged_data = merged_data.rename(columns={'Nomination/Winner': 'Nomination', 'Director(s)': 'director', 'Race': 'race'})



First few rows of the merged DataFrame:
     year Gender   race          director                 Film  nomination  \
0  1929.0   Male  White       Frank Lloyd      The Divine Lady      Winner   
1  1929.0   Male  White       Frank Lloyd      The Divine Lady      Winner   
2  1929.0   Male  White  Lionel Barrymore             Madame X  Nomination   
3  1929.0   Male  White  Lionel Barrymore             Madame X  Nomination   
4  1929.0   Male  White    Harry Beaumont  The Broadway Melody  Nomination   

          award           name                           movie  age  \
0  Best actress   Janet Gaynor                      7th Heaven   22   
1    Best actor  Emil Jannings  The Way of All Flesh & The Las   44   
2  Best actress   Janet Gaynor                      7th Heaven   22   
3    Best actor  Emil Jannings  The Way of All Flesh & The Las   44   
4  Best actress   Janet Gaynor                      7th Heaven   22   

       birth_pl  birth_date  birth_mo  birth_d  birth_y  
0  Pe

In [7]:
# Handle non-finite values in the "year" column
oscar_winners["year"] = oscar_winners["year"].fillna(0)  # Replace NaN with 0 (or any other appropriate value)

# Convert the "year" column to integers
oscar_winners["year"] = oscar_winners["year"].astype(int)

In [8]:
# Convert numerical columns to integers in oscars DataFrame
oscars["age"] = pd.to_numeric(oscars["age"], errors='coerce').astype('Int64')
oscars["birth_mo"] = pd.to_numeric(oscars["birth_mo"], errors='coerce').astype('Int64')
oscars["birth_d"] = pd.to_numeric(oscars["birth_d"], errors='coerce').astype('Int64')
oscars["birth_y"] = pd.to_numeric(oscars["birth_y"], errors='coerce').astype('Int64')

# Convert numerical columns to integers in oscar_winners DataFrame
oscar_winners["year"] = pd.to_numeric(oscar_winners["year"], errors='coerce').astype('Int64')

# Display the first few rows of the merged DataFrame
print("\nFirst few rows of the merged DataFrame:")
print(merged_data.head())


First few rows of the merged DataFrame:
     year Gender   race          director                 Film  nomination  \
0  1929.0   Male  White       Frank Lloyd      The Divine Lady      Winner   
1  1929.0   Male  White       Frank Lloyd      The Divine Lady      Winner   
2  1929.0   Male  White  Lionel Barrymore             Madame X  Nomination   
3  1929.0   Male  White  Lionel Barrymore             Madame X  Nomination   
4  1929.0   Male  White    Harry Beaumont  The Broadway Melody  Nomination   

          award           name                           movie  age  \
0  Best actress   Janet Gaynor                      7th Heaven   22   
1    Best actor  Emil Jannings  The Way of All Flesh & The Las   44   
2  Best actress   Janet Gaynor                      7th Heaven   22   
3    Best actor  Emil Jannings  The Way of All Flesh & The Las   44   
4  Best actress   Janet Gaynor                      7th Heaven   22   

       birth_pl  birth_date  birth_mo  birth_d  birth_y  
0  Pe

In [9]:
# Optionally, you can save the merged DataFrame back to a CSV file
merged_data.to_csv("merged_data.csv", index=False)