In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the movie dataset from a path relative to the notebook's working directory
data = pd.read_csv('data/movie_dataset_final.csv')
# Preview the first rows as a quick check of the loaded columns
data.head()

Unnamed: 0,Year,Movie,Oscar_winner,Oscar_nominee,Runtime (min),Certificate,Directors,Actors,Metascore,IMDb_rating,...,Golden_Bear_winner,Golden_Bear_nominee,Golden_Lion_winner,Golden_Lion_nominee,PCA_winner,PCA_nominee,NYFCC_winner,NYFCC_nominee,OFCS_winner,OFCS_nominee
0,1999,Fight Club,0,0,139,R(A),David Fincher,"['Brad Pitt', 'Edward Norton', 'Meat Loaf', 'Z...",66,8.8,...,0,0,0,0,0,0,0,0,0,1
1,1999,The Matrix,0,0,136,PG,Lana Wachowski Lilly Wachowski,"['Keanu Reeves', 'Laurence Fishburne', 'Carrie...",73,8.7,...,0,0,0,0,0,0,0,0,0,0
2,1999,The Green Mile,0,1,189,R(A),Frank Darabont,"['Tom Hanks', 'Michael Clarke Duncan', 'David ...",61,8.6,...,0,0,0,0,0,0,0,0,0,0
3,1999,American Beauty,1,1,122,R(A),Sam Mendes,"['Kevin Spacey', 'Annette Bening', 'Thora Birc...",84,8.3,...,0,0,0,0,1,1,0,1,1,1
4,1999,The Sixth Sense,0,1,107,PG,M. Night Shyamalan,"['Bruce Willis', 'Haley Joel Osment', 'Toni Co...",64,8.1,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Training rows: every film with 1999 <= Year < 2019 (2019 is held out)
train = data[data['Year'].ge(1999) & data['Year'].lt(2019)]
# Rows to predict on: 2019 films whose Oscar_nominee column equals 1
test = data[data['Year'].eq(2019) & data['Oscar_nominee'].eq(1)]

print('training set contains:', len(train), 'movies')
print('Prediciting on:', len(test), 'movies')

training set contains: 2000 movies
Prediciting on: 9 movies


In [4]:
# Titles of the nominated films, in the same row order as `test`
candidates = test['Movie'].tolist()
print(candidates)

['Joker', 'Once Upon a Time in Hollywood', 'Parasite', 'The Irishman', '1917', 'Marriage Story', 'Ford v Ferrari', 'Jojo Rabbit', 'Little Women']


In [5]:
# Column names a simulated voter may draw from: runtime, genre columns
# (presumably 0/1 indicators - confirm against the dataset), budget and gross
# figures, critic/audience ratings, and winner/nominee columns for other awards.
# 'Oscar_winner' is deliberately absent: it is the training target.
features = [ 'Runtime (min)', 'Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 
             'Drama', 'Family','Fantasy', 'History', 'Horror', 'Musical', 'Mystery', 'Romance', 
             'Sci-Fi', 'Sport','Thriller', 'War', 'Western','Budget','Domestic (US) gross',
             'International gross','Worldwide gross','Metascore', 'IMDb_rating', 'IMDb_votes', 'RT_rating', 'RT_review',
             'GG_drama_winner', 'GG_drama_nominee', 'GG_comedy_winner', 'GG_comedy_nominee',
             'BAFTA_winner', 'BAFTA_nominee', 'DGA_winner', 'DGA_nominee',
             'PGA_winner', 'PGA_nominee', 'CCMA_winner', 'CCMA_nominee',
             'Golden_Palm_winner', 'Golden_Palm_nominee', 'Golden_Bear_winner', 'Golden_Bear_nominee',
             'Golden_Lion_winner', 'Golden_Lion_nominee', 'PCA_winner', 'PCA_nominee',
             'NYFCC_winner', 'NYFCC_nominee', 'OFCS_winner', 'OFCS_nominee']  

In [6]:
# Prototype "voter": a shallow decision tree that picks its split thresholds at random.
# This single estimator object is passed to the simulation helpers, which re-fit it
# for every simulated ballot.
voter1 = DecisionTreeClassifier(splitter='random',
                                max_depth=3,  # Low depth allows for some randomness
                                min_samples_leaf=3,
                                random_state = 92)

In [7]:
def simulate_a_vote(model, train_df, to_predict_df, features):
    """
    Simulate a single Academy voter and return that voter's ranked ballot.

    The supplied model is re-fit on every row of train_df, but only on a
    randomly drawn set of feature columns plus a freshly generated random
    'Noise' column, so successive voters disagree with one another. The
    second column of predict_proba for each row of to_predict_df is then
    converted into a ranking for Preferential Balloting.

    Parameters:
        model: estimator exposing fit(x, y) and predict_proba(x); re-fit in place
        train_df: DataFrame holding the feature columns and 'Oscar_winner'
        to_predict_df: DataFrame holding the feature columns of the films to rank
        features: sequence of column names the voter may draw from

    Returns:
        Integer array with one entry per row of to_predict_df; 1 marks the film
        with the highest predicted probability, len(to_predict_df) the lowest.
    """
    # Work on copies so the caller's frames never gain the 'Noise' column
    voter_train = train_df.copy()
    voter_test = to_predict_df.copy()

    # The voter's personal bias: an independent uniform random column per frame
    voter_train['Noise'] = np.random.rand(train_df.shape[0])
    voter_test['Noise'] = np.random.rand(to_predict_df.shape[0])

    # How many columns this voter pays attention to (may be 0), then which ones.
    # np.random.choice samples with replacement, so a column can repeat.
    attention_span = np.random.choice(int(len(features) / 1.7))
    voter_columns = [*np.random.choice(features, attention_span), 'Noise']

    model.fit(voter_train[voter_columns].to_numpy(),
              voter_train['Oscar_winner'].to_numpy())

    # Second predict_proba column, nudged by a tiny random amount to break ties
    win_proba = model.predict_proba(voter_test[voter_columns].to_numpy())[:, 1]
    ballot = win_proba + np.random.rand(len(win_proba)) / 10000

    # Walk the films from lowest to highest score, handing out ranks n, n-1, ..., 1
    n_films = len(ballot)
    ascending_order = ballot.argsort()
    ranks = np.empty_like(ascending_order)
    ranks[ascending_order] = np.arange(n_films, 0, -1)
    return ranks

In [8]:
def simulate_voting_body(num_voters, model, train_df, to_predict_df, features):
    """
    Collect one ranked ballot per simulated voter.

    Calls simulate_a_vote num_voters times with the same model and data.
    Returns a float array of shape (num_voters, rows of to_predict_df) in
    which each row is one voter's ballot.
    """
    ballots = np.zeros((num_voters, to_predict_df.shape[0]))
    # Iterating a 2-D array yields row views, so each assignment fills one ballot
    for ballot_row in ballots:
        ballot_row[:] = simulate_a_vote(model, train_df, to_predict_df, features)
    return ballots

In [9]:
# Small demonstration: each printed row is one voter's ballot and each column is a
# nominee (same order as the rows of `test`); 1 marks that voter's top pick.
n = 3
print(f'Here is an example of a {n}-person Academy:')
print(simulate_voting_body(n, voter1, train, test, features))

Here is an example of a 3-person Academy:
[[6. 4. 1. 2. 5. 8. 9. 3. 7.]
 [6. 1. 3. 4. 2. 7. 9. 5. 8.]
 [2. 5. 4. 3. 1. 8. 7. 6. 9.]]


In [10]:
def tally_votes(voting_body, list_of_nominees):
    """
    Count how many ballots rank each nominee first.

    voting_body is a (voters x nominees) array of ranks in which 1 is a
    voter's top pick; list_of_nominees names the columns in the same order.
    Returns a one-column DataFrame ('Votes') indexed by nominee and sorted
    from most to fewest first-place votes.
    """
    first_place_counts = (voting_body == 1).astype(int).sum(axis=0)
    standings = pd.DataFrame({'Votes': first_place_counts}, index=list_of_nominees)
    return standings.sort_values('Votes', ascending=False)

In [11]:
# Simulate a 1,000-voter academy and count each nominee's first-place votes
n = 1000
this_academy = simulate_voting_body(n, voter1, train, test, features)
print(f"Overall, this {n}-person academy's top picks look like this:")
plot_df1 = tally_votes(this_academy, candidates)
# End the cell on the tally so the standings announced by the message are displayed
plot_df1

Overall, this 1000-person academy's top picks look like this:


In [12]:
def remove_least(voting_body, list_of_nominees):
    """
    Elimination step of Preferential Balloting: drop the weakest nominee.

    Finds the column of voting_body with the fewest rank-1 entries and removes
    that column together with the nominee at the same position.

    Parameters:
        voting_body: 2-D array of ranks (one row per ballot, one column per
            nominee) in which 1 marks a ballot's top pick
        list_of_nominees: nominee names, in the same order as the columns

    Returns:
        (remaining_ballots, remaining_nominees). Both are new objects; neither
        the input array nor the input list is modified. The remaining ballots
        keep their old rank values, so a ballot may no longer contain a 1.
    """
    # A ballot's top pick is the column holding rank 1
    first_place_counts = np.sum(voting_body == 1, axis=0)
    # np.argmin returns the first minimum, so ties go against the earliest-listed nominee
    least_votes_index = int(np.argmin(first_place_counts))

    remaining_ballots = np.delete(voting_body, least_votes_index, axis=1)

    # Copy before deleting so the caller's list is left intact, and delete by
    # position so the right entry goes even when two nominees share a title
    remaining_nominees = list(list_of_nominees)
    del remaining_nominees[least_votes_index]
    return remaining_ballots, remaining_nominees

In [13]:
def re_rank_ballots(voting_body):
    """
    Renumber every ballot so its ranks run from 1 to the number of columns.

    Used after an elimination: the relative order of the ranks on each row of
    voting_body (a 2-D numpy array) is kept, but gaps left by a removed film
    are closed. Returns a new float array of the same shape.
    """
    compact_ranks = np.arange(1, voting_body.shape[1] + 1)
    re_ranked = np.zeros(voting_body.shape)
    for ballot_idx, ballot in enumerate(voting_body):
        # The film with the smallest old rank receives 1, the next one 2, ...
        re_ranked[ballot_idx, ballot.argsort()] = compact_ranks
    return re_ranked

In [14]:
def run_one_round_of_eliminations(voting_body, list_of_nominees):
    """
    Perform a single elimination round of Preferential Balloting.

    The film with the fewest #1 votes is dropped via remove_least, and the
    surviving ballots are renumbered via re_rank_ballots.
    Returns the updated voting body and the list of nominees from remove_least.
    """
    trimmed_body, remaining_nominees = remove_least(voting_body, list_of_nominees)
    return re_rank_ballots(trimmed_body), remaining_nominees

In [15]:
# Dry run with 1,000 participants for just one round of eliminations.
# A fresh list(test.Movie) is passed so the nominee list handed to the
# elimination helpers is not the shared `candidates` list.
new_votes, new_noms = run_one_round_of_eliminations(this_academy, list(test.Movie))

print(len(new_noms), 'films remaining')
print('\nNew Standings:')
# Last expression of the cell: displays the first-place tally after the elimination
tally_votes(new_votes, new_noms)

8 films remaining

New Standings:


Unnamed: 0,Votes
1917,474
Once Upon a Time in Hollywood,229
Parasite,84
The Irishman,70
Joker,66
Jojo Rabbit,38
Marriage Story,20
Ford v Ferrari,19


In [16]:
def run_preferential_voting(voting_body,list_of_nominees, show_steps = False):
    """
    Run Preferential Balloting on a voting body until one film has a majority.

    While no film holds strictly more than 50% of the first-place votes, the
    film with the fewest first-place votes is eliminated and the ballots are
    re-ranked (run_one_round_of_eliminations).

    Parameters:
        voting_body: 2-D array of ranks (one row per ballot, one column per
            nominee) in which 1 marks a ballot's top pick
        list_of_nominees: nominee names, in the same order as the columns;
            the caller's list is never modified
        show_steps: when True, print the first-place tally after every round

    Returns:
        (voting_body, list_of_nominees) for the films still in contention
        when the process stops.
    """
    # Work on a copy so eliminations never alter the caller's nominee list
    list_of_nominees = list(list_of_nominees)

    # Select the 'Votes' column by label; positional [0] on a label-indexed
    # Series is deprecated in recent pandas versions
    standings = tally_votes(voting_body, list_of_nominees)
    top_pick_percent = standings['Votes'].max() / standings['Votes'].sum()

    # A winner needs strictly more than half of the top picks, so an exact 50%
    # share triggers another round. Never eliminate the last remaining film.
    while top_pick_percent <= 0.5 and len(list_of_nominees) > 1:
        voting_body, list_of_nominees = run_one_round_of_eliminations(voting_body, list_of_nominees)
        standings = tally_votes(voting_body, list_of_nominees)
        top_pick_percent = standings['Votes'].max() / standings['Votes'].sum()

        if show_steps:
            print(standings, '\n')

    return voting_body, list_of_nominees

In [17]:
print('training set contains:', train.shape[0], 'Movies')
print('Prediciting on:', test.shape[0], 'Movies')

# Seed NumPy's global generator, which drives each voter's noise column, feature
# draw and tie-breaking, so that this headline simulation gives the same result
# on every run of the cell
np.random.seed(92)

# Pick the model we want for each random voter
voter_model = DecisionTreeClassifier(splitter='random',
                                     max_depth=3,
                                     min_samples_leaf=3,
                                     random_state = 92)

num_voters_academy = 10000
print(f'\nSimulating an Academy with {num_voters_academy} random voters.....')
academy_sim = simulate_voting_body(num_voters=num_voters_academy, model = voter_model, train_df = train, to_predict_df = test, features=features)

print('\nInitial Rankings:\n----------------------------------------')
print(tally_votes(academy_sim, list(test.Movie)),'\n')

print("Now we start eliminating films untill there one has more than 50% of the top picks:\n------------------------------------------------------")
# show_steps=True prints the first-place tally after every elimination round
final_ballot, final_films = run_preferential_voting(academy_sim, list(test.Movie), True)

training set contains: 2000 Movies
Prediciting on: 9 Movies

Simulating an Academy with 10000 random voters.....

Initial Rankings:
----------------------------------------
                               Votes
1917                            4484
Once Upon a Time in Hollywood   2440
Parasite                         861
The Irishman                     819
Joker                            494
Jojo Rabbit                      460
Marriage Story                   229
Ford v Ferrari                   111
Little Women                     102 

Now we start eliminating films untill there one has more than 50% of the top picks:
------------------------------------------------------
                               Votes
1917                            4493
Once Upon a Time in Hollywood   2453
Parasite                         877
The Irishman                     835
Joker                            506
Jojo Rabbit                      467
Marriage Story                   244
Ford v Ferrari      

In [18]:
# First-place tally among the films still in contention when voting stopped
tally_votes(final_ballot, final_films)

Unnamed: 0,Votes
1917,5306
Once Upon a Time in Hollywood,3001
Parasite,1693


In [19]:
# The tally is sorted by votes, so its first index label is the winning title;
# anything from the first '(' onward is cut off and surrounding whitespace stripped
winner = tally_votes(final_ballot, final_films).index[0].split('(')[0].strip()
print(f'And the Oscar goes to...\n🎉🏆 {winner} 🏆🎉')

And the Oscar goes to...
🎉🏆 1917 🏆🎉
