In [119]:
import pandas as pd
import numpy as np
import re

def paranthesis_remover(entry):
    """Return ``str(entry)`` with every parenthesised note removed.

    Each shortest run of the form ``(...)`` containing at least one character
    is deleted, e.g. ``'Jane Doe (screenplay)'`` becomes ``'Jane Doe '``.
    Whitespace surrounding the removed note is left as is.

    Args:
        entry: Any value; it is converted with ``str()`` before matching, so
            non-string inputs (including NaN) come back as their string form.

    Returns:
        The string form of ``entry`` without the parenthesised parts.
    """
    # Raw string: in a normal literal the backslash-parenthesis pairs are
    # invalid escape sequences, which newer Python versions warn about.
    return re.sub(r'\(.+?\)', '', str(entry))

def _one_hot_column(column, prefix, strip_parentheses=False):
    """One-hot encode a column of comma-separated labels.

    Args:
        column: Series whose values are strings such as ``'A, B, C'``;
            missing values produce an all-zero row.
        prefix: Text prepended to every generated column name.
        strip_parentheses: When True, parenthesised notes such as
            ``'(screenplay)'`` are removed from the values first.

    Returns:
        DataFrame of 0/1 indicator columns sharing ``column``'s index.
    """
    # Cast to object so the .str accessor is usable even when every value is
    # missing (an all-NaN column is otherwise loaded as float64).
    text = column.astype(object)
    if strip_parentheses:
        # Must happen before splitting: a note can itself contain commas,
        # e.g. '(story, screenplay)'. The vectorised replace also keeps
        # missing values missing instead of turning them into the text 'nan'.
        text = text.str.replace(r'\(.+?\)', '', regex=True)
    # Remove whitespace around the separators so 'A, B' and 'B, A' map to the
    # same two labels instead of 'A', ' B', 'B' and ' A'.
    text = text.str.replace(r'\s*,\s*', ',', regex=True).str.strip()
    return text.str.get_dummies(sep=',').add_prefix(prefix)


def one_hot_encoder(file_name):
    """Load a movie CSV and one-hot encode its multi-valued text columns.

    Rows are kept only when ``Response`` is True, ``Type`` is ``'movie'`` and
    ``Actors`` is present. The ``Language``, ``Actors``, ``Director``,
    ``Genre`` and ``Writer`` columns are expanded into indicator columns
    prefixed ``language_``, ``actor_``, ``director_``, ``genre_`` and
    ``writer_``; the source columns and a few unused text columns are dropped.

    Args:
        file_name: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        DataFrame with the remaining original columns followed by the
        indicator columns.
    """
    raw = pd.read_csv(file_name)

    # Both conditions must hold at once: successful API call AND a movie.
    keep = raw['Response'].eq(True) & raw['Type'].eq('movie')

    # Remove movies without actors
    movies = raw.loc[keep].dropna(subset=['Actors'])

    # Writers and directors carry notes like '(assistant director)' that must
    # not become part of the label.
    encoded = [
        _one_hot_column(movies['Language'], 'language_'),
        _one_hot_column(movies['Actors'], 'actor_'),
        _one_hot_column(movies['Director'], 'director_', strip_parentheses=True),
        _one_hot_column(movies['Genre'], 'genre_'),
        _one_hot_column(movies['Writer'], 'writer_', strip_parentheses=True),
    ]

    # drop() returns a new frame, so the result has to be kept. Columns such
    # as 'Error' or 'Unnamed: 0' are not always present, hence errors='ignore'.
    movies = movies.drop(
        columns=['Actors', 'Language', 'Writer', 'Genre', 'Director',
                 'Unnamed: 0', 'Awards', 'Plot', 'Error', 'Poster'],
        errors='ignore',
    )

    # All pieces share the filtered index, so a column-wise concat lines up.
    return pd.concat([movies, *encoded], axis=1)

In [None]:
# Run the encoder on the CSV at raw_data/imdb_budget.csv (path relative to the
# working directory) and keep whatever it returns in imdb_data.
imdb_data = one_hot_encoder('raw_data/imdb_budget.csv')