In [1]:
import pandas as pd
import numpy as np
import os

#### Formats the data set

In [2]:
# Load the raw review scores and reshape them to long form:
# one row per (movie, score variable).
df_reviews = pd.read_csv(os.path.join(os.getcwd(), '..', 'data_raw', 'reviews.csv'))

# The box-office columns may carry thousands separators (e.g. "1,234,567").
# Cast to str *before* touching the .str accessor so a column that pandas
# happened to parse as numeric does not raise, and strip the commas as a
# literal (regex=False) rather than as a pattern.
for gross_column in ['bo_domestic', 'bo_foreign']:
    df_reviews[gross_column] = (df_reviews[gross_column]
                                .astype(str)
                                .str.replace(',', '', regex=False))

# saves off the gross for later
df_gross = df_reviews.loc[:, ['movie', 'bo_domestic', 'bo_foreign']]
df_reviews = df_reviews.drop(['bo_domestic', 'bo_foreign'], axis=1)

# The IMDb columns are scaled by 10 (presumably to put 0-10 ratings on the
# same 0-100 scale as the Rotten Tomatoes columns -- confirm against the data).
imdb_columns = ['imdb_18', 'imdb_18_29', 'imdb_30_44', 'imdb_45']
rt_columns = ['rt_all_critics', 'rt_top_critics', 'rt_audience_score']
df_reviews[imdb_columns] = df_reviews[imdb_columns] * 10

df_reviews = pd.melt(df_reviews, id_vars='movie')
max_categories = [imdb_columns, rt_columns]

# Flag, per movie and per category, the row(s) holding the highest score.
# Ties are all flagged. A grouped transform replaces the former rescan of the
# whole frame for every movie, and ignores missing scores instead of letting
# NaN poison Python's max().
df_reviews['max'] = 0
for category in max_categories:
    in_category = df_reviews['variable'].isin(category)
    if not in_category.any():
        # none of this category's variables are present in the data
        continue
    scores = pd.to_numeric(df_reviews.loc[in_category, 'value'])
    best_per_movie = scores.groupby(df_reviews.loc[in_category, 'movie']).transform('max')
    is_best = (scores == best_per_movie).values
    df_reviews.loc[scores.index[is_best], 'max'] = 1

#### This does some trig to get the x and y for the radar chart

In [3]:
# Builds the radar-chart geometry for the review scores.
#
# Every score becomes a wedge made of three points:
#   index_radar 0 -> the score plotted on its own spoke
#   index_radar 1 -> the same score plotted on the following spoke
#   index_radar 2 -> the chart origin (0, 0)
radius = 100
n_spokes = 7

# Spoke position (0 .. n_spokes - 1) of each review variable.
spoke_by_variable = {
    'rt_top_critics': 0,
    'rt_audience_score': 1,
    'imdb_45': 2,
    'imdb_30_44': 3,
    'imdb_18_29': 4,
    'imdb_18': 5,
    'rt_all_critics': 6,
}
angle_step = 2 * np.pi / n_spokes  # radians between neighbouring spokes

# Float defaults: the coordinates written below are floats, and writing them
# into an int column relies on an implicit upcast that newer pandas warns on.
df_reviews['x_radar'] = 0.0
df_reviews['y_radar'] = 0.0
df_reviews['index_radar'] = 0

# Rows whose variable has no spoke are left at the origin and get no wedge.
spoke = df_reviews['variable'].map(spoke_by_variable)
on_radar = spoke.notnull()

angle = spoke[on_radar] * angle_step
# The spoke after the last one wraps back to spoke 0. The previous
# `spoke - 1 == n_spokes` test could never be true and only produced the right
# picture because spoke 7 lands on the same angle as spoke 0.
angle_next = ((spoke[on_radar] + 1) % n_spokes) * angle_step
score = pd.to_numeric(df_reviews.loc[on_radar, 'value'])

df_reviews.loc[on_radar, 'y_radar'] = np.cos(angle) * score
df_reviews.loc[on_radar, 'x_radar'] = np.sin(angle) * score

# The two extra points that close each wedge.
wedge_base = df_reviews.loc[on_radar, ['movie', 'variable', 'value', 'max']]
next_spoke_rows = wedge_base.assign(x_radar=np.sin(angle_next) * score,
                                    y_radar=np.cos(angle_next) * score,
                                    index_radar=1)
origin_rows = wedge_base.assign(x_radar=0.0, y_radar=0.0, index_radar=2)

# DataFrame.append was removed in pandas 2.0; concat is the supported spelling.
df_reviews = pd.concat([df_reviews, next_spoke_rows, origin_rows], sort=False)

# creates a second index value: a running 1..N counter of the points of each movie
df_reviews = df_reviews.sort_values(['movie', 'variable', 'index_radar']).reset_index(drop=True)
df_reviews['index_radar_2'] = 1
df_reviews['index_radar_2'] = df_reviews.groupby(['movie'])['index_radar_2'].cumsum()
df_reviews.to_csv(os.getcwd() + '/../data_raw/df_reviews_2.csv', index=False)

#### Combines the box office data

In [4]:
# Stacks the per-country box-office files of every movie into one frame and
# adds the domestic (United States) gross saved earlier in df_gross.
list_file = ['black_panther', 'blackkklansman', 'bohemian_rhapsody', 'the_favourite',
             'green_book', 'a_star_is_born', 'vice', 'roma']
wanted_columns = ['country', 'total_gross', 'movie']
raw_columns = ['country', 'dist', 'release_date', 'opening_weekend',
               'percnt_total', 'total_gross', 'as_of']

# Collect one frame per movie and concatenate once at the end instead of
# growing df_box_office inside the loop (DataFrame.append was removed in
# pandas 2.0 and re-copies the accumulated frame on every call).
per_movie_frames = []
for file in list_file:
    # reads in each file
    df_temp = pd.read_csv(os.path.join(os.getcwd(), '..', 'data_raw', file + '.csv'),
                          names=raw_columns)

    # strips the formatting from the gross; regex=False because '$' is the
    # end-of-string anchor when the pattern is interpreted as a regex
    df_temp['total_gross'] = (df_temp['total_gross']
                              .str.replace(',', '', regex=False)
                              .str.replace('$', '', regex=False))

    # gets and appends the domestic gross; df_gross is looked up with the
    # file name's underscores turned into spaces
    movie_title = file.replace('_', ' ')
    domestic_matches = df_gross.loc[df_gross['movie'] == movie_title, 'bo_domestic'].values
    if len(domestic_matches) == 0:
        raise IndexError("no 'bo_domestic' value found in df_gross for movie %r" % movie_title)
    domestic_gross = domestic_matches[0]
    df_new_row = pd.DataFrame([{'country': 'United States', 'dist': '', 'release_date': '',
                                'opening_weekend': '', 'percnt_total': '', 'as_of': '',
                                'total_gross': domestic_gross}],
                              columns=raw_columns)
    df_temp = pd.concat([df_temp, df_new_row], sort=False)

    # adds the film's name
    df_temp['movie'] = file

    per_movie_frames.append(df_temp)

# appends all results
df_box_office = pd.concat(per_movie_frames, sort=False)

# drops rows without a country *before* the int conversion, then formats the
# gross as an int
df_box_office = df_box_office.loc[~df_box_office['country'].isnull(), :].reset_index(drop=True)
df_box_office['total_gross'] = df_box_office['total_gross'].map(int)

df_box_office = df_box_office.loc[:, wanted_columns]


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


#### Creates the x and y values for the radar chart

In [5]:
# Builds the radar-chart geometry for the (log) box-office gross per country.
#
# Every (movie, country) value becomes a wedge made of three points:
#   index_radar 0 -> the value plotted on the country's own spoke
#   index_radar 1 -> the same value plotted on the following spoke
#   index_radar 2 -> the chart origin (0, 0)
countries = ['France', 'South Korea', 'Japan', 'Australia', 'United Kingdom', 'United States', 'China']
df_box_office = df_box_office.loc[df_box_office['country'].isin(countries), ['movie', 'country', 'total_gross']].reset_index(drop=True)
# NOTE(review): a total_gross of 0 would give -inf here -- confirm the inputs never contain it.
df_box_office['total_gross_log'] = np.log(df_box_office['total_gross'])

# adds any missing countries with a log gross of 0 (total_gross stays missing)
present_pairs = set(zip(df_box_office['movie'], df_box_office['country']))
new_rows = [[movie, country, 0]
            for country in countries
            for movie in df_box_office['movie'].unique()
            if (movie, country) not in present_pairs]
if new_rows:
    # DataFrame.append was removed in pandas 2.0; concat is the supported spelling.
    df_box_office = pd.concat(
        [df_box_office, pd.DataFrame(new_rows, columns=['movie', 'country', 'total_gross_log'])],
        sort=False)
# Pin the column order so the exported CSV does not depend on how a given
# pandas version orders the columns of a non-aligned concatenation.
df_box_office = df_box_office.loc[:, ['country', 'movie', 'total_gross', 'total_gross_log']]
df_box_office = df_box_office.sort_values('movie').reset_index(drop=True)

n_spokes = 7
# Spoke position (0 .. n_spokes - 1) of each country, in the order of `countries`.
spoke_by_country = dict(zip(countries, [5, 4, 3, 2, 6, 0, 1]))
angle_step = 2 * np.pi / n_spokes  # radians between neighbouring spokes

# Float defaults: the coordinates written below are floats, and writing them
# into an int column relies on an implicit upcast that newer pandas warns on.
df_box_office['x_radar'] = 0.0
df_box_office['y_radar'] = 0.0
df_box_office['index_radar'] = 0

spoke = df_box_office['country'].map(spoke_by_country)
on_radar = spoke.notnull()

angle = spoke[on_radar] * angle_step
# The spoke after the last one wraps back to spoke 0. The previous
# `spoke == n_spokes` test could never be true and only produced the right
# picture because spoke 7 lands on the same angle as spoke 0.
angle_next = ((spoke[on_radar] + 1) % n_spokes) * angle_step
log_gross = df_box_office.loc[on_radar, 'total_gross_log']

df_box_office.loc[on_radar, 'y_radar'] = np.cos(angle) * log_gross
df_box_office.loc[on_radar, 'x_radar'] = np.sin(angle) * log_gross

# The two extra points that close each wedge.
wedge_base = df_box_office.loc[on_radar, ['country', 'movie', 'total_gross', 'total_gross_log']]
next_spoke_rows = wedge_base.assign(x_radar=np.sin(angle_next) * log_gross,
                                    y_radar=np.cos(angle_next) * log_gross,
                                    index_radar=1)
origin_rows = wedge_base.assign(x_radar=0.0, y_radar=0.0, index_radar=2)

df_box_office = pd.concat([df_box_office, next_spoke_rows, origin_rows], sort=False)

# creates a second index value: a running 1..N counter of the points of each movie
df_box_office = df_box_office.sort_values(['movie', 'country', 'index_radar']).reset_index(drop=True)
df_box_office['index_radar_2'] = 1
df_box_office['index_radar_2'] = df_box_office.groupby(['movie'])['index_radar_2'].cumsum()

df_box_office.to_csv(os.getcwd() + '/../data_raw/df_box_office_2.csv', index=False)

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,country,movie,total_gross,total_gross_log,x_radar,y_radar,index_radar,index_radar_2
0,Australia,a_star_is_born,24815287.0,17.02697,16.600069,-3.788857,0,1
1,Australia,a_star_is_born,24815287.0,17.02697,7.387726,-15.34077,1,2
2,Australia,a_star_is_born,24815287.0,17.02697,0.0,0.0,2,3
3,China,a_star_is_born,,0.0,0.0,0.0,0,4
4,China,a_star_is_born,,0.0,0.0,-0.0,1,5
