In [None]:
import seaborn as sns 
import pandas as pd
import matplotlib.pyplot as plt 
from tqdm import tqdm
import numpy as np 
import scipy.stats as sci
from collections import defaultdict
import matplotlib.ticker as ticker

pd.options.display.max_rows = 100
pd.options.display.max_columns = None
%matplotlib inline 

## Parsing Functions

## IMDb Revenue/Budget
- [23m Dataset](https://www.kaggle.com/juzershakir/tmdb-movies-dataset)
 - Used for Gathering Revenue, Profit, and Budget

In [None]:
def budget_numeric(df):
    '''Return a copy of ``df`` whose ``budget`` column is numeric.

    Values that cannot be parsed as a number (for example stray image paths
    containing ``.jpg``, missing values, or other junk) become NaN instead of
    raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``budget`` column.

    Returns
    -------
    pandas.DataFrame
        New frame holding every other column in its original order, followed
        by ``budget`` as float64. The input frame is left untouched.
    '''
    other_cols = [c for c in df.columns if c != 'budget']
    # Explicit copy so the assignment below never writes into a view of ``df``.
    new_df = df[other_cols].copy()
    # errors='coerce' turns every unparseable entry into NaN; to_numpy() avoids
    # index alignment surprises when the index contains duplicates.
    new_df['budget'] = pd.to_numeric(df['budget'], errors='coerce').astype(float).to_numpy()
    return new_df

def profits(df):
    '''Return a copy of ``df`` with a ``profit`` column added.

    ``profit`` is ``revenue - budget`` (the budget subtracted from the
    revenue), rounded to two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``revenue`` and ``budget`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame; the caller's frame is not modified.
    '''
    # assign() builds a new frame instead of writing through an alias of ``df``.
    return df.assign(profit=(df['revenue'] - df['budget']).round(2))

def dates_func(x):
    '''Return the text before the first ``-`` of a date string.

    For a date written year-first (e.g. ``'1995-10-30'``) this is the year,
    returned as a string. Any non-string value (such as NaN for a missing
    date) yields ``None``.
    '''
    # isinstance also accepts str subclasses such as numpy.str_.
    if not isinstance(x, str):
        return None
    return x.split('-')[0]
def dates(df):
    '''Swap the ``release_date`` column of ``df`` for a ``date`` column.

    Each release date is passed through ``dates_func``. The frame is modified
    in place (``release_date`` is removed, ``date`` is appended) and the same
    frame is returned.
    '''
    release_col = df.release_date
    df.drop(columns=['release_date'], inplace=True)
    df['date'] = release_col.map(dates_func)
    return df

def genres_one_hot(df, out_path='../UsedData/23mRevenueNEW.csv'):
    '''One-hot encode the ``genres`` column of ``df`` in place.

    Each ``genres`` cell is expected to be the string form of a list of
    dictionaries, e.g. ``"[{'id': 16, 'name': 'Animation'}, ...]``. Every
    ``'name': '...'`` entry found in a cell is treated as one genre of that
    row, regardless of how the dictionaries are separated.

    For every distinct genre name a 0/1 column of that name is added, and a
    ``genre`` column holding the first listed genre (``None`` when the cell
    lists no genre or is not a string) is appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``genres`` column. It is modified in place.
    out_path : str or None, optional
        Where the encoded frame is written as CSV (without the index).
        Pass ``None`` to skip writing. Defaults to the path the notebook
        has always used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    '''
    import re  # local import: keeps this cell self-contained

    name_pattern = re.compile(r"'name':\s*'([^']*)'")

    def _genre_names(raw):
        # Non-string cells (e.g. NaN) carry no genre information.
        if not isinstance(raw, str):
            return []
        return [name.strip() for name in name_pattern.findall(raw) if name.strip()]

    parsed = [_genre_names(raw) for raw in df['genres']]

    # dict.fromkeys keeps first-seen order while removing duplicates.
    all_genres = list(dict.fromkeys(name for names in parsed for name in names))

    # One whole-column assignment per genre instead of one write per cell.
    for g in all_genres:
        df[g] = [1 if g in names else 0 for names in parsed]

    df['genre'] = [names[0] if names else None for names in parsed]

    if out_path is not None:
        df.to_csv(out_path, index=False)
    return df

        

# Load the movie metadata CSV and keep only the columns used downstream.
ratings_25m = pd.read_csv('../../data/m_23/movies_metadata.csv')
col = ['imdb_id', 'title', 'release_date', 'budget', 'revenue', 'genres', 'adult', 'overview']
revenue_df = ratings_25m[col]

# budget -> numeric, add profit, release_date -> date, then one-hot encode genres.
revenue_df = genres_one_hot(dates(profits(budget_numeric(revenue_df))))

# genres_one_hot returns the frame it encoded, so a second pass would only
# recompute the same columns and rewrite the same CSV; alias the result instead.
revenue_one_hot = revenue_df

In [None]:
# Preview the first three rows of the encoded revenue frame.
revenue_one_hot.head(3)

### MovieLens Parsing
- [MovieLens](https://grouplens.org/datasets/movielens/)
- Used to gather user interactions 

In [None]:
def dates(x):
    '''Extract a trailing four-digit year such as ``(1995)`` from a title.

    Returns the four digits immediately before a closing ``)`` at the end of
    the stripped title, as a string. Returns ``None`` when ``x`` is not a
    string, is too short, does not end in ``)``, or the four characters before
    the ``)`` are not all digits.
    '''
    if not isinstance(x, str):
        return None
    text = x.strip()
    # Length guard prevents IndexError on empty or very short titles.
    if len(text) < 5 or text[-1] != ')':
        return None
    year = text[-5:-1]
    if not all(ch in '0123456789' for ch in year):
        return None
    return year

def title(x):
    '''Return the title with a trailing ``(dddd)``-style year removed.

    The input is stripped first. When the stripped text ends in a digit
    followed by ``)``, the last six characters (the parenthesised year) are
    dropped and the remainder is stripped again; otherwise the stripped text
    is returned unchanged.
    '''
    x = x.strip()
    # Length guard prevents IndexError on empty or one-character titles.
    if len(x) < 2 or x[-1] != ')' or x[-2] not in '0123456789':
        return x
    # Six characters: '(', four digits, ')'. Dropping only five left the '('.
    return x[:-6].strip()

def sort_genres(df, out_path='../UsedData/MovieLensMoviesCleaned.csv'):
    '''One-hot encode the ``|``-separated ``genres`` column of ``df`` in place.

    A 0/1 column is added for every distinct genre token, in the order the
    tokens first appear. Rows whose ``genres`` value is not a string get 0 in
    every genre column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``genres`` column. It is modified in place.
    out_path : str or None, optional
        Where the encoded frame is written as CSV (without the index).
        Pass ``None`` to skip writing. Defaults to the path the notebook
        has always used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the genre columns added.
    '''
    tokens_per_row = [
        genres.split('|') if isinstance(genres, str) else []
        for genres in df['genres']
    ]

    # dict.fromkeys keeps first-seen order while removing duplicates.
    ordered_genres = list(dict.fromkeys(tok for toks in tokens_per_row for tok in toks))

    # One whole-column assignment per genre instead of one write per cell.
    for g in ordered_genres:
        df[g] = [1 if g in toks else 0 for toks in tokens_per_row]

    if out_path is not None:
        df.to_csv(out_path, index=False)
    return df

movie_lens = pd.read_csv('../../data/MovieLens/movies.csv')
title_s = movie_lens['title']

# Year parsed from the end of each title; missing when no year is found.
release_years = title_s.map(dates)

# Only remove the last six characters when a year was actually found;
# trimming unconditionally truncated titles that carry no year.
stripped_titles = title_s.str.strip()
clean_titles = stripped_titles.where(release_years.isna(),
                                     stripped_titles.str[:-6].str.strip())

movie_lens = movie_lens.assign(title=clean_titles, date=release_years)
movie_lens = movie_lens[['movieId', 'title', 'date', 'genres']].sort_values(by = ['date'], ascending = False)
movie_lens_sorted = sort_genres(movie_lens)
print(movie_lens.shape)
print(movie_lens.columns)

In [None]:
# Preview the first row of the cleaned MovieLens frame.
movie_lens.head(1)

## Graphing Functions

In [None]:
#creates boxplot
def box_plot(df, year, column):
    '''Draw and save a per-genre box plot of ``column``.

    Only rows with positive revenue, positive budget and ``date >= year`` are
    used. A movie that belongs to several of the plotted genres contributes to
    each of their boxes.

    Parameters
    ----------
    df : pandas.DataFrame
        Movie frame with ``title``, ``date``, ``budget``, ``revenue``,
        ``profit`` and one 0/1 indicator column per genre.
    year : int
        First year to include.
    column : str
        Column plotted on the y axis (the notebook passes ``'profit'``).

    The figure is written to ``../PlotImages/BoxPlot_genres_{year}.png`` and
    shown.
    '''
    sns.set(font_scale = 2, style = 'whitegrid', rc = {'grid.linewidth': .5})
    df = df[(df.revenue > 0) & (df.budget > 0) & (df.date >= year)]
    genre_list = 'Adventure Fantasy Animation Horror Action Crime Comedy Thriller War Romance'.split()

    keep = ['title', 'date', 'budget', 'revenue', 'profit']
    if column not in keep:
        keep.append(column)

    # Build all per-genre slices first and concatenate once;
    # DataFrame.append no longer exists in pandas 2.
    frames = [df.loc[df[g] == 1, keep].assign(genre = g) for g in genre_list]
    new_df = pd.concat(frames, ignore_index = True)

    grid = sns.catplot(x = 'genre', y = column,
                       data = new_df, kind = 'box', height = 10, aspect = 2, legend_out = True,
                       color = 'deepskyblue', width = .5)
    ylabel = 'Profit (hundred-millions)' if column == 'profit' else column
    grid.set(ylabel = ylabel, xlabel = 'Genres')
    plt.title(f'Box Plot For Popular Genres from {year}-2017')
    plt.savefig(f'../PlotImages/BoxPlot_genres_{year}.png')
    # plt.show takes no figure argument; passing the grid set ``block`` by accident.
    plt.show()


#   

# Load the merged ratings/revenue table and keep only movies that report
# both a positive revenue and a positive budget before plotting.
merged_df = pd.read_csv('../UsedData/Merged_ratings_movies_revenue.csv')
has_financials = (merged_df.revenue > 0) & (merged_df.budget > 0)
merged_df = merged_df[has_financials]
box_plot(merged_df, 2010, 'profit')

In [None]:
def scatter_sized(df, year, pos_neg = False):
    '''Scatter budget (x) against profit (y) per genre, sized by revenue.

    Only rows with ``date >= year``, positive budget and positive revenue are
    considered, and of those only movies with a loss (``profit < 0``) or a
    profit above 100,000,000.

    Parameters
    ----------
    df : pandas.DataFrame
        Movie frame with ``date``, ``budget``, ``revenue``, ``profit`` and a
        single-valued ``genre`` column.
    year : int
        First year to include.
    pos_neg : str or bool, optional
        ``'positive'`` plots only the high-profit movies, ``'negative'`` only
        the loss-making ones; any other value plots both.

    Each legend entry shows the genre, the share of that genre's loss/high-profit
    movies that fall in the plotted subset, and the Pearson correlation between
    budget and profit. Genres with fewer than two plotted movies are skipped.
    The figure is saved under ``../PlotImages/``.
    '''
    df = df[(df.date >= year) & (df.budget > 0) & (df.revenue > 0)]

    fig, ax = plt.subplots(figsize = (10,8))
    is_loss = df.profit < 0
    is_hit = df.profit > 100000000
    # Per-genre totals over both groups; denominator of the legend percentage.
    count = df[is_loss | is_hit].genre.value_counts()

    if pos_neg == 'positive':
        df = df[is_hit]
        pos_neg = 'POSITIVE'
        s_factor = 1000000
    elif pos_neg == 'negative':
        df = df[is_loss]
        pos_neg = 'NEGATIVE'
        s_factor = 100000
    else:
        df = df[is_loss | is_hit]
        pos_neg = 'COMBINED'
        s_factor = 1000000

    genre_order = ['Action', 'Drama', 'Comedy', 'Adventure', 'Horror', 'Thriller', 'Crime',
                   'Family', 'Animation', 'Romance', 'Mystery', 'Fantasy', 'War',
                   'Science Fiction']
    hexagon_genres = ('Action', 'Comedy', 'Adventure', 'Drama')

    genre_count = df.genre.value_counts()  # hoisted: identical for every genre
    for genre in genre_order:
        new_df = df[df.genre == genre]
        if len(new_df) < 2:
            # pearsonr needs at least two points.
            continue
        budget = new_df.budget / 1000000
        profit = new_df.profit / 1000000
        s = new_df.revenue / s_factor
        # ``sci`` is already scipy.stats; going through sci.stats used a
        # deprecated private module.
        pearson = round(sci.pearsonr(budget, profit)[0], 3)
        marker = 'H' if genre in hexagon_genres else 'o'
        perc = round((genre_count[genre] / count[genre] * 100), 2)
        # No cmap: without ``c`` matplotlib ignores it, and the values passed
        # before were not colormap names.
        ax.scatter(budget, profit, s = s, label = f'{genre} ({perc}%) [r = {pearson}]',
                   alpha = .5, marker = marker)

    lgnd = plt.legend(bbox_to_anchor = (1.05, 1.0), loc = 'upper left')
    # ``legend_handles`` is the current name; fall back for older matplotlib.
    handles = getattr(lgnd, 'legend_handles', None)
    if handles is None:
        handles = lgnd.legendHandles
    for handle in handles:
        handle.set_sizes([500])  # uniform marker size in the legend

    formatter = ticker.StrMethodFormatter('${x:,.0f}')
    ax.yaxis.set_major_formatter(formatter)
    ax.xaxis.set_major_formatter(formatter)
    for ytick, xtick in zip(ax.yaxis.get_major_ticks(), ax.xaxis.get_major_ticks()):
        ytick.label1.set_visible(True)
        ytick.label2.set_visible(False)
        ytick.label1.set_color('black')
        xtick.label1.set_visible(True)
        xtick.label2.set_visible(False)

    plt.title(f'({pos_neg.upper()}) | Comparing Profit(y), Budget(x), and Revenue(size) by Genres (color) [{year}-2019]',
             fontname = 'Arial')
    plt.xlabel('Movie Budget (millions)'); plt.ylabel('Movie Profit(millions)', fontname = 'Arial')
    plt.grid()
    plt.savefig(f'../PlotImages/profit_budget_revenue_{pos_neg}[{year}].png', bbox_inches = 'tight')

# 'combined' matches neither 'positive' nor 'negative', so both groups are plotted.
scatter_sized(merged_df, 2015, pos_neg = 'combined')

In [None]:
def budget_line_plot(df, year, column, genre_list):
    '''Plot the yearly mean budget of profitable movies, one line per genre.

    Rows are kept when ``date >= year`` and budget, revenue and profit are all
    positive. For each genre in ``genre_list`` the mean budget per ``date`` is
    drawn, and the legend reports the Pearson correlation between date and
    mean budget (``nan`` when fewer than two years are available).

    Parameters
    ----------
    df : pandas.DataFrame
        Movie frame with ``date``, ``budget``, ``revenue``, ``profit`` and one
        0/1 indicator column per genre.
    year : int
        First year to include.
    column : str
        Currently unused; kept so existing calls keep working.
    genre_list : list of str
        Genre indicator columns to plot.

    The figure is saved to ``GitImages/change_over_time_budget.png``.
    '''
    fig, ax = plt.subplots(figsize=(18,8))

    # Each comparison needs its own parentheses: ``&`` binds tighter than ``>``,
    # so ``... & (df.profit) > 0`` did not filter on profit as intended.
    df = df[(df.date >= year) & (df.budget > 0) & (df.revenue > 0) & (df.profit > 0)]

    for g in genre_list:
        # One mean-budget value per year, ordered by year.
        avg_budget = df.loc[df[g] == 1].groupby('date').budget.mean().sort_index()
        x = avg_budget.index.values
        y = avg_budget.values
        if len(avg_budget) >= 2:
            pearson = round(sci.pearsonr(x, y)[0], 3)
        else:
            pearson = float('nan')  # correlation is undefined for < 2 points
        ax.plot(x, y, label = f'{g} (r = {pearson})', marker = 'o')

    formatter = ticker.StrMethodFormatter('${x:,.0f}')
    ax.yaxis.set_major_formatter(formatter)
    for ytick, xtick in zip(ax.yaxis.get_major_ticks(),ax.xaxis.get_major_ticks()):
        ytick.label1.set_visible(True)
        ytick.label2.set_visible(False)
        ytick.label1.set_color('black')
        ytick.label1.set_fontsize(14)
        xtick.label1.set_fontsize(14)

    # Single legend; the earlier duplicate call was immediately replaced by this one.
    plt.legend(bbox_to_anchor = (1.05, 1.0), loc = 'upper left',
               prop = {'size': 20})
    plt.title(f'Change in Average Budget for Profitable Movies Since {year}', {'size':16})
    plt.xlabel('Year', {'size':20})
    plt.ylabel('Mean Budget', {'size':20})
    plt.savefig('GitImages/change_over_time_budget.png', bbox_inches = 'tight')
# Reload the merged table (unfiltered) and plot the four selected genres.
# NOTE(review): this path is relative to a different directory than the
# '../UsedData/...' path used for the box plot above — confirm the intended working directory.
merged_df = pd.read_csv('UsedData/Merged_ratings_movies_revenue.csv')
budget_line_plot(merged_df, 2010, 'profit', 'Adventure Action Comedy Horror'.split())

# Start of Michael's Notebook

### Importing and formatting

In [None]:
import pandas as pd 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
# Global matplotlib font settings, applied through matplotlib.rc below.
font = {'family' : 'DejaVu Sans',
        'weight' : 'bold',
        'size'   : 19}
matplotlib.rc('font', **font) #Formatting plots to be more legible


## Defining useful methods

In [None]:
def initialClean(dataset):   #Initial cleanup of missing/NaN Data
    '''Filter the merged movie frame and add a return-on-investment column.

    Rows are kept when revenue > 0, budget > 100, profit != 0, date >= 2010
    and the title is not "The Tiger: An Old Hunter's Tale". ``ROI`` is
    ``profit / budget``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``revenue``, ``budget``, ``title``, ``profit`` and ``date``.

    Returns
    -------
    pandas.DataFrame
        New frame sorted by ``ROI`` in descending order; ``dataset`` itself is
        not modified.
    '''
    keep = (
        (dataset['revenue'] > 0)
        & (dataset['budget'] > 100)
        & (dataset['title'] != "The Tiger: An Old Hunter's Tale")
        & (dataset['profit'] != 0)
        & (dataset['date'] >= 2010)
    )
    # Explicit copy so adding ROI never writes into a view of ``dataset``.
    subSet = dataset.loc[keep].copy()
    subSet['ROI'] = np.array(subSet.profit) / np.array(subSet.budget)
    return subSet.sort_values('ROI', ascending=False)
    

def getGenreSet(dataset, genre):    #subsetting rows by genres
    '''Return the rows of ``dataset`` whose ``genre`` column equals 1.'''
    in_genre = dataset[genre] == 1
    return dataset.loc[in_genre]
    
    
    
def getPosNegSubset(dataset,x=[1,-1]):       #Subsetting rows by either positive or negative ROI
    '''Select rows of ``dataset`` by the sign of their ``ROI``.

    ``x == 1`` returns the rows with ``ROI >= 0`` in their existing order.
    ``x == -1`` returns the rows with ``ROI < 0``, sorted by ``ROI`` descending.
    Any other value, including the default list, returns ``None``.
    '''
    if x == 1:
        non_negative = dataset.ROI >= 0
        return dataset.loc[non_negative]
    if x == -1:
        by_roi = dataset.sort_values('ROI', ascending=False)
        return by_roi.loc[dataset.ROI < 0]
    return None
        
        
def plotHists(dataframes, column, titles):      #Histogram Plotter
    '''Draw one histogram of ``column`` per frame on a two-column subplot grid.

    Histograms are drawn on the current figure. Even-indexed panels are dark
    green, odd-indexed panels dark red. Each panel covers the range from the
    column minimum to ``mean + std + 0.5``, uses roughly one bin per ten
    non-null values (at least one), and marks the mean and mean +/- one
    standard deviation with vertical lines.

    Parameters
    ----------
    dataframes : list of pandas.DataFrame
        Frames to plot, one panel each.
    column : str
        Column to histogram.
    titles : list of str
        Panel titles, parallel to ``dataframes``.
    '''
    # Integer ceiling of n/2; true division produced a float (and 2.5 for n=3).
    n_rows = (len(dataframes) + 1) // 2
    for i, dataframe in enumerate(dataframes):
        plt.subplot(n_rows, 2, i + 1)
        values = dataframe[column]
        mean = values.mean()
        std = values.std()
        color = 'darkred' if i % 2 == 1 else 'darkgreen'
        # count() is the number of non-null values (what describe()[0] returned);
        # clamp so small inputs never ask for zero bins.
        n_bins = max(1, int(values.count() / 10))
        n, bins, patches = plt.hist(values, n_bins, (values.min(), mean + std + .5), color=color)
        y = np.linspace(0, max(n), 50)
        plt.plot(np.full(50, mean), y, label="Mean")
        plt.plot(np.full(50, mean - std), y, ls='--', color='lightblue', label="Upper, Lower Stdev")
        plt.plot(np.full(50, mean + std), y, ls='--', color='lightblue')
        plt.title(titles[i])
        plt.xlabel('Return on Investment')
        plt.ylabel('Frequency (# of movies)')
        plt.legend(loc=1)
        # Thin out tick labels when there are many bins.
        freq = 3 if len(bins) > 60 else 1
        plt.xticks(np.arange(int(min(bins)), int(max(bins)) + 1, freq))
        
def plotScatter(dataset,name):     #Scatter plotter
    '''Scatter budget against revenue for one genre and summarise it.

    Movies with ``ROI >= 0`` are drawn in dark green and movies with
    ``ROI < 0`` in dark red, together with the break-even diagonal and a
    vertical dashed line at the largest budget among the loss-making movies.
    The figure is saved to ``mwangPlotImages/{name}Scatter.png``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``budget``, ``revenue`` and ``ROI`` columns.
    name : str
        Genre label used in the title and the file name.

    Returns
    -------
    tuple
        ``(budget_bound, success_rate)``: the largest budget among the
        loss-making movies (NaN when there are none) and the percentage of
        movies with ``ROI >= 0`` (NaN when there are no movies).
    '''
    figScatter = plt.figure(figsize=(20,20))
    profitable = dataset.loc[dataset.ROI>=0]
    unprofitable = dataset.loc[dataset.ROI<0]
    plt.scatter(profitable.budget,profitable.revenue,color='darkgreen',label='n='+str(profitable.shape[0]))
    plt.scatter(unprofitable.budget,unprofitable.revenue,color='darkred',label='n='+ str(unprofitable.shape[0]))

    # Both axes show dollars of budget/revenue, so size them from those columns;
    # the largest profit is smaller than the largest revenue and clipped points.
    plotlimit = max(dataset.budget.max(), dataset.revenue.max())
    plt.plot(np.linspace(0,plotlimit),np.linspace(0,plotlimit))
    plt.xlim(0,plotlimit)
    plt.ylim(0,plotlimit)

    budgetBound = unprofitable.budget.max()
    plt.plot(np.linspace(budgetBound,budgetBound,500),np.linspace(0,plotlimit,500), color='darkorange',ls='--',lw=2,label='Lower budget bound = $'+str(round(budgetBound/1e9,3)) +" Billion Dollars")
    plt.title('Budget vs Revenue in ' + name + ' Movies (2010-2020)')
    plt.xlabel('Budget in Billions ($)')
    plt.ylabel('Revenue in Billions ($)')
    plt.legend(loc=1)
    figScatter.savefig('mwangPlotImages/'+name+'Scatter.png')

    # Success rate is a ratio of row counts; shape[1] is the number of columns.
    n_pos = profitable.shape[0]
    n_neg = unprofitable.shape[0]
    total = n_pos + n_neg
    successRate = n_pos / total * 100 if total else float('nan')
    return budgetBound, successRate
    

Initial import of Sam's cleaned, merged dataset, followed by further cleaning and exploration of this main set.

In [None]:
# Load the merged table and apply the ROI clean-up defined above.
merged=initialClean(pd.read_csv("UsedData/Merged_ratings_movies_revenue.csv"))
# Show a short preview rather than rendering the whole frame.
merged.head()

In [None]:
# Column labels at positions 3..15 of the cleaned frame, as a NumPy array.
# NOTE(review): assumes those 13 columns are the genre indicators and that they
# appear in the same order as genre_df_list, which is later zipped against
# this array — confirm against the CSV header.
genre_list = np.array(merged.columns[3:16])

# Compiling subsets for plotting

In [None]:
# One filtered frame per genre, built in the order the later cells rely on.
plot_genre_names = ['Comedy', 'Romance', 'Drama', 'Children', 'Crime', 'Mystery', 'Action',
                    'Documentary', 'Fantasy', 'Thriller', 'Horror', 'Animation', 'Adventure']
genre_df_list = [getGenreSet(merged, genre_name) for genre_name in plot_genre_names]

# Individual names are still bound because later cells reference them directly.
(comedy_df, romance_df, drama_df, children_df, crime_df, mystery_df, action_df,
 documentary_df, fantasy_df, thriller_df, horror_df, animation_df, adventure_df) = genre_df_list

# Split the whole cleaned set by the sign of ROI.
posROI_df = getPosNegSubset(merged, x=1)
negROI_df = getPosNegSubset(merged, x=-1)

In [None]:
# ROI histograms for four genres, saved to disk.
genrelist = [action_df, drama_df, adventure_df, animation_df]
titles = [
    f'Frequency of ROI in {label} movies (N={len(frame.ROI)})'
    for label, frame in zip(['Action', 'Drama', 'Adventure', 'Animation'], genrelist)
]
figHist = plt.figure(figsize=(30, 20))
plotHists(genrelist, 'ROI', titles)
figHist.savefig('mwangPlotImages/genreHistogram.png')

Subplotting 4 genres that caught my attention by ROI frequency distributions

In [None]:
# Same four-genre ROI histograms as the cell above, drawn without saving.
genrelist = [action_df, drama_df, adventure_df, animation_df]
titles = [
    f'Frequency of ROI in {label} movies (N={len(frame.ROI)})'
    for label, frame in zip(['Action', 'Drama', 'Adventure', 'Animation'], genrelist)
]
figHist = plt.figure(figsize=(30, 20))
plotHists(genrelist, 'ROI', titles)

Subplotting positive and negative ROI frequency distributions

In [None]:
# ROI histograms for the profitable and non-profitable subsets, side by side.
titles = [
    f'Frequency Distribution of Profitable Movies (N={len(posROI_df.ROI)})',
    f'Frequency Distribution of Non-Profitable Movies (N={len(negROI_df.ROI)})',
]
figPosNegHist = plt.figure(figsize=(30, 10))
plotHists([posROI_df, negROI_df], 'ROI', titles)
figPosNegHist.savefig('mwangPlotImages/PosNegHist.png')

Peeking at distribution of data to see if we can see any trends

In [None]:
# Summary statistics of ROI for each sign group.
print(f'POSITIVE \n{posROI_df.ROI.describe()}\n')
print(f'NEGATIVE \n{negROI_df.ROI.describe()}')

# Creating a scatter of Revenue vs budget of Action movies in last 10 years

In [None]:
# Budget vs revenue for Action movies: non-negative ROI in green, negative ROI in red.
action_axis_max = 1.6e9
figScatter = plt.figure(figsize=(27, 20))

action_gain_df = action_df.loc[action_df.ROI >= 0]
plt.scatter(action_gain_df.budget, action_gain_df.revenue,
            color='darkgreen', label=f'n={action_gain_df.shape[0]}')

# actionsub_df ends up holding the negative-ROI rows, as in the original cell.
actionsub_df = action_df.loc[action_df.ROI < 0]
plt.scatter(actionsub_df.budget, actionsub_df.revenue,
            color='darkred', label=f'n={actionsub_df.shape[0]}')

# Diagonal where revenue equals budget.
diagonal = np.linspace(0, action_axis_max)
plt.plot(diagonal, diagonal)
plt.xlim(0, action_axis_max)
plt.ylim(0, action_axis_max)

# Vertical dashed line at the largest budget among negative-ROI movies.
loss_budget_max = actionsub_df.budget.max()
plt.plot(np.linspace(loss_budget_max, loss_budget_max, 500),
         np.linspace(0, action_axis_max, 500),
         color='darkorange', ls='--', lw=2,
         label='Lower budget bound = $' + str(round(loss_budget_max / 1e9, 3)) + " Billion Dollars")

plt.title('Budget vs Revenue in Action Movies (2010-2020)')
plt.xlabel('Budget in Billions ($)')
plt.ylabel('Revenue in Billions ($)')
plt.legend(loc=1)
figScatter.savefig('mwangPlotImages/ActionScatter.png')



# Seeing how it compares against animation

In [None]:
# Side-by-side budget vs revenue scatters: Action (left) and Animation (right).
action_axis_max = 1.6e9
animation_axis_max = 1.3e9
figActionAnimation = plt.figure(figsize=(40, 20))

# --- Left panel: Action ---
plt.subplot(1, 2, 1)
action_gain_df = action_df.loc[action_df.ROI >= 0]
plt.scatter(action_gain_df.budget, action_gain_df.revenue,
            color='darkgreen', label=f'n={action_gain_df.shape[0]}')
actionsub_df = action_df.loc[action_df.ROI < 0]
plt.scatter(actionsub_df.budget, actionsub_df.revenue,
            color='darkred', label=f'n={actionsub_df.shape[0]}')
action_diagonal = np.linspace(0, action_axis_max)
plt.plot(action_diagonal, action_diagonal)
plt.xlim(0, action_axis_max)
plt.ylim(0, action_axis_max)
action_loss_budget_max = actionsub_df.budget.max()
plt.plot(np.linspace(action_loss_budget_max, action_loss_budget_max, 500),
         np.linspace(0, action_axis_max, 500),
         color='darkorange', ls='--', lw=2,
         label='Lower budget bound = $' + str(round(action_loss_budget_max / 1e9, 3)) + " Billion Dollars")
plt.title('Budget vs Revenue in Action Movies (2010-2020)')
plt.xlabel('Budget in Billions ($)')
plt.ylabel('Revenue in Billions ($)')
plt.legend(loc=1)

# --- Right panel: Animation ---
plt.subplot(1, 2, 2)
plt.xlim(0, animation_axis_max)
plt.ylim(0, animation_axis_max)
animation_gain_df = animation_df.loc[animation_df.ROI >= 0]
plt.scatter(animation_gain_df.budget, animation_gain_df.revenue,
            color='darkgreen', label=f'n={animation_gain_df.shape[0]}')
animationsub_df = animation_df.loc[animation_df.ROI < 0]
plt.scatter(animationsub_df.budget, animationsub_df.revenue,
            color='darkred', label=f'n={animationsub_df.shape[0]}')
animation_diagonal = np.linspace(0, animation_axis_max)
plt.plot(animation_diagonal, animation_diagonal)
animation_loss_budget_max = animationsub_df.budget.max()
# This label is not rounded, unlike the Action one.
plt.plot(np.linspace(animation_loss_budget_max, animation_loss_budget_max, 500),
         np.linspace(0, animation_axis_max, 500),
         color='darkorange', ls='--', lw=2,
         label='Lower budget bound = $' + str(animation_loss_budget_max / 1e9) + " Billion Dollars")
plt.title('Budget vs Revenue in Animation Movies (2010-2020)')
plt.xlabel('Budget in Billions ($)')
plt.ylabel('Revenue in Billions ($)')
plt.legend(loc=1)

figActionAnimation.savefig('mwangPlotImages/ActionAnimationScatter.png')

## Plotting one scatter independently to explore shape

In [None]:
# Square Action scatter; the budget-bound line carries no legend label here.
# It is saved to the same file name as the earlier Action scatter cell.
action_axis_max = 1.6e9
figScatter = plt.figure(figsize=(20, 20))

action_gain_df = action_df.loc[action_df.ROI >= 0]
plt.scatter(action_gain_df.budget, action_gain_df.revenue,
            color='darkgreen', label=f'n={action_gain_df.shape[0]}')

actionsub_df = action_df.loc[action_df.ROI < 0]
plt.scatter(actionsub_df.budget, actionsub_df.revenue,
            color='darkred', label=f'n={actionsub_df.shape[0]}')

diagonal = np.linspace(0, action_axis_max)
plt.plot(diagonal, diagonal)
plt.xlim(0, action_axis_max)
plt.ylim(0, action_axis_max)

loss_budget_max = actionsub_df.budget.max()
plt.plot(np.linspace(loss_budget_max, loss_budget_max, 500),
         np.linspace(0, action_axis_max, 500),
         color='darkorange', ls='--', lw=2)

plt.title('Budget vs Revenue in Action Movies (2010-2020)')
plt.xlabel('Budget in Billions ($)')
plt.ylabel('Revenue in Billions ($)')
plt.legend(loc=1)
figScatter.savefig('mwangPlotImages/ActionScatter.png')




Checking out which subsets have more datapoints, seeing if more datapoints correlate to more accurate predictions etc.

In [None]:
# Count, per genre indicator column, how many movies carry that genre.
df = merged.loc[:,'Comedy':'Sci-Fi']
for x in df:
    # Summing the ==1 mask yields 0 for an unused genre, where
    # value_counts()[1] raised KeyError.
    print(x + " data points = " + str(int((df[x] == 1).sum())))

# Scatter plotting all genres and gathering useful data from plots

In [None]:
# One row per genre frame: [largest loss-making budget, success rate].
aggArr = np.zeros((len(genre_df_list), 2))
for row, genre_frame in enumerate(genre_df_list):
    # The plot label is read positionally from the frame's columns (offset 3).
    genre_label = genre_frame.columns[row + 3]
    budgetLim, successRate = plotScatter(genre_frame, genre_label)
    aggArr[row] = (budgetLim, successRate)


Compiling a DataFrame of the minimum suggested budget and the corresponding success rate for each genre, to visualize the results and make suggestions

In [None]:
# Pair each genre name with its [budget bound, success rate] row from aggArr.
# zip stops at the shorter input, so a length mismatch is silently truncated.
genreBudgetDict = dict(zip(genre_list,aggArr)) 
# One row per genre, with the two aggregates as named columns.
genresAgg_df = pd.DataFrame.from_dict(genreBudgetDict,orient='index',columns=['Min_Req_Budget','Success_Rate'])
print(genresAgg_df)