In [4]:
# Objective: 
#           To see what correlations exist between Oscar Nominated films
# Questions:
#           Are particular genres more likely to be nominated for Oscars?
#           Is there a trend in plot keywords and films that are nominated for Oscars?
# Datasets: 
#           IMDB Top 5000 Movies
#           The Academy Awards 1925-2015

In [5]:
import pandas as pd
import numpy as np
import patsy
import statsmodels.api as sm
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt


In [6]:
#oscars dataset from 1927-2015
oscars = pd.read_csv('database.csv')
#imdb top 5000 dataset
imdb = pd.read_csv('movie_metadata.csv', encoding = "ISO-8859-1")

In [7]:
#film names had two junk characters appended at the end-cleaned up
imdb['Film'] = imdb['Film'].str[:-2]

In [8]:
#picked pertinent selectors for both datasets
imdb = imdb[['director_name', 'Film', 'gross', 'genres', 'duration', 'plot_keywords']]
oscars = oscars[['Year', 'Award', 'Name', 'Film']]

In [9]:
#used film name to merge the two datasets
joined = pd.merge(imdb, oscars, on="Film")

In [10]:
joined.dropna()

Unnamed: 0,director_name,Film,gross,genres,duration,plot_keywords,Year,Award,Name
0,James Cameron,Titanic,658672302.0,Drama|Romance,194.0,artist|love|ship|titanic|wet,1997,Actress in a Leading Role,Kate Winslet
1,James Cameron,Titanic,658672302.0,Drama|Romance,194.0,artist|love|ship|titanic|wet,1997,Actress in a Supporting Role,Gloria Stuart
2,Christopher Nolan,The Dark Knight,533316061.0,Action|Crime|Drama|Thriller,152.0,based on comic book|dc comics|psychopath|star ...,2008,Actor in a Supporting Role,Heath Ledger
3,David Fincher,The Curious Case of Benjamin Button,127490802.0,Drama|Fantasy|Romance,166.0,deformed baby|diary|lingerie slip|older man yo...,2008,Actor in a Leading Role,Brad Pitt
4,David Fincher,The Curious Case of Benjamin Button,127490802.0,Drama|Fantasy|Romance,166.0,deformed baby|diary|lingerie slip|older man yo...,2008,Actress in a Supporting Role,Taraji P. Henson
5,Edward Zwick,The Last Samurai,111110575.0,Action|Drama|History|War,154.0,captain|emperor|honor|japan|samurai,2003,Actor in a Supporting Role,Ken Watanabe
6,Breck Eisner,Sahara,68642452.0,Action|Adventure|Comedy|Thriller,124.0,beach|civil war|desert|dictator|ship,1943,Actor in a Supporting Role,J. Carrol Naish
7,Alejandro G. IÌ±ÌÁrritu,The Revenant,183635922.0,Adventure|Drama|Thriller|Western,156.0,bear attack|cauterizing a wound|native america...,2015,Actor in a Leading Role,Leonardo DiCaprio
8,Alejandro G. IÌ±ÌÁrritu,The Revenant,183635922.0,Adventure|Drama|Thriller|Western,156.0,bear attack|cauterizing a wound|native america...,2015,Actor in a Supporting Role,Tom Hardy
9,Gore Verbinski,Pirates of the Caribbean: The Curse of the Bla...,305388685.0,Action|Adventure|Fantasy,143.0,caribbean|curse|governor|pirate|undead,2003,Actor in a Leading Role,Johnny Depp


In [11]:
%%HTML

<div class='tableauPlaceholder' id='viz1501801152816' style='position: relative'><noscript><a href='#'><img alt='Sheet 1 ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;At&#47;Attempt1_14&#47;Sheet1&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='site_root' value='' /><param name='name' value='Attempt1_14&#47;Sheet1' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;At&#47;Attempt1_14&#47;Sheet1&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='filter' value='publish=yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1501801152816');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>


In [12]:
#Due to the way the dataset was organized, each genre is split up by a pipe for each individual film. This led to long 
#strings of genres that skewed the word cloud. To delve into which particular genres were most popular, I split the
#genres into individual genres.

In [13]:
#exported the film genres to a csv
genres = []
for row in joined['genres']:
   genres.extend(row.split('|'))
for genre in genres:
    print(genre)
    
genres.to_csv('genres.csv')


Drama
Romance
Drama
Romance
Action
Crime
Drama
Thriller
Drama
Fantasy
Romance
Drama
Fantasy
Romance
Action
Drama
History
War
Action
Adventure
Comedy
Thriller
Adventure
Drama
Thriller
Western
Adventure
Drama
Thriller
Western
Action
Adventure
Fantasy
Adventure
Drama
Sci-Fi
Thriller
Action
Drama
History
War
Action
Drama
History
War
Action
Drama
History
War
Action
Drama
History
War
Action
Drama
History
War
Biography
Drama
Biography
Drama
Biography
Drama
Action
Adventure
Drama
Fantasy
Biography
Drama
Sport
Biography
Drama
Sport
Adventure
Drama
Sci-Fi
Action
Drama
Romance
Action
Drama
Romance
Action
Adventure
Sci-Fi
Thriller
Action
Adventure
Sci-Fi
Thriller
Biography
Crime
Drama
Drama
Western
Adventure
Drama
Thriller
Adventure
Drama
Thriller
Biography
Comedy
Crime
Drama
Biography
Comedy
Crime
Drama
Crime
Drama
Drama
Fantasy
Thriller
Drama
Fantasy
Thriller
Drama
History
War
Western
Drama
History
War
Western
Crime
Drama
Thriller
Action
Comedy
Crime
Drama
Mystery
Thriller
Adventure
Drama
Histor

AttributeError: 'list' object has no attribute 'to_csv'

In [None]:
joined.to_csv('joined.csv')

In [128]:
#The cleaned up genre word cloud displayed a clear 
%%HTML
<div class='tableauPlaceholder' id='viz1501801299793' style='position: relative'><noscript><a href='#'><img alt='&lt;Genre Map&gt; ' src='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ge&#47;Genre_Map&#47;Sheet2&#47;1_rss.png' style='border: none' /></a></noscript><object class='tableauViz'  style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='site_root' value='' /><param name='name' value='Genre_Map&#47;Sheet2' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https:&#47;&#47;public.tableau.com&#47;static&#47;images&#47;Ge&#47;Genre_Map&#47;Sheet2&#47;1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /><param name='filter' value='publish=yes' /></object></div>                <script type='text/javascript'>                    var divElement = document.getElementById('viz1501801299793');                    var vizElement = divElement.getElementsByTagName('object')[0];                    vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px';                    var scriptElement = document.createElement('script');                    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';                    vizElement.parentNode.insertBefore(scriptElement, vizElement);                </script>

In [141]:
non_null = joined[joined['plot_keywords'].notnull()]
for lol in non_null['plot_keywords']:
    haha += lol.split('|')
for key_word in haha:
    print(key_word)
    
#key_word.to_csv('key_words.csv')

director_name
Film
gross
genres
duration
plot_keywords
Year
Award
Name
artist
love
ship
titanic
wet
artist
love
ship
titanic
wet
based on comic book
dc comics
psychopath
star died before release
urban setting
deformed baby
diary
lingerie slip
older man younger woman relationship
premature aging
deformed baby
diary
lingerie slip
older man younger woman relationship
premature aging
captain
emperor
honor
japan
samurai
beach
civil war
desert
dictator
ship
bear attack
cauterizing a wound
native american
revenge
survival
bear attack
cauterizing a wound
native american
revenge
survival
caribbean
curse
governor
pirate
undead
box office hit
long take
sole survivor
space
space station
american revolution
british
french
hero
standoff
american revolution
british
french
hero
standoff
american revolution
british
french
hero
standoff
american revolution
british
french
hero
standoff
american revolution
british
french
hero
standoff
1920s
aviation
fight
spruce goose
test flight
1920s
aviation
fight
spru