In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

plt.style.use('default') # set style (colors, background, size, gridlines etc.) # ggplot
plt.rcParams['figure.figsize'] = 8, 4 # set default size of plots
plt.rcParams.update({'font.size': 18})

import scraping_class
logfile = 'log.txt' ## name your log file.
connector = scraping_class.Connector(logfile)

from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm

In [2]:
#pip install cpi

In [3]:
movies = pd.read_csv('IMDb movies.csv')
actors = pd.read_csv('IMDb names.csv')
ratings = pd.read_csv('IMDb ratings.csv')
role = pd.read_csv('IMDb title_principals.csv')

##### IMDb - Remove duplicates and all years before 1990

In [4]:
movies_90 = movies[movies.year >= 1990] # (53368, 22)
#movies_90 = movies.drop(movies[movies['year'] < 1990].index)
movies_us = movies[movies.country == 'USA']
movies_us_90 = movies_us[movies_us.year >= 1990]
movies_90['title'] = movies_90['title'].astype(str)
movies_90 = movies_90.drop_duplicates(subset = 'title')
mocies_us_90 = movies_us_90.drop_duplicates(subset = 'title')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


##### RT - Remove duplicates 

In [5]:

df_RT = pd.read_csv('RT.csv').drop_duplicates(subset = 'title')
df_90 = pd.merge(movies_90,df_RT,how='inner',on='title')
df_90 = df_90.drop_duplicates()
df_90_us = pd.merge(movies_us_90,df_RT,how='inner',on='title')
df_90_us = df_90_us.drop_duplicates()

In [6]:
# **** Remove NaN ****
df_90_income = df_90.dropna(subset = ['worlwide_gross_income']).reset_index(drop=True).drop_duplicates()
df_us = df_90_us[df_90_us['worlwide_gross_income'].notna()].reset_index(drop=True)

#df_90_income.dtypes
#df_90_income.head()
#print(df_90_income.shape)
#df_90
#df_90_income

##### Cleaning dataset and adjusting income and budget for inflation (CPI)

In [7]:
import cpi
#cpi.update()

def inflate_column(data, column):
    '''Inflate monetary figures from a given year to their 2018 values'''
    return data.apply(lambda x: cpi.inflate(x[column], x.year), axis=1)

In [8]:
df_90_income['worldwide_gross_income'] = df_90_income['worlwide_gross_income']\
                                        .apply(lambda x:x[1:] if x and x[:1]== '$' else None)
df_90_income = df_90_income.drop('worlwide_gross_income', axis=1)
df_90_income = df_90_income.dropna(subset = ['worldwide_gross_income'])

In [9]:
df_90_income = df_90_income.dropna(subset = ['budget'])
df_90_income['budget'] = df_90_income['budget'].apply(lambda x:x[1:] if x and x[:1]== '$' else None)

df_90_income = df_90_income.dropna(subset = ['usa_gross_income'])
df_90_income['usa_gross_income'] = df_90_income['usa_gross_income']\
                                        .apply(lambda x:x[1:] if x and x[:1]== '$' else None)

In [10]:
df_90_income = df_90_income.reset_index(drop=True).drop_duplicates()

df_90_income.worldwide_gross_income = df_90_income.worldwide_gross_income.astype(int)
df_90_income = df_90_income[df_90_income.year <= 2018]
df_90_income['adjusted_wgi'] = inflate_column(df_90_income, 'worldwide_gross_income')

df_budget = df_90_income.dropna(subset = ['budget']).copy()
df_budget.budget = df_budget.budget.astype(int)
df_budget['adjusted_budget'] = inflate_column(df_budget, 'budget')

df_budget = df_budget.dropna(subset = ['usa_gross_income']).copy()
df_budget['usa_gross_income'] = df_budget['usa_gross_income'].astype(int)
df_budget['adjusted_us_income'] = inflate_column(df_budget, 'usa_gross_income')

df_budget['net_income'] = df_budget.adjusted_wgi - df_budget.adjusted_budget
df_budget['net_us_income'] = df_budget['adjusted_us_income'] - df_budget['adjusted_budget']
df = df_budget.reset_index(drop=True)
#df_90_income.dtypes

In [11]:
directors = list(df['director'].values)


In [12]:
df.dtypes

imdb_title_id              object
title                      object
original_title             object
year                        int64
date_published             object
genre                      object
duration                    int64
country                    object
language                   object
director                   object
writer                     object
production_company         object
actors                     object
description                object
avg_vote                  float64
votes                       int64
budget                      int64
usa_gross_income            int64
metascore                 float64
reviews_from_users        float64
reviews_from_critics      float64
tomatometer_score          object
audience_score             object
worldwide_gross_income      int64
adjusted_wgi              float64
adjusted_budget           float64
adjusted_us_income        float64
net_income                float64
net_us_income             float64
dtype: object

In [13]:
# Converting Rotten Tomatoes scores to floats
df['tomatometer_score'] = df['tomatometer_score'].replace({'\%':''}, regex = True)
df['audience_score'] = df['audience_score'].replace({'\%':''}, regex = True)

df['tomatometer_score']= df['tomatometer_score'].astype(float)
df['audience_score']= df['audience_score'].astype(float)

In [14]:
director_scores = []
a = {}
for director in directors:
    a[director] = pd.DataFrame(df[df['director'].str.contains(director)])
    director_scores.append([director,
                           a[director]['avg_vote'].mean(),
                           a[director]['metascore'].mean(),
                           a[director]['tomatometer_score'].mean(),
                           a[director]['audience_score'].mean()])

In [23]:
cols = ['director', 'director_imdb', 'director_metascore', 'director_tomatometer', 'director_audience']
df_director = pd.DataFrame(director_scores, index=range(len(director_scores)),columns=cols)

In [31]:
len(directors)

4354

In [59]:
#writers = list(df.dropna(subset = ['writer'])['writer'].values)
#writers = list(df['writer'].values)
df = df.dropna(subset = ['writer'])

In [60]:
writers = list(df['writer'].values)
writer_scores = []
a = {}
for writer in writers:
    
    a[writer] = pd.DataFrame(df[df['writer'].str.contains(writer)])
    writer_scores.append([writer,
                           a[writer]['avg_vote'].mean(),
                           a[writer]['metascore'].mean(),
                           a[writer]['tomatometer_score'].mean(),
                           a[writer]['audience_score'].mean()])



In [62]:
cols = ['writer', 'writer_imdb', 'writer_metascore', 'writer_tomatometer', 'writer_audience']
df_writer = pd.DataFrame(writer_scores, index=range(len(writer_scores)),columns=cols)

In [70]:
actors = []

for actor in df['actors']:
    actors.append(actor.split(", "))

In [71]:
actors 

[['Meg Ryan',
  'Hugh Jackman',
  'Liev Schreiber',
  'Breckin Meyer',
  'Natasha Lyonne',
  'Bradley Whitford',
  'Paxton Whitehead',
  'Spalding Gray',
  'Josh Stamberg',
  'Matthew Sussman',
  'Charlotte Ayanna',
  'Philip Bosco',
  'Andrew Jack',
  'Stan Tracy',
  'Kristen Schaal'],
 ['Mel Gibson',
  'Robert Downey Jr.',
  'Nancy Travis',
  'Ken Jenkins',
  'David Marshall Grant',
  'Lane Smith',
  'Art LaFleur',
  'Ned Eisenberg',
  'Marshall Bell',
  'David Bowe',
  'Burt Kwouk',
  'Tim Thomerson',
  'Harvey Jason',
  'Sinjai Plengpanich',
  'Natta Nantatanti'],
 ['Joe Mantegna',
  'Mia Farrow',
  'William Hurt',
  'June Squibb',
  'Marceline Hugot',
  "Dylan O'Sullivan Farrow",
  'Matthew H. Williamson',
  'Julie Kavner',
  'Billy Taylor',
  'Holland Taylor',
  'Michael-Vaughn Sullivan',
  'Robin Bartlett',
  'Linda Wallem',
  'Gina Gallagher',
  'Patience Moore'],
 ['Paul Hogan',
  'Elias Koteas',
  'Linda Kozlowski',
  'Doreen Lang',
  'Douglas Seale',
  'Ruth Warshawsky',
  '

##### Clean US dataset for valuta (dollar sign)

In [None]:
df_us['wgi'] = df_us['worlwide_gross_income'].replace({'\$':''}, regex = True)
df_us['budget'] = df_us['budget'].replace({'\$':''}, regex = True)
df_us['usa_gross_income'] = df_us['usa_gross_income'].replace({'\$':''}, regex = True)


In [None]:
#len(df_us[df_us.year==2019])
df_us.wgi = df_us.wgi.astype(int)
df_us = df_us[df_us.year <= 2018]
df_us['adjusted'] = inflate_column(df_us, 'wgi')
#df_us

In [None]:
df_budget.columns.values

##### Seperating data into features (have greater flexibility for project when predicting)

In [None]:
df_ratings = [['avg_vote', 'metascore','tomatometer_score', 'audience_score']]
df_x = df_budget[['title', 'year', 'date_published', 'genre', 'duration','country', 'director', 'actors', 'budget','production_company','writer']]
df_y = df_budget[['net_income','usa_gross_income', 'worldwide_gross_income']]

In [None]:
df_x.to_csv('inputInfo_X.csv', encoding='utf-8',index=False)

In [None]:
df_y.to_csv('inputInfo_Y.csv', encoding='utf-8',index=False)

In [None]:
y = pd.read_csv("inputInfo_Y.csv")

In [None]:
def convert_to_array(rating):
    rating = str(rating)
    rating = rating.lower()
    rating = rating.split(",")
    return rating

##### Converting columns with multiple string entries to a list. This will assist for converting each string entry to dummy variables

In [None]:
x["genre"]= x["genre"].apply(lambda x: convert_to_array(x))
x["country"] = x["country"].apply(lambda x: convert_to_array(x))
x["director"] = x["director"].apply(lambda x: convert_to_array(x))
x["actors"] = x["actors"].apply(lambda x: convert_to_array(x))
x["title"] = x["title"].apply(lambda x: convert_to_array(x))
x["production_company"] = x["production_company"].apply(lambda x: convert_to_array(x))
x["writer"] = x["writer"].apply(lambda x: convert_to_array(x))

##### Converting published date to months

In [None]:
x["date_published"] = x["date_published"].apply(lambda x: pd.to_datetime(x).month)

##### Creating dummy variables 

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
MLB = MultiLabelBinarizer()

In [None]:
actor_dummy = pd.DataFrame(MLB.fit_transform(x["actors"]),columns = MLB.classes_,index=x.index)
genre_dummy = pd.DataFrame(MLB.fit_transform(x["genre"]),columns=MLB.classes_, index=x.index)
country_dummy = pd.DataFrame(MLB.fit_transform(x["country"]),columns = MLB.classes_,index=x.index)
director_dummy = pd.DataFrame(MLB.fit_transform(x["director"]),columns = MLB.classes_,index=x.index)

In [None]:
genre_dummy.head()

##### Looking at the predictor values

In [None]:
y_2 = y[["tomatometer_score","audience_score","metascore","avg_vote","usa_gross_income","worldwide_gross_income"]]

##### Clean dataset further: Remove % from score

In [None]:
y_2['tomatometer_score'] = y_2['tomatometer_score'].replace({'\%':''}, regex = True)
y_2['audience_score'] = y_2['audience_score'].replace({'\%':''}, regex = True)


##### Converting ratings to float 


In [None]:
y_2['tomatometer_score']= y_2['tomatometer_score'].astype(float)
y_2['audience_score']= y_2['audience_score'].astype(float)

#y_2.info()

##### Converting the audience and user rating to binary. 

In [None]:
y_2["t_rating"] = y_2["tomatometer_score"].apply(lambda x: 1 if x > 60.0 else 0)
y_2["audience_score"] = y_2["audience_score"].apply(lambda x: 1 if x > 60.0 else 0) 

##### Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
fig, ax = plt.subplots(figsize = (10,5))
sns.distplot(df_90_income.year, bins = 20);
plt.title("Distribution of movies over the years", size = 10);

In [None]:
sns.set(style="white")

plt.figure(figsize=(10,5))
plt.title('Movies by the year', size=15)
sns.distplot(df_90_income.year, kde=False)
plt.ylabel('Number of movies', size=15)
plt.xlabel('Year of release',size=15)
plt.axis([1990, 2019, 0, 700])
plt.xticks(np.arange(1990, 2019, step=2))
plt.show()

In [None]:
# Overview of the different genres 
df_90_income['first_genre'] = df_90_income['genre'].str.split(',').str[0]

a = plt.cm.binary

plt.figure(figsize=(10,4))
count = df_90_income['first_genre'].value_counts()[:7]
sns.barplot(count.values, count.index, palette=[a(0.1),a(0.2),a(0.3),a(0.4),a(0.5),a(0.6),a(0.7)])
for i, v in enumerate(count.values):
    plt.text(0.8,i,v,color='k',fontsize=10)
plt.xlabel('Count', fontsize=12)
plt.ylabel('Genre name', fontsize=12)
plt.title("Genres", fontsize=15)
plt.show()

In [None]:
# Overview of movies with highest scores 
top_scored = df_90_income.sort_values(["tomatometer_score","audience_score","metascore","avg_vote"], ascending=False)[
    ["title", "director", "tomatometer_score","audience_score","metascore","avg_vote"]]
top_scored.index = range(len(df_90_income))
top_scored.head(n=10)

In [None]:
test = df_90_income.sort_values(["year"], ascending=False)[
    ["year","tomatometer_score","audience_score","metascore","avg_vote"]]
test.index = range(len(df_90_income))
#test.head(n=10)


In [None]:
test.plot(kind="scatter",
                      x="year",
                      y="metascore",
                      alpha=0.4)

plt.show()

In [None]:
#fig, ax = plt.subplots(figsize = (9,5))
#sns.distplot(df_90_income.tomatometer_score,bins = 50);
#plt.title("Distribution of metascore among movies", size = 10);


# Error: ould not convert string to float: '50%'
#df_90_income = df_90_income[(df_90_income.tomatometer_score.notnull()) &
#                      (df_90_income.audience_score.notnull())]
#sns.jointplot(x=df_90_income['year'], y=df_90_income['tomatometer_score'],
#              kind="kde").fig.set_size_inches(15,15)

# Works but not pretty 
#plt.figure(figsize=[30,10])
#sns.barplot(x=df_90_income.year, y=df_90_income.metascore, data=df_90_income)

In [None]:
# Ideas:
#1) Look at gender or race in director (men vs women) (balck vs white)

In [None]:
df_budget.plot(kind="scatter",
                      x="avg_vote",
                      y="net_income",
                      alpha=0.4)