In [83]:
import numpy as np
import pandas as pd
import glob as gl
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
#from random import sample


%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


We downloaded data from the Internet Movie Database (IMDb) and want to pursue some analysis on whether the gender of the directors is somehow related to how movies are rated. https://www.imdb.com/interfaces/

In [2]:
# Load only the first two columns (name ID and display name) of the IMDb names dump.
# A forward slash is used as the path separator: '\d' is an invalid escape
# sequence in a normal string literal, and a backslash separator only works on Windows.
names = pd.read_csv('name.basics.tsv/data.tsv', sep='\t', usecols=[0, 1])

In [3]:
# Per-title ratings. Forward slashes avoid the invalid '\d' escape and keep the
# path usable on non-Windows systems.
ratings = pd.read_csv('title.ratings.tsv/data.tsv', sep='\t')

# Per-title director and writer IDs.
crew = pd.read_csv('title.crew.tsv/data.tsv', sep='\t')

In [4]:
# Load the subset of title columns used below (selected by position).
# Forward slash instead of backslash: '\d' is an invalid escape sequence and a
# backslash separator is Windows-only.
title = pd.read_csv('title.basics.tsv/data.tsv', sep='\t', usecols=[0, 1, 2, 3, 4, 5, 8])


  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Keep only non-adult feature films that have a known release year.
# isAdult is compared numerically: if read_csv produced a mixed int/str column,
# a plain `isAdult == 0` would silently drop rows stored as the string '0'.
# NOTE(review): the dtype warning emitted by the read is truncated, so which
# columns are actually mixed should be confirmed.
is_not_adult = pd.to_numeric(title['isAdult'], errors='coerce') == 0
is_movie = title['titleType'] == 'movie'
has_year = title['startYear'] != '\\N'
title = title[is_not_adult & is_movie & has_year]

# The two filter columns carry no further information once the filter is applied.
title = title.drop(['isAdult', 'titleType'], axis=1)

We reduce the dataset a bit and look only at movies from 2000:

In [6]:
title_Period = title[(title.startYear.astype(int) == 2000) ].copy()

In [7]:
title_Period

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,genres
100128,tt0102362,Istota,Istota,2000,"Drama,Romance"
105380,tt0107706,Nothing,Nothing,2000,\N
110583,tt0113026,The Fantasticks,The Fantasticks,2000,"Musical,Romance"
110641,tt0113086,Florentino y el diablo,Florentino y el diablo,2000,Drama
110647,tt0113092,For the Cause,For the Cause,2000,"Action,Adventure,Drama"
...,...,...,...,...,...
6113572,tt9748646,Uma Outra Cidade,Uma Outra Cidade,2000,Documentary
6116662,tt9755166,The Sahara's Secret Garden,The Sahara's Secret Garden,2000,Documentary
6121529,tt9765426,Hammering It Out,Hammering It Out,2000,\N
6136892,tt9797592,Karnaza,Karnaza,2000,Horror


In [8]:
list_of_title_numbers = title_Period.tconst.tolist()

In [9]:
title.head(2)

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,genres
8,tt0000009,Miss Jerry,Miss Jerry,1894,Romance
145,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,1897,"Documentary,News,Sport"


Replacing the string '\N' with np.nan:

In [10]:
crew = crew.replace(r'\N', np.nan)
ratings = ratings.replace(r'\N', np.nan)
names = names.replace(r'\N', np.nan)

In [11]:
[len(crew),len(ratings),len(names)]

[6193756, 976773, 9604401]

In [12]:
crew.head(2)

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,


In [13]:
names.head(2)

Unnamed: 0,nconst,primaryName
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall


In [14]:
ratings.head(3)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1538
1,tt0000002,6.1,185
2,tt0000003,6.5,1193


We want to merge all three dataframes. Ultimately, we want to compare the rating with the names of the directors. Therefore, we only need to keep the lines that display a rating: how = 'inner' is sufficient.

In [15]:
ratingcrew = pd.merge(ratings,crew, how = 'inner', on=['tconst'])

In [16]:
names = names.rename(columns = {'nconst':'directors'})

In [17]:
ratingcrewnames = pd.merge(ratingcrew, names, how = 'inner', on=['directors'])

In [18]:
[len(ratingcrew), len(ratingcrewnames)]

[976773, 723690]

In [19]:
ratingcrewnames.tail(4)

Unnamed: 0,tconst,averageRating,numVotes,directors,writers,primaryName
723686,tt9913056,6.3,6,nm1502645,,Sarah Christman
723687,tt9914642,8.6,17,nm5300859,"nm5300859,nm7332227",Chris Jordan
723688,tt9914644,8.4,21,nm10537376,,Grace Chapman
723689,tt9915790,6.4,5,nm10538030,,Sudipa Chatterjee


Check whether any title has more than one director: a single director ID always has more or less the same length. Therefore, we test whether some entries are longer strings and look at these entries. 

In [20]:
# A single director ID is 9-10 characters long, so a value of 12 or more
# characters would indicate several comma-separated IDs.
# The vectorised .str.len() replaces a row-wise Python lambda over the whole
# frame and treats missing values as "not long" instead of raising.
new_df = ratingcrewnames[ratingcrewnames['directors'].str.len() >= 12]

In [21]:
new_df

Unnamed: 0,tconst,averageRating,numVotes,directors,writers,primaryName


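There are no entries with several directors left. Note that this is a consequence of the inner merge on `directors` above: a comma-separated list of IDs never matches a single `nconst`, so titles with more than one director (and titles without any director) were already dropped there, which is why the row count fell from 976773 to 723690.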

Check whether there is data on writers:

In [22]:
len(ratingcrewnames) == ratingcrewnames.writers.isnull().sum()

False

There is, but we won't use it for now

We drop the columns that we don't need:

In [23]:
len(ratingcrewnames.averageRating)

723690

Creating a new column with the first names:

In [24]:
# First whitespace-separated token of the director's name.
# The vectorised string accessor yields NaN for missing or blank names, where
# the previous per-row lambda raised (AttributeError on NaN, IndexError on '').
ratingcrewnames['firstName'] = ratingcrewnames['primaryName'].str.split().str[0]

In [25]:
ratingcrewnames.firstName.value_counts()

Michael         14693
John            13706
David           13616
James            8572
Peter            8482
                ...  
Tahsine             1
Anshuman            1
Jolynn              1
Shrikrishnan        1
Thambi              1
Name: firstName, Length: 25971, dtype: int64

In [26]:
firstNamesList = ratingcrewnames.firstName.unique()

In [27]:
len(firstNamesList)

25971

In [28]:
#import gender_guesser.detector as gender
#d = gender.Detector()

In [29]:
#d.get_gender(firstNamesList[0])

In [30]:
#ratingcrewnames.head(2)

In [31]:
#ratingcrewnames['gender'] = ratingcrewnames['firstName'].agg( d.get_gender)

get_gender seems unable to identify a lot of names. What's the percentage of names it doesn't know? And what's the percentage of movies for which we don't know the gender of the director?

In [32]:
#[ratingcrewnames[ratingcrewnames['gender'] == 'unknown']['firstName'].unique(),
# ratingcrewnames[ratingcrewnames['gender'] == 'unknown']['firstName'].nunique(),
# ratingcrewnames['firstName'].nunique()]

In [33]:
#round(100*(ratingcrewnames[ratingcrewnames['gender'] == 'unknown']['firstName'].nunique())/
#      ( ratingcrewnames['firstName'].nunique()),2)

In [34]:
#round(100*(len(ratingcrewnames[ratingcrewnames['gender'] == 'unknown']))/
#      ( len(ratingcrewnames['firstName'])),2)

Roughly 10% of the movies are affected. We will see whether there are some very popular unknown names that we can add and thereby push the percentage down.

In [35]:
#ratingcrewnames[ratingcrewnames['gender'] == 'unknown']['firstName'].value_counts()

In [36]:
#100*(1014 + 484 + 469 + 462 + 425)/14652

The five most popular names together make up roughly 20% of all unknowns. If we assign their gender, it will not have a massive impact. But as we are here to play around with datasets, we'll do it anyway.

In [37]:
#ratingcrewnames[ratingcrewnames['firstName'] == 'J.']['primaryName'].value_counts()

it would be practical to read the second name instead of the first initial:

In [38]:
#ratingcrewnames.loc[ratingcrewnames['firstName'] == 'J.','secondName'] = ratingcrewnames.loc[ratingcrewnames['firstName'] == 'J.']['primaryName'].apply(lambda x : (x.split())[1])

In [39]:
#ratingcrewnames.loc[~ratingcrewnames['secondName'].isnull(), 'firstName'] = ratingcrewnames['secondName']

In [40]:
#ratingcrewnames.loc[ratingcrewnames['firstName'] == 'J.']

All J. are replaced. This is not a perfect method because some names don't display a second name, so the last name will be treated as the first name. Overall, the resulting error is negligible. We will add the gender of the next most popular ones and then look at the percentage of movies affected.

In [41]:
#ratingcrewnames.loc[ratingcrewnames['firstName'] == 'Yasuichirô','gender'] = 'male'
#ratingcrewnames.loc[ratingcrewnames['firstName'] == 'Gerren','gender'] = 'male'

In [42]:
#ratingcrewnames.loc[ratingcrewnames['firstName'] == 'K.','secondName'] = ratingcrewnames.loc[ratingcrewnames['firstName'] == 'K.']['primaryName'].apply(lambda x : (x.split())[1])
#ratingcrewnames.loc[~ratingcrewnames['secondName'].isnull(), 'firstName'] = ratingcrewnames['secondName']

In [43]:
#ratingcrewnames.loc[ratingcrewnames['firstName'] == 'D.W.']['primaryName'].value_counts()

In [44]:
#ratingcrewnames.loc[ratingcrewnames['firstName'] == 'D.W.','gender'] = 'male'

<h4> Using Beautiful Soup to get the synopsis of movies from IMDb </h4>

In [45]:
from bs4 import BeautifulSoup
import requests
import json

from https://github.com/msaqib4203/IMDB-API :

In [46]:
def parsePersons(persons):
    """Collect person names from a JSON-LD 'actor'/'creator'/'director' value.

    Parameters
    ----------
    persons : dict or iterable of dict
        Either a single entry or a sequence of entries, each carrying a
        'name' key (and, for sequences, an '@type' key).

    Returns
    -------
    list of str
        For a single dict: a one-element list with its name, regardless of
        its '@type'. For a sequence: the names of all entries whose '@type'
        equals "Person", in their original order.
    """
    if isinstance(persons, dict):
        # A lone entry is taken as-is; its '@type' is not inspected.
        return [persons['name']]

    # In a sequence, non-person entries (any other '@type') are skipped.
    return [entry['name'] for entry in persons if entry['@type'] == "Person"]

def _extractTitleData(html, includeRatingDetails):
    """Build the movie-data dict shared by getJSON and getJSON2.

    Parameters
    ----------
    html : parsed page object
        Must support ``find(attrs=...)`` / ``find(itemprop=...)`` the way a
        BeautifulSoup document does.
    includeRatingDetails : bool
        When true, the 'bestRating' and 'rated' entries are added as well.

    Returns
    -------
    dict
        Keys in insertion order: id, url, poster, title, rating,
        [bestRating], votes, [rated], genres, description, cast, writers,
        directors.

    Raises
    ------
    TypeError, AttributeError, KeyError
        If an expected element or JSON field is absent from the page
        (``find`` returns None in that case).
    """
    data = {}
    data['id'] = html.find(attrs={'property': 'pageId'})['content']
    data['url'] = 'https://www.imdb.com/title/' + data['id']

    # The page embeds structured metadata as a JSON-LD script block.
    html_json = html.find(attrs={'type': 'application/ld+json'}).text.strip()
    fetchedJson = json.loads(html_json)

    data['poster'] = html.find(attrs={'class': 'poster'}).find('img')['src']

    # Keep the wrapper text up to and including the first ')', i.e. the
    # "Title (year)" part.
    title_wrapper = html.find(attrs={'class': 'title_wrapper'}).text.strip()
    data['title'] = title_wrapper[:title_wrapper.find(')') + 1]

    data['rating'] = html.find(itemprop='ratingValue').text
    if includeRatingDetails:
        data['bestRating'] = html.find(itemprop='bestRating').text
    data['votes'] = html.find(itemprop='ratingCount').text
    if includeRatingDetails:
        data['rated'] = fetchedJson['contentRating']

    data['genres'] = fetchedJson['genre']
    data['description'] = fetchedJson['description']
    data['cast'] = parsePersons(fetchedJson['actor'])
    data['writers'] = parsePersons(fetchedJson['creator'])
    data['directors'] = parsePersons(fetchedJson['director'])
    return data


def getJSON(html):
    """Return the movie data of a parsed title page as a JSON string.

    Includes the 'bestRating' and 'rated' fields, so it fails for pages
    that lack them; see getJSON2 for the variant without those fields.
    """
    return json.dumps(_extractTitleData(html, includeRatingDetails=True))


def getJSON2(html):
    """Return the movie data of a parsed title page as a dict.

    Same as getJSON, but without the 'bestRating' and 'rated' fields and
    without serialising the result.
    """
    return _extractTitleData(html, includeRatingDetails=False)


	
def getHTML(url, timeout=30):
    """Download a page and parse it with BeautifulSoup's 'html.parser'.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block forever, which matters when scraping many titles in a
        loop. Defaults to 30 seconds.

    Returns
    -------
    BeautifulSoup
        The parsed response body. The HTTP status is not checked, so an
        error page is parsed like any other page.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')
	
def _fetchTitleHtml(query):
    """Return the parsed IMDb title page for an IMDb ID or a free-text title.

    A query starting with 'tt' is treated as an IMDb ID and fetched directly.
    Anything else is sent to a Google search and the first result whose
    <cite> text contains 'imdb.com/title/tt' is fetched. If no such result
    exists, the search page itself is returned (the caller's extraction
    step then fails on it).

    Raises
    ------
    ValueError
        If the query is empty or None.
    """
    from urllib.parse import quote_plus

    if not query:
        raise ValueError('empty IMDb ID or title')

    if query.startswith('tt'):
        return getHTML('https://www.imdb.com/title/' + query + '/')

    # The title must be URL-encoded; spaces, '&', '#' etc. would otherwise
    # truncate or corrupt the search query.
    html = getHTML('https://www.google.co.in/search?q=' + quote_plus(query))
    for cite in html.findAll('cite'):
        if 'imdb.com/title/tt' in cite.text:
            return getHTML(cite.text)
    return html


def getURL(input):
    """Look up a movie by IMDb ID or title and return its data as a JSON string.

    Any failure (bad input, network error, unexpected page layout) is
    printed and reported through the return value
    'Invalid input or Network Error!' instead of being raised.
    """
    try:
        return getJSON(_fetchTitleHtml(input))
    except Exception as e:
        print(e)
        return 'Invalid input or Network Error!'


def getURL2(input):
    """Look up a movie by IMDb ID or title and return its data as a dict.

    Same lookup and error handling as getURL, but the result comes from
    getJSON2. On failure the string 'Invalid input or Network Error!' is
    returned, so callers must be prepared for a non-dict result.
    """
    try:
        return getJSON2(_fetchTitleHtml(input))
    except Exception as e:
        print(e)
        return 'Invalid input or Network Error!'
		
	
#input = raw_input("Enter IMDB ID or Title: ")
#print('Getting information, Please Wait....')
#print(getURL(input))

Test with "Meet the feebles" : https://www.imdb.com/title/tt0097858/?ref_=nv_sr_1?ref_=nv_sr_1
movie number is tt0097858

In [47]:
mtf = 'tt0097858'

In [48]:
mtf2 = getURL2(mtf)

In [49]:
mtf2

{'id': 'tt0097858',
 'url': 'https://www.imdb.com/title/tt0097858',
 'poster': 'https://m.media-amazon.com/images/M/MV5BNzFlM2ZlYWItNjg1Mi00ZTA4LWFmYzgtZTI3YWZkNTJjMmQ1XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_UX182_CR0,0,182,268_AL_.jpg',
 'title': 'Meet the Feebles\xa0(1989)',
 'rating': '6.7',
 'votes': '18,173',
 'genres': ['Comedy', 'Music', 'Musical'],
 'description': 'Meet the Feebles is a movie starring Danny Mulheron, Donna Akersten, and Stuart Devenie. Multiple animals and insects experience the sleazier side of show business while working on a variety show.',
 'cast': ['Danny Mulheron', 'Donna Akersten', 'Stuart Devenie', 'Mark Hadlow'],
 'writers': ['Fran Walsh',
  'Stephen Sinclair',
  'Danny Mulheron',
  'Peter Jackson'],
 'directors': ['Peter Jackson']}

In [50]:
mtf2["description"]

'Meet the Feebles is a movie starring Danny Mulheron, Donna Akersten, and Stuart Devenie. Multiple animals and insects experience the sleazier side of show business while working on a variety show.'

In [51]:
#df_descr = pd.DataFrame(columns = ['tconst','description'])
#i = 0
#for tt in list_of_title_numbers:
#    xx = getURL2(tt)
#    try:
#        new_row = [tt, xx['description']]
#        df_descr.loc[i] = new_row
#        i +=1
#    except:
#        print('whoopsies')
#
#df_descr.to_csv('imdb.csv')

we downloaded the data and saved it as a .csv

In [76]:
# Read every scraped description file and stack them into one frame.
# glob returns matches in arbitrary order; sorting makes the row order of
# `descr` reproducible between runs and machines.
imdb_folder = sorted(gl.glob('IMDB/IMDB*.csv'))
if not imdb_folder:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError("No description files match 'IMDB/IMDB*.csv'")

imdb_df_list = [pd.read_csv(csv_path, header=0) for csv_path in imdb_folder]
descr = pd.concat(imdb_df_list, ignore_index=True)

# 'Unnamed: 0' is the row index that was written along with the data when the
# files were saved; it is not needed after concatenation.
descr = descr.drop(['Unnamed: 0'], axis=1)

In [77]:
descr.head(2)

Unnamed: 0,tconst,description
0,tt0113026,"The Fantasticks is a movie starring Joel Grey,..."
1,tt0113092,"For the Cause is a movie starring Dean Cain, T..."


In [75]:
title.head(2)

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,genres
8,tt0000009,Miss Jerry,Miss Jerry,1894,Romance
145,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,1897,"Documentary,News,Sport"


In [78]:
imdb = pd.merge(title, descr, on = 'tconst' )

In [122]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [120]:
descr_array = imdb.description.values
vectorizer = TfidfVectorizer(min_df = 10, max_df = 250)
X = vectorizer.fit_transform(descr_array)

In [124]:
# Largest TF-IDF weight. The sparse matrix has its own max(), so the whole
# document-term matrix does not need to be converted to a dense array first.
X.max()

0.8630159124991997

In [118]:
X.shape

(6289, 2544)

In [128]:
kmeans = KMeans(n_clusters=10, random_state=0).fit(X)

In [129]:
kmeans.cluster_centers_.shape

(10, 2544)