<a href="https://colab.research.google.com/github/JasmineElm/Notebooks/blob/master/grab_ITVHub_Titles_on_1001M.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
from random import randint
from tabulate import tabulate #to export dataframe as markdown table
import requests,urllib.request, json 

def parseOMDB(query, full=False):
  """ Fetch an OMDB query URL and return selected fields as a flat list.

      query -- complete OMDB request URL; the response body is parsed as JSON.
      full  -- False (default): return [Title, Year].
               True: return twelve fields in a format similar to that used
               by the 1001 Movies List: Title, Metascore, Genre, Director,
               Language, Country, imdbRating, Year (first four characters),
               Plot, Awards, imdbID, Runtime (last four characters removed).

      Raises KeyError when the response lacks one of the requested fields.
  """
  with urllib.request.urlopen(query) as response:
    record = json.loads(response.read().decode())

  if not full:
    return [record['Title'], record['Year']]

  leading = ('Title', 'Metascore', 'Genre', 'Director',
             'Language', 'Country', 'imdbRating')
  row = [record[key] for key in leading]
  # keep only the first four characters of the year
  row.append(record['Year'][:4])
  row.extend(record[key] for key in ('Plot', 'Awards', 'imdbID'))
  # trim the trailing four characters (e.g. ' min') from the runtime
  row.append(record['Runtime'][:-4])
  return row

def buildOMDBQuery(title, year=None, movie=True, apikey=''):
  """ Build a title-lookup query string for OMDB.
      VERY beta: no logic around handling failed queries

      title  -- title to search for; ' and ' is collapsed to a single space
                and spaces become '+'. No other URL escaping is applied.
      year   -- optional release year (str or int), appended as '&y='.
      movie  -- when truthy, restrict the search with '&type=movie'.
      apikey -- OMDB API key; defaults to '' (no key), as before.

      Returns the complete query URL as a string.
  """
  omdbURL='https://www.omdbapi.com/?apikey='
  # 'and' can make the term too specific ('starsky and hutch' will not
  # return the movie, 'starsky hutch' will)
  searchTerm=title.replace(' and ',' ').replace(' ','+')
  query=omdbURL+apikey+'&t='+searchTerm
  if year is not None:
    # str() so that an int year works as well as a string one
    query=query+'&y='+str(year)
  if movie:
    query=query+'&type=movie'
  return query



In [0]:
## grab our 1001 Movies List, download, and load into a dataframe
url ='https://gist.githubusercontent.com/JasmineElm/ce8219c58bd416c0aec588a97e168221/raw/57717b4ae21f1721b2e1c22d2e8a74795e0e54d4/netflixTitles.csv'
r = requests.get(url, timeout=30)
# fail here rather than saving an HTTP error page as if it were the CSV
r.raise_for_status()
# local file name is the last path segment of the URL
filename = url.split('/')[-1]

with open(filename,'wb') as output_file:
    output_file.write(r.content)

# the first CSV column is used as the dataframe index
ml=pd.read_csv(filename,index_col=[0])


In [0]:
#download the webpage holding the movielist
url='https://www.itv.com/hub/categories/films'
page = requests.get(url, timeout=30)
# stop on an HTTP error instead of parsing an error page as the listing
page.raise_for_status()
soup = BeautifulSoup(page.text, "html.parser")


In [0]:
# At the time of writing there aren't enough entries to justify
# pagination; this may change in future.


In [4]:
#like iPlayer, the metadata exists only on the individual pages,
# so collect the link to every film page found in the listing.
movieList=[]
# The attribute filter has to be a dict: {'class','categories'} is a set
# literal and does not express "class == categories".
for category in soup.find_all('div',{'class':'categories'}):
  for link in category.find_all('a',{'class':'complex-link'}):
    movieList.append(link.get('href'))
movieList

['https://www.itv.com/hub/carry-on-behind/CFD0107',
 'https://www.itv.com/hub/carry-on-dick/CFD0109',
 'https://www.itv.com/hub/carry-on-loving/CFD0116',
 'https://www.itv.com/hub/carry-on-follow-that-camel/CFD0121',
 'https://www.itv.com/hub/starsky-and-hutch/2a5351']

In [9]:
#itvPlayer doesn't have a whole lot of metadata
# let's fill in the blanks using OMDB
# NOTE(review): the two names below are not read by the loop; buildOMDBQuery
# assembles the URL from its own values. An API key committed to source
# should be rotated and loaded from configuration instead.
omdbURL='https://www.omdbapi.com/?apikey='
omdbAPI='67e402aa'

# one row per film: [link, OMDB title, OMDB year]
outerList=[]
for movie in movieList:
  # loop-local names, so the listing held in `page`/`soup` is not overwritten
  moviePage = requests.get(movie, timeout=30)
  heading = BeautifulSoup(moviePage.text, "html.parser").find('h1')
  if heading is None:
    # no <h1> to take a title from: keep the link with empty metadata
    outerList.append([movie, None, None])
    continue
  # Pass the plain title: buildOMDBQuery already drops ' and ' and encodes
  # spaces. Stripping every 'and' here would also mangle words such as 'Grand'.
  query = buildOMDBQuery(heading.text.strip())
  try:
    outerList.append([movie] + parseOMDB(query))
  except KeyError:
    # the response lacked Title/Year (e.g. OMDB found no match); record
    # placeholders so one miss does not abort the whole run
    outerList.append([movie, None, None])
outerList

[['https://www.itv.com/hub/carry-on-behind/CFD0107',
  'Carry on Behind',
  '1975'],
 ['https://www.itv.com/hub/carry-on-dick/CFD0109', 'Carry on Dick', '1974'],
 ['https://www.itv.com/hub/carry-on-loving/CFD0116',
  'Carry on Loving',
  '1970'],
 ['https://www.itv.com/hub/carry-on-follow-that-camel/CFD0121',
  'Carry On... Follow That Camel',
  '1967'],
 ['https://www.itv.com/hub/starsky-and-hutch/2a5351',
  'Starsky & Hutch',
  '2004']]

In [6]:
#clean up our movie list
# rows without a title or year are discarded up front
df=pd.DataFrame(data=outerList, columns=['Link','Title','Year']).dropna()
# keep the last four characters of Year; anything that is not a number
# (e.g. a placeholder such as 'N/A') becomes NaN and is dropped below
# instead of crashing the int conversion
df['Year']=pd.to_numeric(df['Year'].str[-4:], errors='coerce')
df=df.dropna().astype({'Year': 'int64'})
#merge it with the 1001 movies list
on_nf = pd.merge(ml, df, how='inner', on=['Title','Year'])

#generate a nice markdown table that can be pasted into Reddit
if not on_nf.empty:
  # columns left out of the table; errors='ignore' tolerates any of them
  # being absent from the merged frame
  hidden=['Metascore','Language','Country','IMDBRating', 'Plot','Awards','imdbID','Query']
  table=on_nf.drop(columns=hidden, errors='ignore').sort_values('Title')
  print(tabulate(table, tablefmt="pipe", headers="keys", showindex=False))
else:
  print("no matches")

no matches
