<a href="https://colab.research.google.com/github/JasmineElm/Notebooks/blob/master/grab_iPlayer_Titles_on_1001M.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from time import sleep
from random import randint
from tabulate import tabulate #to export dataframe as markdown table

In [0]:
## Grab our 1001 Movies list: download the CSV, save it next to the notebook,
## and load it into a dataframe.
url ='https://gist.githubusercontent.com/JasmineElm/ce8219c58bd416c0aec588a97e168221/raw/57717b4ae21f1721b2e1c22d2e8a74795e0e54d4/netflixTitles.csv'
# A timeout stops the cell from hanging forever if the server never answers.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the CSV.
r.raise_for_status()
# The local file name is the last path segment of the URL.
filename = url.split('/')[-1]

with open(filename,'wb') as output_file:
    output_file.write(r.content)

# The first CSV column is used as the dataframe index.
ml=pd.read_csv(filename,index_col=[0])


In [0]:
# Download the webpage holding the movie list (iPlayer films, A-Z) and parse it.
url='https://www.bbc.co.uk/iplayer/categories/films/a-z'
# A timeout stops the cell from hanging forever if the server never answers.
page = requests.get(url, timeout=30)
# Stop here on an HTTP error rather than parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.text, "html.parser")


In [0]:
# results are paginated, we need to build a list of those pages
# pagination seems to take the form:
# + [1][2][3] rather than 
# + [1][..][3][next] (i.e. each page has an explicit link)
# this logic may need changing if iPlayer suddenly gets loads of movies...

pageList=[url]
pagination = soup.find('div',{'class':'list__pagination'})
# find() returns None when the pagination block is absent (presumably when all
# results fit on one page); in that case the first page is the only page.
if pagination is not None:
  # href=True skips anchors without an href, which would break the
  # string concatenation below.
  for link in pagination.find_all('a', href=True):
    pageList.append(url+link['href'])

# de-duplicate: the same page can be linked more than once
pageList=set(pageList)

In [12]:
# for each page, we need to get the details of each movie.
# these are held on separate pages.  
# let's build a list of those links
movieList=[]
baseLink='https://www.bbc.co.uk'
# Loop-local names are used here (rather than rebinding `page` / `soup`) so the
# objects created by the earlier cells stay intact if those cells are re-run.
for page_url in pageList:
  listing = requests.get(page_url, timeout=30)
  # Stop on an HTTP error rather than silently collecting zero links.
  listing.raise_for_status()
  listing_soup = BeautifulSoup(listing.text, "html.parser")
  for tile in listing_soup.find_all('div',{'class':'content-item'}):
    # Skip tiles that carry no link instead of crashing on them.
    anchor = tile.find('a', href=True)
    if anchor is not None:
      # hrefs are concatenated onto the site root to form the full link
      movieList.append(baseLink+anchor['href'])

https://www.bbc.co.uk/iplayer/categories/films/a-z
https://www.bbc.co.uk/iplayer/categories/films/a-z?page=2


In [13]:
# Visit every movie page and collect one row per movie:
#   [link, title, <one entry per metadata item found on the page>]
outerList=[]
for movie in movieList:
  # random pause of 0-3 seconds between requests (presumably to go easy on the server)
  sleep(randint(0,3))
  response = requests.get(movie, timeout=30)
  if not response.ok:
    # One unavailable page should not abort the whole scrape.
    print('skipping (HTTP %s): %s' % (response.status_code, movie))
    continue
  movie_soup = BeautifulSoup(response.text, "html.parser")
  title = movie_soup.find('div',{'class':'play-cta__title-container'})
  layout = movie_soup.find('div',{'class':'gel-layout'})
  if title is None or layout is None:
    # find() returns None when the expected markup is missing.
    print('skipping (unexpected page layout): %s' % movie)
    continue
  innerList=[movie, title.text]
  for metadata in layout.find_all('li'):
    field = metadata.find('span',{'class':'episode-metadata__text'})
    # Keep the position with None when the span is missing, so the remaining
    # fields stay aligned; the clean-up cell's dropna() discards such rows.
    innerList.append(field.text if field is not None else None)
  outerList.append(innerList)
print(outerList)

[['https://www.bbc.co.uk/iplayer/episode/p04b183c/adam-curtis-hypernormalisation', 'Adam Curtis', '167 mins', '16 Oct 2016', 'Available for over a year'], ['https://www.bbc.co.uk/iplayer/episode/p07ctvvn/a-high-school-rape-goes-viral-roll-red-roll', 'A High School Rape Goes Viral: Roll Red Roll', '74 mins', '23 Jun 2019', 'Available for 4 months'], ['https://www.bbc.co.uk/iplayer/episode/b0078cwc/a-simple-plan', 'A Simple Plan', '114 mins', '1998', 'Available for 4 months'], ['https://www.bbc.co.uk/iplayer/episode/b01dtlxl/the-awakening', 'The Awakening', '100 mins', '2011', 'Available for 24 days'], ['https://www.bbc.co.uk/iplayer/episode/b06vq3yn/the-big-short', 'The Big Short', '122 mins', '2015', 'Available for 10 days'], ['https://www.bbc.co.uk/iplayer/episode/m0006wg1/the-bleeder', 'The Bleeder', '90 mins', '2016', 'Available until Tue 2:05am'], ['https://www.bbc.co.uk/iplayer/episode/b010y7my/the-boy-in-the-striped-pyjamas', 'The Boy in the Striped Pyjamas', '88 mins', '2008', '

In [16]:
#clean up our movie list
# NOTE: 'availabiliy' (sic) is kept as-is; it is the header in the generated table.
df_columns=['link','Title','durn','Year','availabiliy']
# Scraped rows can be ragged; pad short rows with None and cut long rows so the
# DataFrame constructor always receives exactly len(df_columns) fields per row.
df_rows=[(list(row)+[None]*len(df_columns))[:len(df_columns)] for row in outerList]
df=pd.DataFrame(data=df_rows, columns=df_columns).dropna()
# The year is the last four characters ('16 Oct 2016' -> '2016'); anything that
# is not a number becomes NaN and is dropped below instead of crashing astype().
df['Year']=pd.to_numeric(df['Year'].str[-4:], errors='coerce')
# Strip the trailing unit ('167 mins' -> '167') without leaving a trailing space.
df['durn']=df['durn'].str.replace(r'\s*mins?$', '', regex=True)
df=df.dropna().astype({'Year': 'int64'})
#merge it with the 1001 movies list
on_nf = pd.merge(ml, df, how='inner', on=['Title','Year'])

#generate a nice markdown table that can be pasted into Reddit
table_view = (
    on_nf
    .drop(columns=['Metascore','Language','Country','IMDBRating', 'Plot','Awards','imdbID','durn'])
    .sort_values('Title')
)
print(tabulate(table_view, tablefmt="pipe", headers="keys", showindex=False))


| Title                    | Genre                                    | Director                           |   Year |   Runtime | link                                                                    | availabiliy               |
|:-------------------------|:-----------------------------------------|:-----------------------------------|-------:|----------:|:------------------------------------------------------------------------|:--------------------------|
| Cat People               | Fantasy, Horror, Thriller                | Jacques Tourneur                   |   1942 |        73 | https://www.bbc.co.uk/iplayer/episode/b0078ns6/cat-people               | Available for over a year |
| Night of the Living Dead | Horror                                   | George A. Romero                   |   1968 |        96 | https://www.bbc.co.uk/iplayer/episode/b0078pw1/night-of-the-living-dead | Available for over a year |
| Oklahoma!                | Comedy, Drama, Musical, Romance, Western | 