<a href="https://colab.research.google.com/github/JasmineElm/Notebooks/blob/master/grab_FREE_BFI_Titles_on_1001M.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#Version 2:
# + progressBar() function
# + isYear() function
# + only adds years that look like years

In [0]:
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
from random import randint
from tabulate import tabulate #to export dataframe as markdown table
import requests,urllib.request, json 

def progressBar(inList, currentStep,numeric=False):
  """ Prints one step of a crude progress indicator, to make it easier to
      judge whether your loops are doing something. Use in conjunction with
      `enumerate` to get the index of the current step.

      inList      -- the sequence being looped over (only its length is used)
      currentStep -- zero-based index of the current iteration
      numeric     -- if True, print the number of iterations left instead
                     of a shaded block

      Returns nothing; everything is written to stdout without a newline.
  """
  ps=['░','▒','▓','█']
  total=len(inList)
  if numeric:
    if currentStep==0:
      print('iterations left: ',end='')
    print(total-currentStep,end=' ')
  else:
    if currentStep==0:
      print('Progress: ',end='')
    if total==0:
      # nothing to iterate over: avoid dividing by zero, show the bar as full
      stage=len(ps)-1
    else:
      # clamp so a step at (or past) the end of the list, e.g. coming from
      # enumerate(..., start=1), maps to the last glyph instead of IndexError
      stage=min(int(currentStep/total*len(ps)),len(ps)-1)
    print(ps[stage],end='')

def isYear(s):
  """ Checks whether a string looks like a year in which a film
      could have been made, i.e. a whole number from 1880 to 2030 inclusive.

      s -- the string to test

      Returns True if `s` consists only of decimal digits and its value
      lies in that range, otherwise False.
  """
  # isdecimal() rather than isdigit(): isdigit() is also True for characters
  # such as superscripts ('²') that int() cannot parse, so scraped text
  # containing them would raise ValueError instead of returning False
  return s.isdecimal() and 1880 <= int(s) <= 2030


In [0]:
## grab our 1001 Movies List, download, and load into a dataframe 
url ='https://gist.githubusercontent.com/JasmineElm/ce8219c58bd416c0aec588a97e168221/raw/57717b4ae21f1721b2e1c22d2e8a74795e0e54d4/netflixTitles.csv'
# timeout so a stalled connection cannot hang the notebook indefinitely
r = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of saving the error page as a "CSV"
r.raise_for_status()
# the local file is named after the last path component of the URL
filename = url.split('/')[-1]
 
with open(filename,'wb') as output_file:
    output_file.write(r.content)

# the first CSV column is used as the dataframe index
ml=pd.read_csv(filename,index_col=[0])


In [3]:
# Free films are split into collections; each collection is housed on a
# separate page. Build a list of these collection links.
collections=[]
url='https://player.bfi.org.uk/free/collections'
base='https://player.bfi.org.uk'
page = requests.get(url)
# stop on an HTTP error rather than silently ending up with 0 collections
page.raise_for_status()
soup = BeautifulSoup(page.text, "html5lib")

for collection in soup.find_all('div',{'class':'collection-card'}):
  link = collection.find('a')
  # skip cards without a usable link instead of crashing on None
  if link is not None and link.get('href'):
    # hrefs are joined onto the site root to form the full link
    collections.append(base+link.get('href'))

print(len(collections))

74


In [11]:
# Visit every collection page and record [link, title, year] for each free film.
movieList=[]
movieInfo=[]
progress=len(collections) # not read in this cell; kept unchanged

for idx,collection in enumerate(collections):
  progressBar(collections,idx) # make progress a little easier to see
  sleep(randint(0,2)) #let's not spam the server
  page = requests.get(collection)
  soup = BeautifulSoup(page.text, "html5lib")
  for card in soup.find_all('div',{'class':'card--free'}):
    # whilst typically in the 2nd position, the year can be in any of the spans
    info = card.find('p',{'class':'card__info'})
    spans = info.find_all('span') if info is not None else []
    years = [span.text for span in spans if isYear(span.text)]
    # Always emit exactly three fields: a card with several year-like spans
    # would otherwise produce a 4+ item row and break the 3-column dataframe
    # built from movieList; a card with none gets an explicit None.
    movieInfo=[
      base+card.find('a').get('href'), #direct link
      card.find('h3').find('span').text, #title
      years[0] if years else None, #first span that looks like a year
    ]
    movieList.append(movieInfo)

Progress: ░░░░░░░░░░░░░░░░░░░▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓██████████████████

[['https://player.bfi.org.uk/free/film/watch-the-body-beautiful-1990-online',
  'The Body Beautiful',
  '1990'],
 ['https://player.bfi.org.uk/free/film/watch-indian-tales-1994-online',
  'Indian Tales',
  '1994'],
 ['https://player.bfi.org.uk/free/film/watch-future-lasts-a-long-time-1996-online',
  'Future Lasts a Long Time',
  '1996'],
 ['https://player.bfi.org.uk/free/film/watch-the-stream-1999-online',
  'The Stream',
  '1999'],
 ['https://player.bfi.org.uk/free/film/watch-dead-london-1996-online',
  'Dead London',
  '1996'],
 ['https://player.bfi.org.uk/free/film/watch-rave-1997-online',
  'Rave',
  '1997'],
 ['https://player.bfi.org.uk/free/film/watch-floating-1991-online',
  'Floating',
  '1991'],
 ['https://player.bfi.org.uk/free/film/watch-pace-1995-online',
  'Pace',
  '1995'],
 ['https://player.bfi.org.uk/free/film/watch-the-week-elvis-died-1997-online',
  'The Week Elvis Died',
  '1997'],
 ['https://player.bfi.org.uk/free/film/watch-paris-brixton-1997-online',
  'Paris, Brix

In [14]:
#clean up our movie list: drop films with no recognisable year, make Year an
#integer (presumably matching the Year column of the 1001 list - verify), and
#drop repeats, since the same film may be listed in more than one collection
df=(pd.DataFrame(data=movieList, columns=['Link','Title','Year'])
    .dropna()
    .astype({'Year': 'int64'})
    .drop_duplicates())
#merge it with the 1001 movies list; only films present in both are kept
on_nf = pd.merge(ml, df, how='inner', on=['Title','Year'])

#generate a nice markdown table that can be pasted into Reddit
if(len(on_nf)>0):
  hidden=['Metascore','Language','Country','IMDBRating', 'Plot','Awards','imdbID']
  print(tabulate(on_nf.drop(hidden,axis=1).sort_values('Title')
  , tablefmt="pipe", headers="keys", showindex=False))
else:
  print("no matches")

| Title              | Genre                             | Director       |   Year |   Runtime | Link                                                                     |
|:-------------------|:----------------------------------|:---------------|-------:|----------:|:-------------------------------------------------------------------------|
| A Trip to the Moon | Short, Adventure, Fantasy, Sci-Fi | Georges Méliès |   1902 |        13 | https://player.bfi.org.uk/free/film/watch-a-trip-to-the-moon-1902-online |
