In [None]:
#Maria Williams  - Nov, 2021
#This script scrapes IMDb for film information

# First Step: 
Scrape list of movies tagged as 'based on novel' and released from years 1970 to 2021

In [1]:
#imports
from bs4 import BeautifulSoup, NavigableString
import pandas as pd
import numpy as np
import requests
import re

In [2]:
#First Pass:
#1) feature films, 2) tagged 'based on novel', 3) release from years 2000 to 2021: 3,516 titles - 71 pages
#baseurl = 'https://www.imdb.com/search/keyword/?keywords=based-on-novel&ref_=kw_nxt&mode=detail&page={}&title_type=movie&release_date=2000%2C2021&sort=year,desc'

#Second Pass:
#1) feature films, 2) tagged 'based on novel', 3) release from years 1970 to 2000: 6,262 titles - 126 pages
baseurl = 'https://www.imdb.com/search/keyword/?keywords=based-on-novel&ref_=kw_ref_yr&mode=detail&page={}&title_type=movie&release_date=1970%2C2000&sort=year,desc'

#initialize dataframe
popcorn = pd.DataFrame(columns = ['Link','Title', 'Release_Date', 'ViewRating', 'Runtime', 'Genres', 'Description'])

In [3]:
#little function to extract text from bs4 or return blank string
def makestring(bs4):
    if bs4 is None:
        return ' '
    else:
        return bs4.string
    
#function to scrape list page
def GetPage(baseurl,num):
    
    pop = pd.DataFrame(columns = ['Link','Title', 'Release_Date', 'ViewRating', 'Runtime', 'Genres', 'Description'])
    
    page = requests.get(baseurl.format(num))
    soup = BeautifulSoup(page.content, 'html.parser')
    main = soup.find_all(attrs={'class':'lister-item mode-detail'})
    #print(len(main))

    for p in range(len(main)):
        #get title
        link = main[p].a['href']
        title = main[p].h3.a.string
        #get description
        desc = main[p].find_all('p')
        desc = desc[1].string
        #get a bunch of other things that may or may not be there  
        year = makestring(main[p].find(attrs={'class':'lister-item-year text-muted unbold'}))
        rating = makestring(main[p].find(attrs={'class':'certificate'}))
        length = makestring(main[p].find(attrs={'class':'runtime'}))
        genre = makestring(main[p].find(attrs={'class':'genre'}))
        gotit = [link,title,year,rating,length,genre,desc]
        pop.loc[len(pop)] = gotit

    #print(pop.info())
    return pop

In [4]:
#each page has 50 entries
##First pass: loop to get a 71 pages
##Second pass: loop to get 126 pages

#hard coded here as 2 pages
for page in range(2):
    popper = GetPage(baseurl,page+1)
    popcorn = popcorn.append(popper)

popcorn['Genres'] = popcorn['Genres'].str.strip()
popcorn['Description'] = popcorn['Description'].str.strip()

print(popcorn.info())
#print(popcorn.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 49
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Link          100 non-null    object
 1   Title         100 non-null    object
 2   Release_Date  100 non-null    object
 3   ViewRating    100 non-null    object
 4   Runtime       100 non-null    object
 5   Genres        100 non-null    object
 6   Description   79 non-null     object
dtypes: object(7)
memory usage: 6.2+ KB
None


In [7]:
#export dataframe
#popcorn.to_csv('FilmList2.csv', index = False)

# Second Step: 
Take list and scrape full IMDB page

In [8]:
#quick function to get text from between brackets
#turns out I could have just used .string ><
def findtext(w):
    w = str(w)
    w = w[w.find('>')+1:w.find('<', w.find('>'), len(w))]
    return w

#quick function to get money - does not convert to string at this time
def findmoney(w):
    w = str(w)
    w = w[w.find('$')+1:w.find('<', w.find('$'), len(w))]
    return w

In [9]:
###THIS IS THE FULL SCRAPE OF A PAGE###

def Get_Movie(url):
    # Request the page and use BeautifulSoup to extract the contents
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    #get title
    title = soup.find(attrs={'data-testid':'hero-title-block__title'})
    title = findtext(title)

    #get release date and rating (PG, PG13, etc)
    DR = soup.find_all(attrs={'class':'TitleBlockMetaData__ListItemText-sc-12ein40-2 jedhex'})
    if DR is not None:
        if len(DR)==2:
            date = findtext(DR[0])
            rating = findtext(DR[1])
        else:
            date = findtext(DR)
            rating = None
    else:
        date = None
        rating = None

    #get a list of genres
    gen = soup.find(attrs={'data-testid':'storyline-genres'}) 
    if gen is not None:
        gen = gen.find_all('a')
        genres =''
        for t in range(len(gen)):
            genres = genres + findtext(gen[t]) + ' '
    else:
        genres = None

    #get box office info
    #budget often has '(estimated)' in it
    box = soup.find(attrs={'data-testid':'title-boxoffice-section'})
    if box is not None:
        budget = box.find(attrs={'data-testid':'title-boxoffice-budget'}) 
        budget = str(budget)
        budget = findmoney(budget)
        dom = box.find(attrs={'data-testid':'title-boxoffice-grossdomestic'})
        dom = findmoney(dom)
        ww = soup.find(attrs={'data-testid':'title-boxoffice-cumulativeworldwidegross'}) 
        ww = findmoney(ww)
    else:
        budget = None
        dom = None
        ww = None

    
    story = soup.find(attrs={'data-testid':'Storyline'}) 
    if story is not None:
        #get the description
        desc = story.find(attrs={'class':'ipc-html-content ipc-html-content--base'})
        if desc is not None:
            desc = desc.div
            desc = findtext(desc)
        else:
            desc = ''
        #get top tags
        #only shows top few tags, usually list tag is to full list which can be in hundreds
        kw = story.find(attrs={'data-testid':'storyline-plot-keywords'}) 
        if kw is not None:
            kw = kw.find_all('span')
            keywords = ''
            for t in range(len(kw)):
                #keywords.append(findtext(kw[t]))
                keywords = keywords + findtext(kw[t]) + ' '
        else:
            keywords = None
    else:
        desc = None
        keywords = None

    #get runtime in minutes
    rt = soup.find(attrs={'data-testid':'title-techspec_runtime'}) 
    if rt is not None:
        rt = rt.find('div')
        hrs = findtext(rt)
        rt = str(rt)
        mins = rt[::-1]
        mins = mins[7:]
        mins = mins[mins.find('<')+1:mins.find('>', mins.find('<'), len(mins))]
        if hrs==' ':
            runtime = int(mins)
        elif mins==' ':
            runtime = int(hrs)*60
        else:
            runtime = int(hrs)*60 + int(mins)
    else:
        runtime = None

    #put all that info in a list and return it 
    obs = [title,date,rating, genres, budget,dom,ww,desc,keywords, runtime]
    
    return obs

In [10]:
#ok now pull in our list of movies
#movies = pd.read_csv(r'C:\Users\maria\OneDrive\Documents\AIPI510\BooksToMoviesML\FilmList.csv')   <--- original 2000-2021 list
#movies = pd.read_csv(r'C:\Users\maria\OneDrive\Documents\AIPI510\BooksToMoviesML\FilmList2.csv')  <--- original 1970-2000 list
#movies = movies['Link']

#to use full script instead of files
movies = popcorn['Link']

#instantiate dataframe
data = pd.DataFrame(columns = ['Title', 'Release_Date', 'ViewRating', 'Genres', 'Budget', 'DomesticGross', 
                              'WorldwideGross', 'Description', 'Keywords', 'Runtime'])

#scrape provided pages
#this counter is because i had to run it in batches
#movies = movies[4610:]

for this in movies:
    #print(this)
    obs = Get_Movie('https://www.imdb.com{}'.format(this))
    data.loc[len(data)] = obs
    
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Title           100 non-null    object
 1   Release_Date    100 non-null    object
 2   ViewRating      65 non-null     object
 3   Genres          100 non-null    object
 4   Budget          69 non-null     object
 5   DomesticGross   69 non-null     object
 6   WorldwideGross  69 non-null     object
 7   Description     100 non-null    object
 8   Keywords        100 non-null    object
 9   Runtime         100 non-null    object
dtypes: object(10)
memory usage: 8.6+ KB
None


In [None]:
#data.to_csv('New1652.csv',index=False)
#2106 first pull, 2505 next pull, 1652 last pull

In [11]:
#this cell is to combine different batch files
one = pd.read_csv('New2106.csv')
two = pd.read_csv('New2505.csv')
three = pd.read_csv('New1652.csv')

together = pd.concat([one,two,three])
together.drop_duplicates(inplace=True, ignore_index=True)

print(together.info())
#together.to_csv('IMDbScrape2.csv',index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'New2106.csv'