In [1]:
#Maria Williams - Nov 2021
#IMDB scraper: uses list of page links generated by IMDbListScraper

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re

In [2]:
#quick function to get text from between brackets
#turns out I could have just used .string ><
def findtext(w):
    w = str(w)
    w = w[w.find('>')+1:w.find('<', w.find('>'), len(w))]
    return w

#quick function to get money - does not convert to string at this time
def findmoney(w):
    w = str(w)
    w = w[w.find('$')+1:w.find('<', w.find('$'), len(w))]
    return w

In [13]:
###THIS IS THE FULL SCRAPE OF A PAGE###

def Get_Movie(url):
    # Request the page and use BeautifulSoup to extract the contents
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    #get title
    title = soup.find(attrs={'data-testid':'hero-title-block__title'})
    title = findtext(title)

    #get release date and rating (PG, PG13, etc)
    DR = soup.find_all(attrs={'class':'TitleBlockMetaData__ListItemText-sc-12ein40-2 jedhex'})
    if DR is not None:
        if len(DR)==2:
            date = findtext(DR[0])
            rating = findtext(DR[1])
        else:
            date = findtext(DR)
            rating = None
    else:
        date = None
        rating = None

    #get a list of genres
    gen = soup.find(attrs={'data-testid':'storyline-genres'}) 
    if gen is not None:
        gen = gen.find_all('a')
        genres =''
        for t in range(len(gen)):
            genres = genres + findtext(gen[t]) + ' '
    else:
        genres = None

    #get box office info
    #budget often has '(estimated)' in it
    box = soup.find(attrs={'data-testid':'title-boxoffice-section'})
    if box is not None:
        budget = box.find(attrs={'data-testid':'title-boxoffice-budget'}) 
        budget = str(budget)
        budget = findmoney(budget)
        dom = box.find(attrs={'data-testid':'title-boxoffice-grossdomestic'})
        dom = findmoney(dom)
        ww = soup.find(attrs={'data-testid':'title-boxoffice-cumulativeworldwidegross'}) 
        ww = findmoney(ww)
    else:
        budget = None
        dom = None
        ww = None

    
    story = soup.find(attrs={'data-testid':'Storyline'}) 
    if story is not None:
        #get the description
        desc = story.find(attrs={'class':'ipc-html-content ipc-html-content--base'})
        if desc is not None:
            desc = desc.div
            desc = findtext(desc)
        else:
            desc = ''
        #get top tags
        #only shows top few tags, usually list tag is to full list which can be in hundreds
        kw = story.find(attrs={'data-testid':'storyline-plot-keywords'}) 
        if kw is not None:
            kw = kw.find_all('span')
            keywords = ''
            for t in range(len(kw)):
                #keywords.append(findtext(kw[t]))
                keywords = keywords + findtext(kw[t]) + ' '
            #if 'more' in keywords[-1]:
                #keywords = keywords[:-1]
        else:
            keywords = None
    else:
        desc = None
        keywords = None

    #get runtime in minutes
    rt = soup.find(attrs={'data-testid':'title-techspec_runtime'}) 
    if rt is not None:
        rt = rt.find('div')
        hrs = findtext(rt)
        rt = str(rt)
        mins = rt[::-1]
        mins = mins[7:]
        mins = mins[mins.find('<')+1:mins.find('>', mins.find('<'), len(mins))]
        if hrs==' ':
            runtime = int(mins)
        elif mins==' ':
            runtime = int(hrs)*60
        else:
            runtime = int(hrs)*60 + int(mins)
    else:
        runtime = None

    #put all that info in a list and return it 
    obs = [title,date,rating, genres, budget,dom,ww,desc,keywords, runtime]
    
    return obs

In [18]:
#ok now pull in our list of movies
#movies = pd.read_csv(r'C:\Users\maria\OneDrive\Documents\AIPI510\BooksToMoviesML\FilmList.csv') <--- original 2000-2021 list
movies = pd.read_csv(r'C:\Users\maria\OneDrive\Documents\AIPI510\BooksToMoviesML\FilmList2.csv')
movies = movies['Link']

#instantiate dataframe
data = pd.DataFrame(columns = ['Title', 'Release_Date', 'ViewRating', 'Genres', 'Budget', 'DomesticGross', 
                              'WorldwideGross', 'Description', 'Keywords', 'Runtime'])

In [19]:
#scrape provided pages
#this counter is because i had to run it in batches
movies = movies[4610:]

for this in movies:
    #print(this)
    obs = Get_Movie('https://www.imdb.com{}'.format(this))
    data.loc[len(data)] = obs
        

In [20]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1652 entries, 0 to 1651
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Title           1652 non-null   object
 1   Release_Date    1652 non-null   object
 2   ViewRating      635 non-null    object
 3   Genres          1595 non-null   object
 4   Budget          327 non-null    object
 5   DomesticGross   327 non-null    object
 6   WorldwideGross  327 non-null    object
 7   Description     1652 non-null   object
 8   Keywords        1652 non-null   object
 9   Runtime         1600 non-null   object
dtypes: object(10)
memory usage: 142.0+ KB
None


In [21]:
data.to_csv('New1652.csv',index=False)
#2106 first pull, 2505 next pull, 1652 last pull

In [23]:
#this cell is just to slap everything together since I had trouble running the full scrape

one = pd.read_csv('New2106.csv')
two = pd.read_csv('New2505.csv')
three = pd.read_csv('New1652.csv')

together = pd.concat([one,two,three])
#together = together.applymap(str)
together.drop_duplicates(inplace=True, ignore_index=True)

print(together.info())

together.to_csv('IMDbScrape2.csv',index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6262 entries, 0 to 6261
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Title           6262 non-null   object 
 1   Release_Date    6262 non-null   int64  
 2   ViewRating      2309 non-null   object 
 3   Genres          6051 non-null   object 
 4   Budget          1689 non-null   object 
 5   DomesticGross   1689 non-null   object 
 6   WorldwideGross  1689 non-null   object 
 7   Description     5197 non-null   object 
 8   Keywords        6262 non-null   object 
 9   Runtime         6036 non-null   float64
dtypes: float64(1), int64(1), object(8)
memory usage: 489.3+ KB
None
