## Web Scraping with Beautiful Soup

In [2]:
import pandas as pd

from bs4 import BeautifulSoup
from urllib.request import urlopen

## I. Bechdel Test website

#### A. First Page Trial Run

In [3]:
base_url = 'https://bechdeltest.com/'

# Open the connection and store the raw HTML of the landing page.
# The `with` block closes the connection on exit, including when read() raises,
# so no explicit close() call is needed.
with urlopen(base_url) as client:
    base_html = client.read()

In [4]:
# parse the downloaded page into a navigable tree using the 'html.parser' backend
soup = BeautifulSoup(base_html, 'html.parser')

In [5]:
soup.h1

<h1 style="margin-bottom: 5px"><a href="/" style="text-decoration:none;">Bechdel Test Movie List</a></h1>

In [6]:
soup.p

<p>The <b>Bechdel Test</b>, sometimes called the <i>Mo Movie Measure</i> or <i>Bechdel Rule</i> is a simple test which names the following three criteria: (1) it has to have at least two women in it, who (2) who talk to each other, about (3) something besides a man. The test was popularized by <b>Alison Bechdel</b>'s comic <b>Dykes to Watch Out For</b>, in a 1985 strip called <a href="http://alisonbechdel.blogspot.com/2005/08/rule.html"><i>The Rule</i></a>. For a nice video introduction to the subject please check out <a href="http://www.feministfrequency.com/2009/12/the-bechdel-test-for-women-in-movies/">The Bechdel Test for Women in Movies</a> on <a href="http://www.feministfrequency.com/">feministfrequency.com</a>.</p>

In [7]:
# grab each movie: every entry on the page is a <div class="movie">
# (find_all is the current Beautiful Soup name; findAll is the legacy alias)
movies = soup.find_all('div', class_='movie')

In [8]:
len(movies)

200

In [9]:
movies[0]

<div class="movie"><a href="http://us.imdb.com/title/tt0437086/"><img alt="[[3]]" src="/static/pass.png" title="[There are two or more women in this movie and they talk to each other about something other than a man]"/></a>
<a href="/view/8642/alita:_battle_angel/" id="movie-8642">Alita: Battle Angel</a> <a href="/view/8642/alita:_battle_angel/" onclick="showComments('8642'); return false;"><img alt="[1 comment(s) available]" id="comment-img-8642" src="/static/comments.png" style="height: 10px; width: 10px;" title="1 comment"/></a> </div>

In [10]:
# movies object contains imdb id (within imdb link), Bechdel Test score, 'pass' or 'nopass', movie title in 3 
# different formats, and number of comments
movie = movies[0]

In [34]:
len(movie.contents)

6

In [35]:
movie.contents

[<a href="http://us.imdb.com/title/tt0437086/"><img alt="[[3]]" src="/static/pass.png" title="[There are two or more women in this movie and they talk to each other about something other than a man]"/></a>,
 '\n',
 <a href="/view/8642/alita:_battle_angel/" id="movie-8642">Alita: Battle Angel</a>,
 ' ',
 <a href="/view/8642/alita:_battle_angel/" onclick="showComments('8642'); return false;"><img alt="[1 comment(s) available]" id="comment-img-8642" src="/static/comments.png" style="height: 10px; width: 10px;" title="1 comment"/></a>,
 ' ']

In [47]:
# NOTE: `.parent.h3` is the FIRST <h3> found inside the parent element, not the
# heading this particular movie sits under. The same lookup on movies[100] also
# gives '2019', while movies[100].find_previous('h3') gives '2018' -- so
# find_previous('h3') is the lookup to use for the year.
movie.parent.h3.a['id'].split('-')[1]

'2019'

In [39]:
movies[1].contents[2].text

'American Woman'

In [41]:
movies[75].contents[2].text.strip()

'Hold the Dark'

In [11]:
movie.a

<a href="http://us.imdb.com/title/tt0437086/"><img alt="[[3]]" src="/static/pass.png" title="[There are two or more women in this movie and they talk to each other about something other than a man]"/></a>

In [16]:
movie.a.img

<img alt="[[3]]" src="/static/pass.png" title="[There are two or more women in this movie and they talk to each other about something other than a man]"/>

In [18]:
movie.a.img['alt']

'[[3]]'

In [19]:
# yessss! the score!!! 
movie.a.img['alt'][2]

'3'

In [20]:
movie.a['href']

'http://us.imdb.com/title/tt0437086/'

In [21]:
movie.a['href'].split('/')

['http:', '', 'us.imdb.com', 'title', 'tt0437086', '']

In [22]:
movie.a['href'].split('/')[4]

'tt0437086'

In [28]:
movie.a.img['src'].split('/')[2].split('.')[0]

'pass'

In [17]:
movies[3]

<div class="movie"><a href="http://us.imdb.com/title/tt5719748/"><img alt="[[1]]" src="/static/nopass.png" title="[There are two or more women in this movie, but they don't talk to each other]"/></a>
<a href="/view/8639/cold_pursuit/" id="movie-8639">Cold Pursuit</a> <a href="/view/8639/cold_pursuit/" onclick="showComments('8639'); return false;"><img alt="[1 comment(s) available]" id="comment-img-8639" src="/static/comments.png" style="height: 10px; width: 10px;" title="1 comment"/></a> </div>

In [29]:
movies[3].a.img['src'].split('/')[2].split('.')[0]

'nopass'

In [48]:
movies[100].parent.h3.a['id'].split('-')[1]

'2019'

In [66]:
movies[100].find_previous('h3').a['id'].split('-')[1]

'2018'

In [67]:
movies[199].find_previous('h3').a['id'].split('-')[1]

'2017'

In [58]:
int(movies[10].a.img['alt'][2])

3

In [49]:
movies[100].contents[2].text.strip()

'The Miseducation of Cameron Post'

In [None]:
# could write directly to csv, but I'm not going to
# filename = 'bechdel_test_movies.csv'
# f = open(filename, 'w')
# headers = 'imdb_id,imdb_link,test_score,passing,title\n'
# f.write(headers)

# build loop
# if writing to pandas df, change year and test_score to integers

# for movie in movies:
#     year = movie.find_previous('h3').a['id'].split('-')[1]
#     title = movie.contents[2].text.strip()
#     test_score = movie.a.img['alt'][2]
#     passing = movie.a.img['src'].split('/')[2].split('.')[0]
#     imdb_id = movie.a['href'].split('/')[4]
#     imdb_link = movie.a['href']

    # f.write(imdb_id + ',' + imdb_link + ',' + test_score + ',' + passing + ',' + title.replace(',','|') + '\n')
    # a title containing a comma would break this hand-rolled CSV (hence the replace); a dataframe is the safer route
# f.close()

In [69]:
# Trial extraction on the first page only (the site lists movies by year).
# Six parallel lists are filled, one entry per <div class="movie">.

years, movie_titles, test_scores = [], [], []
pass_or_no_pass, imdb_ids, imdb_links = [], [], []

for movie in movies:

    # year: second '-'-separated piece of the id on the link inside the
    # nearest preceding <h3>
    year_anchor = movie.find_previous('h3').a
    years.append(int(year_anchor['id'].split('-')[1]))

    # title: text of the third child node of the div
    movie_titles.append(movie.contents[2].text.strip())

    # the first <a> in the div is the IMDb link wrapping the rating icon
    rating_link = movie.a
    rating_icon = rating_link.img

    # alt text looks like '[[3]]' -> the character at index 2 is the score
    test_scores.append(int(rating_icon['alt'][2]))

    # src looks like '/static/pass.png' -> keep the bare file name ('pass' / 'nopass')
    icon_file = rating_icon['src'].split('/')[2]
    pass_or_no_pass.append(icon_file.split('.')[0])

    # href looks like 'http://us.imdb.com/title/tt0437086/' -> piece 4 is the id
    imdb_ids.append(rating_link['href'].split('/')[4])
    imdb_links.append(rating_link['href'])

In [71]:
# Combine the six parallel lists into one frame; pairing order fixes the column order.
column_names = ['year', 'title', 'score', 'passing', 'imdb_id', 'imdb_link']
column_values = [years, movie_titles, test_scores, pass_or_no_pass, imdb_ids, imdb_links]
test_df = pd.DataFrame(dict(zip(column_names, column_values)))

In [72]:
test_df.head()

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
0,2019,Alita: Battle Angel,3,pass,tt0437086,http://us.imdb.com/title/tt0437086/
1,2019,American Woman,3,pass,tt9109492,http://us.imdb.com/title/tt9109492/
2,2019,Close,3,pass,tt5316540,http://us.imdb.com/title/tt5316540/
3,2019,Cold Pursuit,1,nopass,tt5719748,http://us.imdb.com/title/tt5719748/
4,2019,How to Train Your Dragon: The Hidden World,2,nopass,tt2386490,http://us.imdb.com/title/tt2386490/


In [73]:
test_df.tail()

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
195,2017,Call Me by Your Name,3,pass,tt5726616,http://us.imdb.com/title/tt5726616/
196,2017,Canaries,3,pass,tt4082644,http://us.imdb.com/title/tt4082644/
197,2017,Captain Underpants: The First Epic Movie,1,nopass,tt2091256,http://us.imdb.com/title/tt2091256/
198,2017,Cars 3,3,pass,tt3606752,http://us.imdb.com/title/tt3606752/
199,2017,Cherry Pop,3,pass,tt4807950,http://us.imdb.com/title/tt4807950/


In [74]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
year         200 non-null int64
title        200 non-null object
score        200 non-null int64
passing      200 non-null object
imdb_id      200 non-null object
imdb_link    200 non-null object
dtypes: int64(2), object(4)
memory usage: 9.5+ KB


#### B. Scrape Entire Site

In [82]:
from time import sleep, time
from random import randint
from requests import get
from warnings import warn

# warn("Warning Simulation")

In [77]:
# page format - first page was page 0 I guess
# https://bechdeltest.com/?page=1
# last page is page 40
# use clear_output(wait=True) at end of loop if scraping something really big
# from IPython.display import clear_output

In [80]:
# Dry run of the pacing / progress-report logic only -- nothing is downloaded here.
# The real crawl needs 41 iterations (range(41)); 5 are enough to check the output.

start = time()
requests = 0

# start from empty result lists
years, movie_titles, test_scores = [], [], []
pass_or_no_pass, imdb_ids, imdb_links = [], [], []

for r in range(5):
    # stand-in for the request: count it, then pause a random 1-5 seconds
    requests = requests + 1
    sleep(randint(1,5))

    # running average since `start`
    elapsed = time() - start
    print(f'Request: {requests} Frequency: {round(requests/elapsed, 3)} requests/second')

Request: 1 Frequency: 0.333 requests/second
Request: 2 Frequency: 0.499 requests/second
Request: 3 Frequency: 0.374 requests/second
Request: 4 Frequency: 0.363 requests/second
Request: 5 Frequency: 0.357 requests/second


In [83]:
# Crawl the whole site: the landing page plus ?page=1 .. ?page=LAST_PAGE.
# Results accumulate in six parallel module-level lists, so whatever was
# collected so far survives an interruption.

LAST_PAGE = 40                 # last ?page= value on the site
MAX_REQUESTS = LAST_PAGE + 2   # safety valve if the URL list is ever extended
REQUEST_TIMEOUT = 30           # seconds; without a timeout a stalled connection blocks forever


def _parse_movie(movie):
    """Extract one record from a ``<div class="movie">`` tag.

    Returns a tuple ``(year, title, score, passing, imdb_id, imdb_link)``:

    * year      -- int, second '-'-separated piece of the id on the link inside
                   the nearest preceding ``<h3>``
    * title     -- stripped text of the div's third child node
    * score     -- int, character at index 2 of the rating icon's alt text
                   (e.g. '[[3]]' -> 3)
    * passing   -- rating icon file name without extension ('pass' / 'nopass')
    * imdb_id   -- piece 4 of the IMDb href split on '/' (e.g. 'tt0437086')
    * imdb_link -- the IMDb href itself

    All fields are computed before anything is returned, so a div with an
    unexpected layout raises without leaving the result lists at unequal lengths.
    """
    year = movie.find_previous('h3').a['id'].split('-')[1]
    title = movie.contents[2].text.strip()

    # the first <a> in the div is the IMDb link wrapping the rating icon
    rating_link = movie.a
    rating_icon = rating_link.img

    test_score = rating_icon['alt'][2]
    passing = rating_icon['src'].split('/')[2].split('.')[0]
    imdb_link = rating_link['href']
    imdb_id = imdb_link.split('/')[4]

    return int(year), title, int(test_score), passing, imdb_id, imdb_link


# reset lists

years = []
movie_titles = []
test_scores = []
pass_or_no_pass = []
imdb_ids = []
imdb_links = []

start = time()
requests = 0

base_url = 'https://bechdeltest.com/'

# the first page is fetched from the bare URL, the rest via ?page=N
page_urls = [base_url] + [base_url + '?page=' + str(i) for i in range(1, LAST_PAGE + 1)]

for url in page_urls:

    response = get(url, timeout=REQUEST_TIMEOUT)

    # I swear I am not a robot beep boop beep
    sleep(randint(8,15))

    requests += 1
    elapsed = time() - start
    print(f'Request: {requests} Frequency: {round(requests/elapsed, 3)} requests/second')

    # warning for non-200 status codes
    page_ok = response.status_code == 200
    if not page_ok:
        warn(f'Request: {requests} Status code: {response.status_code}')

    if requests > MAX_REQUESTS:
        warn('Number of requests greater than expected.')
        break

    # an error page is not a movie listing -- skip it instead of parsing it
    if not page_ok:
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    movies = soup.find_all('div', {'class': 'movie'})

    for movie in movies:
        year, title, test_score, passing, imdb_id, imdb_link = _parse_movie(movie)

        years.append(year)
        movie_titles.append(title)
        test_scores.append(test_score)
        pass_or_no_pass.append(passing)
        imdb_ids.append(imdb_id)
        imdb_links.append(imdb_link)

Request: 1 Frequency: 0.065 requests/second
Request: 2 Frequency: 0.064 requests/second
Request: 3 Frequency: 0.062 requests/second
Request: 4 Frequency: 0.067 requests/second
Request: 5 Frequency: 0.068 requests/second
Request: 6 Frequency: 0.067 requests/second
Request: 7 Frequency: 0.068 requests/second
Request: 8 Frequency: 0.066 requests/second
Request: 9 Frequency: 0.066 requests/second
Request: 10 Frequency: 0.065 requests/second
Request: 11 Frequency: 0.064 requests/second
Request: 12 Frequency: 0.064 requests/second
Request: 13 Frequency: 0.064 requests/second
Request: 14 Frequency: 0.064 requests/second
Request: 15 Frequency: 0.064 requests/second
Request: 16 Frequency: 0.065 requests/second
Request: 17 Frequency: 0.066 requests/second
Request: 18 Frequency: 0.065 requests/second
Request: 19 Frequency: 0.065 requests/second
Request: 20 Frequency: 0.065 requests/second
Request: 21 Frequency: 0.065 requests/second
Request: 22 Frequency: 0.066 requests/second
Request: 23 Frequen

In [None]:
# add total elapsed time for next site

In [84]:
# Combine the six scraped lists into the full data set, one column per list.
scraped_columns = {}
scraped_columns['year'] = years
scraped_columns['title'] = movie_titles
scraped_columns['score'] = test_scores
scraped_columns['passing'] = pass_or_no_pass
scraped_columns['imdb_id'] = imdb_ids
scraped_columns['imdb_link'] = imdb_links

bechdel_df = pd.DataFrame(scraped_columns)

In [85]:
bechdel_df.head(15)

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
0,2019,Alita: Battle Angel,3,pass,tt0437086,http://us.imdb.com/title/tt0437086/
1,2019,American Woman,3,pass,tt9109492,http://us.imdb.com/title/tt9109492/
2,2019,Close,3,pass,tt5316540,http://us.imdb.com/title/tt5316540/
3,2019,Cold Pursuit,1,nopass,tt5719748,http://us.imdb.com/title/tt5719748/
4,2019,How to Train Your Dragon: The Hidden World,2,nopass,tt2386490,http://us.imdb.com/title/tt2386490/
5,2019,The Kid Who Would Be King,1,nopass,tt6811018,http://us.imdb.com/title/tt6811018/
6,2019,The Lego Movie 2: The Second Part,3,pass,tt3513498,http://us.imdb.com/title/tt3513498/
7,2019,Polar,2,nopass,tt4139588,http://us.imdb.com/title/tt4139588/
8,2019,Serenity,1,nopass,tt6476140,http://us.imdb.com/title/tt6476140/
9,2019,Soni,3,pass,tt6078866,http://us.imdb.com/title/tt6078866/


In [86]:
bechdel_df.tail()

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
8047,1896,Une nuit terrible,0,nopass,tt0000131,http://us.imdb.com/title/tt0000131/
8048,1895,"The Execution of Mary, Queen of Scots",0,nopass,tt0132134,http://us.imdb.com/title/tt0132134/
8049,1895,Tables Turned on the Gardener,0,nopass,tt0000014,http://us.imdb.com/title/tt0000014/
8050,1892,Pauvre Pierrot,0,nopass,tt0000003,http://us.imdb.com/title/tt0000003/
8051,1888,Roundhay Garden Scene,0,nopass,tt0392728,http://us.imdb.com/title/tt0392728/


In [87]:
bechdel_df.shape

(8052, 6)

In [88]:
bechdel_df.to_csv('my_data/bechdel_movies.csv', index=False)

In [None]:
# aw man I forgot to change the passing column to binary

In [89]:
# Recode the pass/fail label as a 0/1 flag and assign the column back explicitly.
# An in-place replace() on `bechdel_df.passing` acts on a Series pulled out of the
# frame; pandas 2.2 warns about it, and under Copy-on-Write bechdel_df stays unchanged.
# infer_objects() turns the resulting object column into integers without relying
# on replace()'s deprecated implicit downcasting. Re-running the cell is harmless.
bechdel_df['passing'] = bechdel_df['passing'].replace({'nopass': 0, 'pass': 1}).infer_objects()

In [90]:
bechdel_df.head()

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
0,2019,Alita: Battle Angel,3,1,tt0437086,http://us.imdb.com/title/tt0437086/
1,2019,American Woman,3,1,tt9109492,http://us.imdb.com/title/tt9109492/
2,2019,Close,3,1,tt5316540,http://us.imdb.com/title/tt5316540/
3,2019,Cold Pursuit,1,0,tt5719748,http://us.imdb.com/title/tt5719748/
4,2019,How to Train Your Dragon: The Hidden World,2,0,tt2386490,http://us.imdb.com/title/tt2386490/


In [95]:
dupes = bechdel_df.duplicated()

In [99]:
# print the row position of every entry flagged in `dupes`
for position, is_duplicate in enumerate(dupes):
    if is_duplicate:
        print(position)

2316
5220


In [101]:
bechdel_df.iloc[2316]

year                                        2011
title                     Last Call at the Oasis
score                                          3
passing                                        1
imdb_id                                tt2043900
imdb_link    http://us.imdb.com/title/tt2043900/
Name: 2316, dtype: object

In [102]:
bechdel_df.iloc[5220]

year                                        1997
title                                      Ayneh
score                                          3
passing                                        1
imdb_id                                tt0117056
imdb_link    http://us.imdb.com/title/tt0117056/
Name: 5220, dtype: object

In [103]:
bechdel_df.loc[bechdel_df.imdb_id == 'tt2043900']

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
2315,2011,Last Call at the Oasis,3,1,tt2043900,http://us.imdb.com/title/tt2043900/
2316,2011,Last Call at the Oasis,3,1,tt2043900,http://us.imdb.com/title/tt2043900/


In [104]:
bechdel_df.loc[bechdel_df.imdb_id == 'tt0117056']

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
5219,1997,Ayneh,3,1,tt0117056,http://us.imdb.com/title/tt0117056/
5220,1997,Ayneh,3,1,tt0117056,http://us.imdb.com/title/tt0117056/


In [105]:
# keep the first occurrence of each fully identical row (original index labels are kept)
bechdel_df = bechdel_df.drop_duplicates()

In [106]:
bechdel_df.shape

(8050, 6)

In [107]:
bechdel_df.duplicated().sum()

0

In [108]:
bechdel_df.to_csv('my_data/bechdel_test_movies.csv', index=False)