## Web Scraping with Beautiful Soup

In [2]:
import pandas as pd

from bs4 import BeautifulSoup
from urllib.request import urlopen

## I. Bechdel Test website

In [3]:
base_url = 'https://bechdeltest.com/'

# Open the connection and read the raw HTML bytes of the landing page.
# The with-block closes the connection even when read() raises, so the
# underlying socket is never left open.
with urlopen(base_url) as client:
    # store html in variable
    base_html = client.read()

In [4]:
# Build the parse tree from the downloaded bytes using the parser that
# ships with the Python standard library.
soup = BeautifulSoup(markup=base_html, features='html.parser')

In [5]:
soup.h1

<h1 style="margin-bottom: 5px"><a href="/" style="text-decoration:none;">Bechdel Test Movie List</a></h1>

In [6]:
soup.p

<p>The <b>Bechdel Test</b>, sometimes called the <i>Mo Movie Measure</i> or <i>Bechdel Rule</i> is a simple test which names the following three criteria: (1) it has to have at least two women in it, who (2) who talk to each other, about (3) something besides a man. The test was popularized by <b>Alison Bechdel</b>'s comic <b>Dykes to Watch Out For</b>, in a 1985 strip called <a href="http://alisonbechdel.blogspot.com/2005/08/rule.html"><i>The Rule</i></a>. For a nice video introduction to the subject please check out <a href="http://www.feministfrequency.com/2009/12/the-bechdel-test-for-women-in-movies/">The Bechdel Test for Women in Movies</a> on <a href="http://www.feministfrequency.com/">feministfrequency.com</a>.</p>

In [7]:
# grab each movie: every listing on the page is a <div class="movie">
# (find_all is the current Beautiful Soup 4 spelling; findAll is a legacy alias)
movies = soup.find_all('div', class_='movie')

In [8]:
len(movies)

200

In [9]:
movies[0]

<div class="movie"><a href="http://us.imdb.com/title/tt0437086/"><img alt="[[3]]" src="/static/pass.png" title="[There are two or more women in this movie and they talk to each other about something other than a man]"/></a>
<a href="/view/8642/alita:_battle_angel/" id="movie-8642">Alita: Battle Angel</a> <a href="/view/8642/alita:_battle_angel/" onclick="showComments('8642'); return false;"><img alt="[1 comment(s) available]" id="comment-img-8642" src="/static/comments.png" style="height: 10px; width: 10px;" title="1 comment"/></a> </div>

In [10]:
# movies object contains imdb id (within imdb link), Bechdel Test score, 'pass' or 'nopass', movie title in 3 
# different formats, and number of comments
movie = movies[0]

In [34]:
len(movie.contents)

6

In [35]:
movie.contents

[<a href="http://us.imdb.com/title/tt0437086/"><img alt="[[3]]" src="/static/pass.png" title="[There are two or more women in this movie and they talk to each other about something other than a man]"/></a>,
 '\n',
 <a href="/view/8642/alita:_battle_angel/" id="movie-8642">Alita: Battle Angel</a>,
 ' ',
 <a href="/view/8642/alita:_battle_angel/" onclick="showComments('8642'); return false;"><img alt="[1 comment(s) available]" id="comment-img-8642" src="/static/comments.png" style="height: 10px; width: 10px;" title="1 comment"/></a>,
 ' ']

In [47]:
movie.parent.h3.a['id'].split('-')[1]

'2019'

In [39]:
movies[1].contents[2].text

'American Woman'

In [41]:
movies[75].contents[2].text.strip()

'Hold the Dark'

In [11]:
movie.a

<a href="http://us.imdb.com/title/tt0437086/"><img alt="[[3]]" src="/static/pass.png" title="[There are two or more women in this movie and they talk to each other about something other than a man]"/></a>

In [16]:
movie.a.img

<img alt="[[3]]" src="/static/pass.png" title="[There are two or more women in this movie and they talk to each other about something other than a man]"/>

In [18]:
movie.a.img['alt']

'[[3]]'

In [19]:
# yessss! the score!!! 
movie.a.img['alt'][2]

'3'

In [20]:
movie.a['href']

'http://us.imdb.com/title/tt0437086/'

In [21]:
movie.a['href'].split('/')

['http:', '', 'us.imdb.com', 'title', 'tt0437086', '']

In [22]:
movie.a['href'].split('/')[4]

'tt0437086'

In [28]:
movie.a.img['src'].split('/')[2].split('.')[0]

'pass'

In [17]:
movies[3]

<div class="movie"><a href="http://us.imdb.com/title/tt5719748/"><img alt="[[1]]" src="/static/nopass.png" title="[There are two or more women in this movie, but they don't talk to each other]"/></a>
<a href="/view/8639/cold_pursuit/" id="movie-8639">Cold Pursuit</a> <a href="/view/8639/cold_pursuit/" onclick="showComments('8639'); return false;"><img alt="[1 comment(s) available]" id="comment-img-8639" src="/static/comments.png" style="height: 10px; width: 10px;" title="1 comment"/></a> </div>

In [29]:
movies[3].a.img['src'].split('/')[2].split('.')[0]

'nopass'

In [48]:
# NOTE: .parent.h3 resolves to the first <h3> found under the shared parent,
# not the heading of this movie's own section -- so it reports '2019' here
# even though movies[100] is not a 2019 entry. Do not use this for the year.
movies[100].parent.h3.a['id'].split('-')[1]

'2019'

In [66]:
# find_previous('h3') searches backwards from this movie for the nearest
# preceding <h3>; the part of its anchor id after the '-' is the year.
movies[100].find_previous('h3').a['id'].split('-')[1]

'2018'

In [67]:
movies[199].find_previous('h3').a['id'].split('-')[1]

'2017'

In [58]:
int(movies[10].a.img['alt'][2])

3

In [49]:
movies[100].contents[2].text.strip()

'The Miseducation of Cameron Post'

In [None]:
# could write directly to csv, but I'm not going to
# filename = 'bechdel_test_movies.csv'
# f = open(filename, 'w')
# headers = 'imdb_id, imdb_link, test_score, passing, title\n'
# f.write(headers)

# build loop
# if writing to pandas df, change year and test_score to integers

# for movie in movies:
#     year = movie.find_previous('h3').a['id'].split('-')[1]
#     title = movie.contents[2].text.strip()
#     test_score = movie.a.img['alt'][2]
#     passing = movie.a.img['src'].split('/')[2].split('.')[0]
#     imdb_id = movie.a['href'].split('/')[4]
#     imdb_link = movie.a['href']
    
    # f.write(imdb_id + ',' + imdb_link + ',' + test_score + ',' + passing + ',' + title.replace(',','|') + '\n')
    # if a title has any punctuation. . . this may not work, should probably go into a dataframe
# f.close()

In [69]:
# test run on page one (sorting by year)

def _movie_fields(entry):
    """Extract the scraped fields from one ``<div class="movie">`` tag.

    Returns a 6-tuple, in this order:
        year       -- int, taken from the id of the nearest preceding <h3> anchor
        title      -- str, text of the third child node of the div, stripped
        score      -- int, the character at index 2 of the badge image's alt text
        passing    -- str, file stem of the badge image ('pass' or 'nopass')
        imdb_id    -- str, fifth '/'-separated piece of the IMDb link
        imdb_link  -- str, the IMDb link itself
    """
    imdb_anchor = entry.a          # first anchor in the div: the IMDb link
    badge = imdb_anchor.img        # pass/nopass icon nested inside that link
    heading = entry.find_previous('h3')

    return (
        int(heading.a['id'].split('-')[1]),
        entry.contents[2].text.strip(),
        int(badge['alt'][2]),
        badge['src'].split('/')[2].split('.')[0],
        imdb_anchor['href'].split('/')[4],
        imdb_anchor['href'],
    )


years, movie_titles, test_scores = [], [], []
pass_or_no_pass, imdb_ids, imdb_links = [], [], []

# Same order as the tuple returned by _movie_fields.
_columns = (years, movie_titles, test_scores,
            pass_or_no_pass, imdb_ids, imdb_links)

for movie in movies:
    for column, value in zip(_columns, _movie_fields(movie)):
        column.append(value)

In [71]:
# Combine the parallel per-movie lists into a single table; dict insertion
# order fixes the column order of the resulting frame.
column_data = {
    'year': years,
    'title': movie_titles,
    'score': test_scores,
    'passing': pass_or_no_pass,
    'imdb_id': imdb_ids,
    'imdb_link': imdb_links,
}
test_df = pd.DataFrame(column_data)

In [72]:
test_df.head()

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
0,2019,Alita: Battle Angel,3,pass,tt0437086,http://us.imdb.com/title/tt0437086/
1,2019,American Woman,3,pass,tt9109492,http://us.imdb.com/title/tt9109492/
2,2019,Close,3,pass,tt5316540,http://us.imdb.com/title/tt5316540/
3,2019,Cold Pursuit,1,nopass,tt5719748,http://us.imdb.com/title/tt5719748/
4,2019,How to Train Your Dragon: The Hidden World,2,nopass,tt2386490,http://us.imdb.com/title/tt2386490/
