## Web Scraping with Beautiful Soup

In [1]:
import pandas as pd

from bs4 import BeautifulSoup
from random import randint
from requests import get
from time import sleep, time
from warnings import warn

## I. Bechdel Test website

#### A. First Page Trial Run

In [3]:
base_url = 'https://bechdeltest.com/'

# Fetch the landing page with `get` (imported in the first cell). The timeout keeps a
# stalled connection from hanging the kernel; the connection is released by requests
# once the body has been read, so there is nothing to close by hand.
response = get(base_url, timeout=30)

# fail loudly on an HTTP error instead of parsing an error page
response.raise_for_status()

# store html in variable (raw bytes; BeautifulSoup detects the encoding itself)
base_html = response.content

In [4]:
# parse html
soup = BeautifulSoup(base_html, 'html.parser')

In [5]:
soup.h1

<h1 style="margin-bottom: 5px"><a href="/" style="text-decoration:none;">Bechdel Test Movie List</a></h1>

In [6]:
soup.p

<p>The <b>Bechdel Test</b>, sometimes called the <i>Mo Movie Measure</i> or <i>Bechdel Rule</i> is a simple test which names the following three criteria: (1) it has to have at least two women in it, who (2) who talk to each other, about (3) something besides a man. The test was popularized by <b>Alison Bechdel</b>'s comic <b>Dykes to Watch Out For</b>, in a 1985 strip called <a href="http://alisonbechdel.blogspot.com/2005/08/rule.html"><i>The Rule</i></a>. For a nice video introduction to the subject please check out <a href="http://www.feministfrequency.com/2009/12/the-bechdel-test-for-women-in-movies/">The Bechdel Test for Women in Movies</a> on <a href="http://www.feministfrequency.com/">feministfrequency.com</a>.</p>

In [7]:
# grab each movie: every entry on the page is a <div class="movie">
# (find_all is the current Beautiful Soup name; findAll is a deprecated alias)
movies = soup.find_all('div', {'class': 'movie'})

In [8]:
len(movies)

200

In [9]:
movies[0]

<div class="movie"><a href="http://us.imdb.com/title/tt0437086/"><img alt="[[3]]" src="/static/pass.png" title="[There are two or more women in this movie and they talk to each other about something other than a man]"/></a>
<a href="/view/8642/alita:_battle_angel/" id="movie-8642">Alita: Battle Angel</a> <a href="/view/8642/alita:_battle_angel/" onclick="showComments('8642'); return false;"><img alt="[1 comment(s) available]" id="comment-img-8642" src="/static/comments.png" style="height: 10px; width: 10px;" title="1 comment"/></a> </div>

In [10]:
# movies object contains imdb id (within imdb link), Bechdel Test score, 'pass' or 'nopass', movie title in 3 
# different formats, and number of comments
movie = movies[0]

In [34]:
len(movie.contents)

6

In [35]:
movie.contents

[<a href="http://us.imdb.com/title/tt0437086/"><img alt="[[3]]" src="/static/pass.png" title="[There are two or more women in this movie and they talk to each other about something other than a man]"/></a>,
 '\n',
 <a href="/view/8642/alita:_battle_angel/" id="movie-8642">Alita: Battle Angel</a>,
 ' ',
 <a href="/view/8642/alita:_battle_angel/" onclick="showComments('8642'); return false;"><img alt="[1 comment(s) available]" id="comment-img-8642" src="/static/comments.png" style="height: 10px; width: 10px;" title="1 comment"/></a>,
 ' ']

In [47]:
# Year probe via the parent container. NOTE: `.parent.h3` is the FIRST <h3> inside the
# parent, so this is only right for movies listed under that first heading; the
# find_previous('h3') cells further down give the heading that precedes each movie.
movie.parent.h3.a['id'].split('-')[1]

'2019'

In [39]:
movies[1].contents[2].text

'American Woman'

In [41]:
movies[75].contents[2].text.strip()

'Hold the Dark'

In [11]:
movie.a

<a href="http://us.imdb.com/title/tt0437086/"><img alt="[[3]]" src="/static/pass.png" title="[There are two or more women in this movie and they talk to each other about something other than a man]"/></a>

In [16]:
movie.a.img

<img alt="[[3]]" src="/static/pass.png" title="[There are two or more women in this movie and they talk to each other about something other than a man]"/>

In [18]:
movie.a.img['alt']

'[[3]]'

In [19]:
# yessss! the score!!! 
movie.a.img['alt'][2]

'3'

In [20]:
movie.a['href']

'http://us.imdb.com/title/tt0437086/'

In [21]:
movie.a['href'].split('/')

['http:', '', 'us.imdb.com', 'title', 'tt0437086', '']

In [22]:
movie.a['href'].split('/')[4]

'tt0437086'

In [28]:
movie.a.img['src'].split('/')[2].split('.')[0]

'pass'

In [17]:
movies[3]

<div class="movie"><a href="http://us.imdb.com/title/tt5719748/"><img alt="[[1]]" src="/static/nopass.png" title="[There are two or more women in this movie, but they don't talk to each other]"/></a>
<a href="/view/8639/cold_pursuit/" id="movie-8639">Cold Pursuit</a> <a href="/view/8639/cold_pursuit/" onclick="showComments('8639'); return false;"><img alt="[1 comment(s) available]" id="comment-img-8639" src="/static/comments.png" style="height: 10px; width: 10px;" title="1 comment"/></a> </div>

In [29]:
movies[3].a.img['src'].split('/')[2].split('.')[0]

'nopass'

In [48]:
# Same probe on a later movie: `.parent.h3` still resolves to the first <h3> in the
# container, so the year printed here disagrees with the find_previous('h3') result in
# the next cell. This lookup is not usable for the per-movie year.
movies[100].parent.h3.a['id'].split('-')[1]

'2019'

In [66]:
movies[100].find_previous('h3').a['id'].split('-')[1]

'2018'

In [67]:
movies[199].find_previous('h3').a['id'].split('-')[1]

'2017'

In [58]:
int(movies[10].a.img['alt'][2])

3

In [49]:
movies[100].contents[2].text.strip()

'The Miseducation of Cameron Post'

In [None]:
# could write directly to csv, but I'm not going to
# filename = 'bechdel_test_movies.csv'
# f = open(filename, 'w')
# headers = 'imdb_id, imdb_link, test_score, passing, title\n'
# f.write(headers)

# build loop
# if writing to pandas df, change year and test_score to integers

# for movie in movies:
#     year = movie.find_previous('h3').a['id'].split('-')[1]
#     title = movie.contents[2].text.strip()
#     test_score = movie.a.img['alt'][2]
#     passing = movie.a.img['src'].split('/')[2].split('.')[0]
#     imdb_id = movie.a['href'].split('/')[4]
#     imdb_link = movie.a['href']
    
    # f.write(imdb_id + ',' + imdb_link + ',' + test_score + ',' + passing + ',' + title.replace(',','|') + '\n')
    # if a title has any punctuation. . . this may not work, should probably go into a dataframe
# f.close()

In [69]:
# test run on page one (sorting by year)
# Walks the scraped movie tags once and fills six parallel lists, one per column.

years, movie_titles, test_scores = [], [], []
pass_or_no_pass, imdb_ids, imdb_links = [], [], []

for movie in movies:

    # year: second '-'-separated piece of the anchor id in the nearest preceding <h3>
    heading_id = movie.find_previous('h3').a['id']
    year = heading_id.split('-')[1]

    # title: text of the third child node (the link to the movie's own page)
    title = movie.contents[2].text.strip()

    # the badge image carries the score in its alt text ('[[3]]') and the verdict in
    # its file name ('/static/pass.png' or '/static/nopass.png')
    badge = movie.a.img
    test_score = badge['alt'][2]
    passing = badge['src'].split('/')[2].split('.')[0]

    # the first link points at IMDb; the id is the fifth '/'-separated piece of the URL
    imdb_link = movie.a['href']
    imdb_id = imdb_link.split('/')[4]

    years.append(int(year))
    movie_titles.append(title)
    test_scores.append(int(test_score))
    pass_or_no_pass.append(passing)
    imdb_ids.append(imdb_id)
    imdb_links.append(imdb_link)

In [71]:
# assemble the page-one lists into a frame; keyword order fixes the column order
test_df = pd.DataFrame(dict(
    year=years,
    title=movie_titles,
    score=test_scores,
    passing=pass_or_no_pass,
    imdb_id=imdb_ids,
    imdb_link=imdb_links,
))

In [72]:
test_df.head()

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
0,2019,Alita: Battle Angel,3,pass,tt0437086,http://us.imdb.com/title/tt0437086/
1,2019,American Woman,3,pass,tt9109492,http://us.imdb.com/title/tt9109492/
2,2019,Close,3,pass,tt5316540,http://us.imdb.com/title/tt5316540/
3,2019,Cold Pursuit,1,nopass,tt5719748,http://us.imdb.com/title/tt5719748/
4,2019,How to Train Your Dragon: The Hidden World,2,nopass,tt2386490,http://us.imdb.com/title/tt2386490/


In [73]:
test_df.tail()

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
195,2017,Call Me by Your Name,3,pass,tt5726616,http://us.imdb.com/title/tt5726616/
196,2017,Canaries,3,pass,tt4082644,http://us.imdb.com/title/tt4082644/
197,2017,Captain Underpants: The First Epic Movie,1,nopass,tt2091256,http://us.imdb.com/title/tt2091256/
198,2017,Cars 3,3,pass,tt3606752,http://us.imdb.com/title/tt3606752/
199,2017,Cherry Pop,3,pass,tt4807950,http://us.imdb.com/title/tt4807950/


In [74]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
year         200 non-null int64
title        200 non-null object
score        200 non-null int64
passing      200 non-null object
imdb_id      200 non-null object
imdb_link    200 non-null object
dtypes: int64(2), object(4)
memory usage: 9.5+ KB


#### B. Scrape Entire Site

In [82]:
from time import sleep, time
from random import randint
from requests import get
from warnings import warn

# warn("Warning Simulation")

In [77]:
# page format - first page was page 0 I guess
# https://bechdeltest.com/?page=1
# last page is page 40
# use clear_output(wait=True) at end of loop if scraping something really big
# from IPython.display import clear_output

In [80]:
# Dry run of the pacing/monitoring logic only: no page is requested here.
# (the real crawl covers pages = [str(i) for i in range(1,41)], i.e. range(41) requests)

# reset lists
years, movie_titles, test_scores = [], [], []
pass_or_no_pass, imdb_ids, imdb_links = [], [], []

start = time()
requests = 0

# five iterations are enough to check the reported rate
for r in range(5):
    # request would go here

    requests += 1

    # random pause between 1 and 5 seconds, then measure the running request rate
    sleep(randint(1,5))
    elapsed = time() - start
    rate = round(requests/elapsed, 3)

    print(f'Request: {requests} Frequency: {rate} requests/second')

Request: 1 Frequency: 0.333 requests/second
Request: 2 Frequency: 0.499 requests/second
Request: 3 Frequency: 0.374 requests/second
Request: 4 Frequency: 0.363 requests/second
Request: 5 Frequency: 0.357 requests/second


In [83]:
# Crawl every listing page of bechdeltest.com and fill six parallel lists, one per
# column of the final DataFrame.

# reset lists

years = []
movie_titles = []
test_scores = []
pass_or_no_pass = []
imdb_ids = []
imdb_links = []

start = time()
requests = 0

base_url = 'https://bechdeltest.com/'

# the first page lives at the bare base URL; pages 1-40 use the ?page=N query string
pages = [str(i) for i in range(1,41)]
page_urls = [base_url] + [base_url + '?page=' + page for page in pages]


def parse_movie(movie):
    """Extract one row of data from a <div class="movie"> tag.

    Returns a tuple (year, title, test_score, passing, imdb_id, imdb_link):
      * year       - int, second '-'-separated piece of the anchor id inside the
                     nearest preceding <h3> heading
      * title      - stripped text of the tag's third child (the movie link)
      * test_score - int, third character of the badge image's alt text ('[[3]]')
      * passing    - badge image file name without extension ('pass' / 'nopass')
      * imdb_id    - fifth '/'-separated piece of the IMDb URL
      * imdb_link  - href of the tag's first link
    """
    badge = movie.a.img
    imdb_link = movie.a['href']
    return (
        int(movie.find_previous('h3').a['id'].split('-')[1]),
        movie.contents[2].text.strip(),
        int(badge['alt'][2]),
        badge['src'].split('/')[2].split('.')[0],
        imdb_link.split('/')[4],
        imdb_link,
    )


# The URL list is finite (41 entries), so no separate request-count guard is needed.
for page_url in page_urls:

    # a timeout keeps one stalled connection from hanging the whole crawl
    response = get(page_url, timeout=30)

    # I swear I am not a robot beep boop beep
    sleep(randint(8,15))

    requests += 1
    elapsed = time() - start
    print(f'Request: {requests} Frequency: {round(requests/elapsed, 3)} requests/second')

    # warning for non-200 status codes; skip the page rather than parse an error body
    if response.status_code != 200:
        warn(f'Request: {requests} Status code: {response.status_code}')
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    movies = soup.find_all('div', {'class': 'movie'})

    for movie in movies:
        year, title, test_score, passing, imdb_id, imdb_link = parse_movie(movie)

        years.append(year)
        movie_titles.append(title)
        test_scores.append(test_score)
        pass_or_no_pass.append(passing)
        imdb_ids.append(imdb_id)
        imdb_links.append(imdb_link)

Request: 1 Frequency: 0.065 requests/second
Request: 2 Frequency: 0.064 requests/second
Request: 3 Frequency: 0.062 requests/second
Request: 4 Frequency: 0.067 requests/second
Request: 5 Frequency: 0.068 requests/second
Request: 6 Frequency: 0.067 requests/second
Request: 7 Frequency: 0.068 requests/second
Request: 8 Frequency: 0.066 requests/second
Request: 9 Frequency: 0.066 requests/second
Request: 10 Frequency: 0.065 requests/second
Request: 11 Frequency: 0.064 requests/second
Request: 12 Frequency: 0.064 requests/second
Request: 13 Frequency: 0.064 requests/second
Request: 14 Frequency: 0.064 requests/second
Request: 15 Frequency: 0.064 requests/second
Request: 16 Frequency: 0.065 requests/second
Request: 17 Frequency: 0.066 requests/second
Request: 18 Frequency: 0.065 requests/second
Request: 19 Frequency: 0.065 requests/second
Request: 20 Frequency: 0.065 requests/second
Request: 21 Frequency: 0.065 requests/second
Request: 22 Frequency: 0.066 requests/second
Request: 23 Frequen

In [None]:
# add total elapsed time for next site

#### C. Store in pandas DataFrame

In [84]:
# one column per scraped list; keyword order fixes the column order
bechdel_df = pd.DataFrame(dict(
    year=years,
    title=movie_titles,
    score=test_scores,
    passing=pass_or_no_pass,
    imdb_id=imdb_ids,
    imdb_link=imdb_links,
))

In [85]:
bechdel_df.head(15)

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
0,2019,Alita: Battle Angel,3,pass,tt0437086,http://us.imdb.com/title/tt0437086/
1,2019,American Woman,3,pass,tt9109492,http://us.imdb.com/title/tt9109492/
2,2019,Close,3,pass,tt5316540,http://us.imdb.com/title/tt5316540/
3,2019,Cold Pursuit,1,nopass,tt5719748,http://us.imdb.com/title/tt5719748/
4,2019,How to Train Your Dragon: The Hidden World,2,nopass,tt2386490,http://us.imdb.com/title/tt2386490/
5,2019,The Kid Who Would Be King,1,nopass,tt6811018,http://us.imdb.com/title/tt6811018/
6,2019,The Lego Movie 2: The Second Part,3,pass,tt3513498,http://us.imdb.com/title/tt3513498/
7,2019,Polar,2,nopass,tt4139588,http://us.imdb.com/title/tt4139588/
8,2019,Serenity,1,nopass,tt6476140,http://us.imdb.com/title/tt6476140/
9,2019,Soni,3,pass,tt6078866,http://us.imdb.com/title/tt6078866/


In [86]:
bechdel_df.tail()

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
8047,1896,Une nuit terrible,0,nopass,tt0000131,http://us.imdb.com/title/tt0000131/
8048,1895,"The Execution of Mary, Queen of Scots",0,nopass,tt0132134,http://us.imdb.com/title/tt0132134/
8049,1895,Tables Turned on the Gardener,0,nopass,tt0000014,http://us.imdb.com/title/tt0000014/
8050,1892,Pauvre Pierrot,0,nopass,tt0000003,http://us.imdb.com/title/tt0000003/
8051,1888,Roundhay Garden Scene,0,nopass,tt0392728,http://us.imdb.com/title/tt0392728/


In [87]:
bechdel_df.shape

(8052, 6)

In [88]:
bechdel_df.to_csv('my_data/bechdel_movies.csv', index=False)

In [None]:
# aw man I forgot to change the passing column to binary

#### D. Convert `passing` values to binary

In [89]:
# Recode 'nopass'/'pass' as 0/1 and assign the result back to the column. Calling an
# inplace method on `bechdel_df.passing` is chained assignment: it may act on a
# temporary Series, and under pandas Copy-on-Write it leaves the frame unchanged.
# replace() passes already-converted 0/1 values through, so re-running the cell is
# safe; astype(int) guarantees an integer column and raises on an unexpected label.
bechdel_df['passing'] = bechdel_df['passing'].replace({'nopass': 0, 'pass': 1}).astype(int)

In [90]:
bechdel_df.head()

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
0,2019,Alita: Battle Angel,3,1,tt0437086,http://us.imdb.com/title/tt0437086/
1,2019,American Woman,3,1,tt9109492,http://us.imdb.com/title/tt9109492/
2,2019,Close,3,1,tt5316540,http://us.imdb.com/title/tt5316540/
3,2019,Cold Pursuit,1,0,tt5719748,http://us.imdb.com/title/tt5719748/
4,2019,How to Train Your Dragon: The Hidden World,2,0,tt2386490,http://us.imdb.com/title/tt2386490/


#### E. Find and Drop Duplicate Entries

In [95]:
dupes = bechdel_df.duplicated()

In [99]:
# print the positional index of every row flagged as a duplicate of an earlier row
for position in (pos for pos, flagged in enumerate(dupes) if flagged == True):
    print(position)

2316
5220


In [101]:
bechdel_df.iloc[2316]

year                                        2011
title                     Last Call at the Oasis
score                                          3
passing                                        1
imdb_id                                tt2043900
imdb_link    http://us.imdb.com/title/tt2043900/
Name: 2316, dtype: object

In [102]:
bechdel_df.iloc[5220]

year                                        1997
title                                      Ayneh
score                                          3
passing                                        1
imdb_id                                tt0117056
imdb_link    http://us.imdb.com/title/tt0117056/
Name: 5220, dtype: object

In [103]:
bechdel_df.loc[bechdel_df.imdb_id == 'tt2043900']

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
2315,2011,Last Call at the Oasis,3,1,tt2043900,http://us.imdb.com/title/tt2043900/
2316,2011,Last Call at the Oasis,3,1,tt2043900,http://us.imdb.com/title/tt2043900/


In [104]:
bechdel_df.loc[bechdel_df.imdb_id == 'tt0117056']

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
5219,1997,Ayneh,3,1,tt0117056,http://us.imdb.com/title/tt0117056/
5220,1997,Ayneh,3,1,tt0117056,http://us.imdb.com/title/tt0117056/


In [105]:
bechdel_df.drop_duplicates(inplace=True)

In [106]:
bechdel_df.shape

(8050, 6)

In [107]:
bechdel_df.duplicated().sum()

0

#### F. Save to `.csv` file

In [108]:
bechdel_df.to_csv('my_data/bechdel_test_movies.csv', index=False)

## II. IMDB 

In [2]:
df = pd.read_csv('my_data/bechdel_test_movies.csv')
df.head()

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
0,2019,Alita: Battle Angel,3,1,tt0437086,http://us.imdb.com/title/tt0437086/
1,2019,American Woman,3,1,tt9109492,http://us.imdb.com/title/tt9109492/
2,2019,Close,3,1,tt5316540,http://us.imdb.com/title/tt5316540/
3,2019,Cold Pursuit,1,0,tt5719748,http://us.imdb.com/title/tt5719748/
4,2019,How to Train Your Dragon: The Hidden World,2,0,tt2386490,http://us.imdb.com/title/tt2386490/


#### A. Grab attributes from one movie: Hurt Locker

In [5]:
df.loc[df.title == 'The Hurt Locker']

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
3275,2008,The Hurt Locker,1,0,tt0887912,http://us.imdb.com/title/tt0887912/


In [6]:
from datetime import datetime

In [8]:
start = datetime.now()
sleep(5)
elapsed = datetime.now() - start
print(elapsed)

0:00:05.002150


In [10]:
# Look the link up by title instead of a hard-coded row position (3275), which
# points at a different movie as soon as the scraped list changes.
test_url = df.loc[df.title == 'The Hurt Locker', 'imdb_link'].iloc[0]

# a timeout keeps a stalled connection from hanging the kernel
response = get(test_url, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')

In [39]:
# soup.findAll('div', {'class': 'main', 'id': 'main_top'})

In [None]:
# imdb is not going to like 8000+ requests from me. . . I think wikipedia might be a better bet and seems to
# contain the same information

## III. Wikipedia

In [12]:
df.loc[df.title == 'The Hurt Locker']

Unnamed: 0,year,title,score,passing,imdb_id,imdb_link
3275,2008,The Hurt Locker,1,0,tt0887912,http://us.imdb.com/title/tt0887912/


In [13]:
test_url = 'https://en.wikipedia.org/wiki/The_Hurt_Locker'
response = get(test_url)
soup = BeautifulSoup(response.text, 'html.parser')  

In [16]:
info_box = soup.findAll('table', {'class': 'infobox vevent'})
info_box

[<table class="infobox vevent" style="width:22em;font-size:90%;"><tbody><tr><th class="summary" colspan="2" style="text-align:center;font-size:125%;font-weight:bold;font-size:110%;font-style:italic;">The Hurt Locker</th></tr><tr><td colspan="2" style="text-align:center"><a class="image" href="/wiki/File:HLposterUSA2.jpg" title="From above a flat. and dry desert floor, a person in a green military uniform with heavy padding holds red wires attached to seven pill-shaped bomb canisters scattered around him. At the top of the poster are three critics' favorable opinions: &quot;A near-perfect movie&quot;, &quot;A full-tilt action picture&quot;, and &quot;Ferociously suspenseful&quot;. Below the quotes is the title &quot;THE HURT LOCKER&quot; and the tagline, &quot;You don't have to be a hero to do this job. But it helps.&quot;"><img alt="From above a flat. and dry desert floor, a person in a green military uniform with heavy padding holds red wires attached to seven pill-shaped bomb caniste

In [52]:
info_box[0].findAll('tr', {'class': 'thumbborder'})

[]

In [54]:
import wptools

In [57]:
p = wptools.page('The Hurt Locker').get_parse()

en.wikipedia.org (parse) The Hurt Locker
en.wikipedia.org (imageinfo) File:HLposterUSA2.jpg
The Hurt Locker (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:HLposterU...
  infobox: <dict(18)> name, image, alt, caption, director, produce...
  iwlinks: <list(2)> https://en.wikiquote.org/wiki/Special:Search/...
  pageid: 16733548
  parsetree: <str(93950)> <root><template><title>Use mdy dates</ti...
  requests: <list(2)> parse, imageinfo
  title: The Hurt Locker
  wikibase: Q183066
  wikidata_url: https://www.wikidata.org/wiki/Q183066
  wikitext: <str(76614)> {{Use mdy dates|date=August 2017}}{{Infob...
}


In [72]:
p.data['infobox']

{'name': 'The Hurt Locker',
 'image': 'HLposterUSA2.jpg',
 'alt': 'From above a flat. and dry desert floor, a person in a green military uniform with heavy padding holds red wires attached to seven pill-shaped bomb canisters scattered around him. At the top of the poster are three critics\' favorable opinions: "A near-perfect movie", "A full-tilt action picture", and "Ferociously suspenseful". Below the quotes is the title "THE HURT LOCKER" and the tagline, "You don\'t have to be a hero to do this job. But it helps."',
 'caption': 'Theatrical release poster',
 'director': '[[Kathryn Bigelow]]',
 'producer': '{{plainlist|\n* Kathryn Bigelow\n* [[Mark Boal]]\n* [[Nicolas Chartier]]\n* Greg Shapiro}}',
 'writer': 'Mark Boal',
 'starring': '{{plainlist|\n* [[Jeremy Renner]]\n* [[Anthony Mackie]]\n* [[Brian Geraghty]]\n* [[Evangeline Lilly]]\n* [[Ralph Fiennes]]\n* [[David Morse (actor)|David Morse]]\n* [[Guy Pearce]]}}',
 'cinematography': '[[Barry Ackroyd]]',
 'editing': '{{plainlist|\n* 

In [74]:
p.get_wikidata()

www.wikidata.org (wikidata) Q183066
www.wikidata.org (labels) P345|P3107|P373|Q369747|Q1860|Q34816|P1...
www.wikidata.org (labels) Q465754|Q20644797|Q22006653|P31|P840|P4...
www.wikidata.org (labels) Q1077361|P3808|P344|Q3990883|P1552|P452...
en.wikipedia.org (imageinfo) File:The hurt locker.svg
The Hurt Locker (en) data
{
  aliases: <list(1)> Hurt Locker
  claims: <dict(83)> P905, P57, P31, P345, P480, P373, P161, P58, ...
  description: 2008 American war film directed by Kathryn Bigelow
  image: <list(2)> {'kind': 'parse-image', 'file': 'File:HLposterU...
  infobox: <dict(18)> name, image, alt, caption, director, produce...
  iwlinks: <list(2)> https://en.wikiquote.org/wiki/Special:Search/...
  label: The Hurt Locker
  labels: <dict(127)> P345, P3107, P373, Q369747, Q1860, Q34816, P...
  modified: <dict(1)> wikidata
  pageid: 16733548
  parsetree: <str(93950)> <root><template><title>Use mdy dates</ti...
  requests: <list(7)> parse, imageinfo, wikidata, labels, labels, ...
  title: Th

<wptools.page.WPToolsPage at 0x11c889208>

In [75]:
p.data['wikidata']

{'PORT film ID (P905)': '96796',
 'director (P57)': 'Kathryn Bigelow (Q34816)',
 'instance of (P31)': 'film (Q11424)',
 'IMDb ID (P345)': 'tt0887912',
 'FilmAffinity ID (P480)': '588031',
 'Commons category (P373)': 'The Hurt Locker',
 'cast member (P161)': ['Jeremy Renner (Q23365)',
  'Anthony Mackie (Q511554)',
  'Brian Geraghty (Q460563)',
  'Christian Camargo (Q456291)',
  'Evangeline Lilly (Q160392)',
  'Ralph Fiennes (Q28493)',
  'Guy Pearce (Q223745)',
  'David Morse (Q296370)',
  'Malcolm Barrett (Q3330735)'],
 'screenwriter (P58)': 'Mark Boal (Q370765)',
 'producer (P162)': ['Kathryn Bigelow (Q34816)',
  'Mark Boal (Q370765)',
  'Nicolas Chartier (Q7029566)'],
 'production company (P272)': ['Summit Entertainment (Q632323)',
  'Grosvenor Park Productions (Q5610650)'],
 'director of photography (P344)': 'Barry Ackroyd (Q790767)',
 'original language of film or TV show (P364)': 'English (Q1860)',
 'genre (P136)': ['action thriller (Q3990883)',
  'war film (Q369747)',
  'independe

In [79]:
c_p = wptools.page('Cold Pursuit').get_parse()

en.wikipedia.org (parse) Cold Pursuit
en.wikipedia.org (imageinfo) File:Cold Pursuit poster.jpg
Cold Pursuit (en) data
{
  image: <list(1)> {'kind': 'parse-image', 'file': 'File:Cold Purs...
  infobox: <dict(20)> name, image, border, caption, director, prod...
  pageid: 58169961
  parsetree: <str(28152)> <root><template><title>Infobox film</tit...
  requests: <list(2)> parse, imageinfo
  title: Cold Pursuit
  wikibase: Q28912877
  wikidata_url: https://www.wikidata.org/wiki/Q28912877
  wikitext: <str(22790)> {{Infobox film| name           = Cold Pur...
}


In [81]:
c_p.data['infobox']

{'name': 'Cold Pursuit',
 'image': 'Cold Pursuit poster.jpg',
 'border': 'no',
 'caption': 'Theatrical release poster',
 'director': '[[Hans Petter Moland]]',
 'producer': '{{Plainlist|\n* Finn Gjerdrum\n* Stein B. Kvae\n* [[Michael Shamberg]]\n* Ameet Shukla}}',
 'writer': 'Frank Baldwin',
 'based on': "{{Based on|''[[In Order of Disappearance]]''|[[Kim Fupz Aakeson]]}}",
 'starring': '{{Plainlist|\n* [[Liam Neeson]] \n* [[Tom Bateman (actor)|Tom Bateman]]\n* [[Tom Jackson (actor)|Tom Jackson]]\n* [[Emmy Rossum]]\n* [[Domenick Lombardozzi]] \n* [[Julia Jones]]\n* [[John Doman]]\n* [[Laura Dern]]|<!--PER BILLING ORDER-->|}}',
 'music': '[[George Fenton]]',
 'cinematography': '[[Philip Øgaard]]',
 'editing': 'Nicolaj Monberg',
 'production companies': '{{Plainlist|\n* [[StudioCanal]]\n* [[Summit Entertainment]]}}',
 'distributor': 'Summit Entertainment',
 'released': '{{Film date|2019|2|8|United States}}',
 'runtime': '118 minutes',
 'country': 'United States',
 'language': 'English',
 

In [83]:
cp_actors = c_p.data['infobox']['starring']

In [84]:
cp_actors

'{{Plainlist|\n* [[Liam Neeson]] \n* [[Tom Bateman (actor)|Tom Bateman]]\n* [[Tom Jackson (actor)|Tom Jackson]]\n* [[Emmy Rossum]]\n* [[Domenick Lombardozzi]] \n* [[Julia Jones]]\n* [[John Doman]]\n* [[Laura Dern]]|<!--PER BILLING ORDER-->|}}'

In [85]:
cp_actors.split('\n*')

['{{Plainlist|',
 ' [[Liam Neeson]] ',
 ' [[Tom Bateman (actor)|Tom Bateman]]',
 ' [[Tom Jackson (actor)|Tom Jackson]]',
 ' [[Emmy Rossum]]',
 ' [[Domenick Lombardozzi]] ',
 ' [[Julia Jones]]',
 ' [[John Doman]]',
 ' [[Laura Dern]]|<!--PER BILLING ORDER-->|}}']

In [88]:
def parse_plainlist(wikitext):
    """Return the display names from an infobox field written as a wikitext bullet list.

    `wikitext` is a string such as
    '{{Plainlist|\\n* [[Liam Neeson]]\\n* [[Tom Bateman (actor)|Tom Bateman]]}}'.
    Each '\\n*' bullet becomes one name: for a [[Target|Label]] link the label is kept,
    for a [[Target]] link the target, and for an unlinked entry the bare text up to any
    '|' or closing '}}'. A value without bullets is treated as a single entry.
    Other templates (e.g. {{Based on|...}}) are not interpreted.
    """
    items = wikitext.split('\n*')
    if len(items) > 1:
        # the piece before the first bullet is just the template opener
        items = items[1:]

    names = []
    for item in items:
        if '[[' in item:
            # text between the first '[[' and the following ']]'
            link = item.split('[[', 1)[1].split(']]', 1)[0]
            names.append(link.split('|')[-1].strip())
        else:
            names.append(item.split('}}', 1)[0].split('|', 1)[0].strip())
    return names


# str.split returns a list, so chaining a second .split on it raises AttributeError;
# parse the bullets one by one instead.
parse_plainlist(cp_actors)