In [1]:
# all imports
from IPython.display import HTML
import numpy as np
import urllib.request as urllib2
import bs4 # beautiful soup
import time 
import re # regular expressions

import pandas as pd
from pandas import Series
from pandas import DataFrame

import matplotlib
import matplotlib.pyplot as plt 
%matplotlib inline


## Data Scraping
Data scraping is about obtaining data from webpages. There is low-level scraping, where you parse the data out of the HTML code of the webpage. There is also scraping via APIs, offered by websites that try to make your life a bit easier.

### Read the user data

In [2]:
# pass in column names for each CSV
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

users = pd.read_csv('resources/ml-100k/u.user', sep='|',names=u_cols)

users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Read the ratings

In [3]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings = pd.read_csv('resources/ml-100k/u.data', sep='\t',names=r_cols)

ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### Now data about the movies

In [4]:
# the movies file contains columns indicating the movies's genres
# let's only load the first 5 columns of the file with usecols
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']

movies = pd.read_csv('resources/ml-100k/u.item', sep='|',names=m_cols, usecols=range(5), encoding='latin-1')

movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)


### Get information about the data

In [5]:
# column dtypes first, then summary statistics of the numeric columns
print(movies.dtypes)
print()  # blank separator line; a bare `print` is a no-op expression in Python 3
print(movies.describe())

movie_id                int64
title                  object
release_date           object
video_release_date    float64
imdb_url               object
dtype: object
          movie_id  video_release_date
count  1682.000000                 0.0
mean    841.500000                 NaN
std     485.695893                 NaN
min       1.000000                 NaN
25%     421.250000                 NaN
50%     841.500000                 NaN
75%    1261.750000                 NaN
max    1682.000000                 NaN


### Selecting data
- DataFrame --> group of Series with shared index
- single DataFrame column --> Series

In [6]:
print(users.head())
print('----------------')

print(users['occupation'].head())
print('----------------')

selected_columns = ['occupation', 'sex']
print(users[selected_columns].head())
print('-----------------')

print(users.iloc[3])

user_id  age sex  occupation zip_code
0        1   24   M  technician    85711
1        2   53   F       other    94043
2        3   23   M      writer    32067
3        4   24   M  technician    43537
4        5   33   F       other    15213
----------------
0    technician
1         other
2        writer
3    technician
4         other
Name: occupation, dtype: object
----------------
   occupation sex
0  technician   M
1       other   F
2      writer   M
3  technician   M
4       other   F
-----------------
user_id                4
age                   24
sex                    M
occupation    technician
zip_code           43537
Name: 3, dtype: object


### Filtering data

In [7]:
# select users older than 25
old_users = users[users.age > 25]
old_users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
1,2,53,F,other,94043
4,5,33,F,other,15213
5,6,42,M,executive,98101
6,7,57,M,administrator,91344
7,8,36,M,administrator,5201


In [8]:
# users aged 40 and male
users[(users.age == 40) & (users.sex == 'M')].head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
18,19,40,M,librarian,2138
82,83,40,M,other,44133
115,116,40,M,healthcare,97232
199,200,40,M,programmer,93402
283,284,40,M,executive,92629


In [9]:
# users who are female and programmers
selected_users = users[(users.sex == 'F') & (users.occupation == 'programmer')]

# show statistic summary
print(selected_users.describe())

# alternatives:
print(selected_users.age.mean())
print(selected_users['age'].mean())

user_id        age
count    6.000000   6.000000
mean   411.166667  32.166667
std    149.987222   5.115336
min    292.000000  26.000000
25%    313.000000  28.250000
50%    378.000000  32.000000
75%    416.750000  36.500000
max    698.000000  38.000000
32.166666666666664
32.166666666666664


### Split-apply-combine
- splitting the data into groups based on some criteria
- applying a function to each group independently
- combining the results into a data structure
![Split-apply-combine](resources/split-apply-combine.png)

### Find diligent users
- split data per user ID
- count ratings
- combine result

In [10]:
print(ratings.head())

# split the data: group the movie_id column by the user who gave the rating
# (alternative that groups the whole frame: ratings.groupby('user_id'))
grouped_data = ratings['movie_id'].groupby(ratings['user_id'])

# count and combine: one count of rated movies per user_id
ratings_per_user = grouped_data.count()

ratings_per_user.head()

user_id  movie_id  rating  unix_timestamp
0      196       242       3       881250949
1      186       302       3       891717742
2       22       377       1       878887116
3      244        51       2       880606923
4      166       346       1       886397596


user_id
1    272
2     62
3     54
4     24
5    175
Name: movie_id, dtype: int64

In [11]:
# get the average rating per movie

# split the data
grouped_data = ratings['rating'].groupby(ratings['movie_id'])
# average and combine
average_ratings = grouped_data.mean()

print("Average ratings:")
print(average_ratings.head())

Average ratings:
movie_id
1    3.878319
2    3.206107
3    3.033333
4    3.550239
5    3.302326
Name: rating, dtype: float64


In [12]:
max_rating = average_ratings.max()
good_movie_ids = average_ratings[average_ratings == max_rating].index

print("Good movie ids:")
print(good_movie_ids)
print()

print("Best movie titles:")
print(movies[movies.movie_id.isin(good_movie_ids)].title)
print()

Good movie ids:
Int64Index([814, 1122, 1189, 1201, 1293, 1467, 1500, 1536, 1599, 1653], dtype='int64', name='movie_id')

Best movie titles:
813                         Great Day in Harlem, A (1994)
1121                       They Made Me a Criminal (1939)
1188                                   Prefontaine (1997)
1200           Marlene Dietrich: Shadow and Light (1996) 
1292                                      Star Kid (1997)
1466                 Saint of Fort Washington, The (1993)
1499                            Santa with Muscles (1996)
1535                                 Aiqing wansui (1994)
1598                        Someone Else's America (1995)
1652    Entertaining Angels: The Dorothy Day Story (1996)
Name: title, dtype: object



In [13]:
how_many_ratings = grouped_data.count()
print("Number of ratings per movie")
print(how_many_ratings[average_ratings == max_rating])

Number of ratings per movie
movie_id
814     1
1122    1
1189    3
1201    1
1293    3
1467    2
1500    2
1536    1
1599    1
1653    1
Name: rating, dtype: int64


### Passing a function

In [14]:
average_ratings = grouped_data.apply(lambda f: f.mean())
average_ratings.head()

movie_id
1    3.878319
2    3.206107
3    3.033333
4    3.550239
5    3.302326
Name: rating, dtype: float64

In [15]:
# get the average rating per user
grouped_data = ratings['rating'].groupby(ratings['user_id'])
average_ratings = grouped_data.mean()
average_ratings.head()

user_id
1    3.610294
2    3.709677
3    2.796296
4    4.333333
5    2.874286
Name: rating, dtype: float64

In [16]:
# list all occupations and if they are male or female dominant
grouped_data = users['sex'].groupby(users['occupation'])

# a boolean mask sums to its number of True entries; use the vectorised
# Series.sum() rather than Python's built-in sum(), which loops element by element
male_dominant_occupations = grouped_data.apply(
    lambda f: (f == 'M').sum() > (f == 'F').sum()
)

print(male_dominant_occupations)
print('\n')

occupation
administrator     True
artist            True
doctor            True
educator          True
engineer          True
entertainment     True
executive         True
healthcare       False
homemaker        False
lawyer            True
librarian        False
marketing         True
none              True
other             True
programmer        True
retired           True
salesman          True
scientist         True
student           True
technician        True
writer            True
Name: sex, dtype: bool




In [17]:
# a boolean mask sums to the number of rows that match the condition;
# Series.sum() does this vectorised instead of looping in Python
print("number of male users:")
print((users['sex'] == 'M').sum())

print("number of female users")
print((users['sex'] == 'F').sum())

number of male users:
670
number of female users
273


### Python data scraping
Why scrape the web?
- vast source of information
- automate tasks
- keep up with sites
Some examples:
- stock market monitoring
- sports data
- airline prices
- amazon, twitter, indeed, rikunabi, mynabi, etc...

Copyrights and permission:
- be careful and polite
- give credit 
- care about media law
- no spam, overloading sites, etc

### Robots.txt
- specified by web site owner
- gives instructions to web robots(your script)
- is located at the top-level directory of the web server

For example:

[http://www.example.com/robots.txt](http://www.example.com/robots.txt)

[http://google.com/robots.txt](http://google.com/robots.txt)

### Scraping with Python
- scraping is all about HTML tags
- bad news:
    - need to learn about tags
    - websites can be very ugly

### HTML
- HyperText Markup Language
- standard for creating webpages
- HTML tags
    - have angle brackets
    - typically come in pairs



In [18]:
htmlString = """
<html>
    <head>
        <title>This is a title</title>
    </head>
    <body>
        <h2>Test</h2>
        <p>Hello world!</p>
    </body>
</html>"""

htmlOutput = HTML(htmlString)
htmlOutput

### Useful tags
- heading `<h1></h1> ... <h6></h6>`
- paragraph `<p></p>`
- line break `<br>`
- link with attribute `<a href="http://www.example.com/">An example link</a>`

### Scraping with Python
- example of a beautiful simple webpage:
[http://www.crummy.com/software/BeautifulSoup](http://www.crummy.com/software/BeautifulSoup)

- different useful libraries:
    - urllib
    - beautifulsoup 
    - pattern
    - soupy 
    - LXML
    - ...

The following cell defines a url as a string and then reads the data from that url using the `urllib` library.

In [19]:
url = 'http://www.crummy.com/software/BeautifulSoup'

# the `with` block closes the HTTP connection as soon as the page is read;
# read() returns raw bytes, so decode them into a regular (unicode) string
with urllib2.urlopen(url) as response:
    source = response.read().decode('utf-8')

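In Python 3, strings (`str`) are Unicode text, whereas data read from the network arrives as raw `bytes`.

This means that `urlopen(url).read()` returns a byte string and not text, which is why we call `.decode('utf-8')` on it to get a regular string.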

If we print we see that we got the whole HTML content of the page into the string variable `source`.

In [20]:
print(source)

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<link rel="stylesheet" type="text/css" href="/nb/themes/Default/nb.css">
<meta name="Description" content="Beautiful Soup: a library designed for screen-scraping HTML and XML.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body bgcolor="white" text="black" link="blue" vlink="660066" alink="red">
<style>
#tidelift { }

#tidelift a {
 border: 1px solid #666666;
 margin-left: auto;
 padding: 10px;
 text-decoration: none;
}

#tidelift .cta {
 background: url("tidelift.svg") no-repeat;
 padding-left: 30px;
}
</style>		   

<img align="right" src="10.1.jpg" width="250"><br />

<p>[

### Questions:
- Is the word 'Alice' mentioned on the beautiful soup homepage?
- How often does the word 'Soup' occur on the site?
    - hint: use `.count()`
- At what index occurs the substring 'alien video games'?
    - hint: use `.find()`

In [21]:
# is 'Alice' in source?
print('Alice' in source)

False


In [22]:
# count occurences of 'Soup'
print(source.count('Soup'))

50


In [23]:
# find index of 'alien video games'
position = source.find('alien video games')
print(position)

-1


In [24]:
# find index of the first occurrence of 'Beautiful Soup'
# (find() gave -1 above for a substring that is not in the page)
position = source.find('Beautiful Soup')
print(position)

203


In [25]:
# quick test to see the substring in the source variable
# you can access strings like lists
print(source[position:position+20])

Beautiful Soup: We c


In [26]:
# tidier version
print(source[position:position+len('Beautiful Soup')])

Beautiful Soup


## Beautiful Soup
- designed to make your life easier
- many good functions for parsing html code

## Some examples

In [27]:
# get bs4 object; name the parser explicitly so the tree is built the same way
# on every machine (otherwise bs4 picks whichever parser happens to be installed
# and emits a warning about the guess)
soup = bs4.BeautifulSoup(source, 'html.parser')

In [28]:
# compare the two print statements
print(soup)

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">

<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link href="mailto:leonardr@segfault.org" rev="made"/>
<link href="/nb/themes/Default/nb.css" rel="stylesheet" type="text/css"/>
<meta content="Beautiful Soup: a library designed for screen-scraping HTML and XML." name="Description"/>
<meta content="Markov Approximation 1.4 (module: leonardr)" name="generator"/>
<meta content="Leonard Richardson" name="author"/>
</head>
<body alink="red" bgcolor="white" link="blue" text="black" vlink="660066">
<style>
#tidelift { }

#tidelift a {
 border: 1px solid #666666;
 margin-left: auto;
 padding: 10px;
 text-decoration: none;
}

#tidelift .cta {
 background: url("tidelift.svg") no-repeat;
 padding-left: 30px;
}
</style>
<img align="right" src="10.1.jpg" width="250"/><br/>
<p>[

In [29]:
print(soup.prettify())

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <title>
   Beautiful Soup: We called him Tortoise because he taught us.
  </title>
  <link href="mailto:leonardr@segfault.org" rev="made"/>
  <link href="/nb/themes/Default/nb.css" rel="stylesheet" type="text/css"/>
  <meta content="Beautiful Soup: a library designed for screen-scraping HTML and XML." name="Description"/>
  <meta content="Markov Approximation 1.4 (module: leonardr)" name="generator"/>
  <meta content="Leonard Richardson" name="author"/>
 </head>
 <body alink="red" bgcolor="white" link="blue" text="black" vlink="660066">
  <style>
   #tidelift { }

#tidelift a {
 border: 1px solid #666666;
 margin-left: auto;
 padding: 10px;
 text-decoration: none;
}

#tidelift .cta {
 background: url("tidelift.svg") no-repeat;
 padding-left: 30px;
}
  </style>
  <img align="right" src="1

In [30]:
# show how to find all 'a' tags
soup.findAll('a')

[<a href="#Download">Download</a>,
 <a href="bs4/doc/">Documentation</a>,
 <a href="#HallOfFame">Hall of Fame</a>,
 <a href="enterprise.html">For enterprise</a>,
 <a href="https://code.launchpad.net/beautifulsoup">Source</a>,
 <a href="https://bazaar.launchpad.net/%7Eleonardr/beautifulsoup/bs4/view/head:/CHANGELOG">Changelog</a>,
 <a href="https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup">Discussion group</a>,
 <a href="zine/">Zine</a>,
 <a href="bs4/download/"><h1>Beautiful Soup</h1></a>,
 <a href="http://lxml.de/">lxml</a>,
 <a href="http://code.google.com/p/html5lib/">html5lib</a>,
 <a href="bs4/doc/">Read more.</a>,
 <a href="https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&amp;utm_medium=referral&amp;utm_campaign=enterprise" target="_blank">
 <span class="cta">
   Beautiful Soup for enterprise available via Tidelift
  </span>
 </a>,
 <a href="https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup">the discussion
 gr

Why does this not work?

```python
soup.findAll('Soup')
```

## More examples

In [31]:
# get attribute value from an element
# find tag: this only returns the first occurence, not all tags in the string
first_tag = soup.find('a')
print(first_tag)

<a href="#Download">Download</a>


In [32]:
# get attribute 'href'
first_tag.get('href')

'#Download'

In [33]:
# get all links in the page
link_list = [l.get('href') for l in soup.findAll('a')]
print(link_list)

['#Download', 'bs4/doc/', '#HallOfFame', 'enterprise.html', 'https://code.launchpad.net/beautifulsoup', 'https://bazaar.launchpad.net/%7Eleonardr/beautifulsoup/bs4/view/head:/CHANGELOG', 'https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup', 'zine/', 'bs4/download/', 'http://lxml.de/', 'http://code.google.com/p/html5lib/', 'bs4/doc/', 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=enterprise', 'https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup', 'https://bugs.launchpad.net/beautifulsoup/', 'https://tidelift.com/security', 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=website', 'zine/', None, 'bs4/download/', 'http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html', 'download/3.x/BeautifulSoup-3.2.2.tar.gz', 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup?utm_source=pypi-beautifulsoup&u

In [34]:
# filter all external links
# create an empty list to collect the valid links
external_links = []

# write a loop to filter the links
# if it starts with 'http', it is ok
try:
    for l in link_list:
        if l[:4] == 'http':
            external_links.append(l)
except TypeError as err:
    # this fails! it says something about 'NoneType'.
    # The error is the point of this cell, so show it instead of letting it
    # abort a "Restart & Run All" of the notebook.
    print('TypeError:', err)

TypeError: 'NoneType' object is not subscriptable

In [35]:
# let's investigate. Have a close look at the link_list
link_list

['#Download',
 'bs4/doc/',
 '#HallOfFame',
 'enterprise.html',
 'https://code.launchpad.net/beautifulsoup',
 'https://bazaar.launchpad.net/%7Eleonardr/beautifulsoup/bs4/view/head:/CHANGELOG',
 'https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup',
 'zine/',
 'bs4/download/',
 'http://lxml.de/',
 'http://code.google.com/p/html5lib/',
 'bs4/doc/',
 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=enterprise',
 'https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup',
 'https://bugs.launchpad.net/beautifulsoup/',
 'https://tidelift.com/security',
 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=website',
 'zine/',
 None,
 'bs4/download/',
 'http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html',
 'download/3.x/BeautifulSoup-3.2.2.tar.gz',
 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup?utm_sourc

In [36]:
# seems that there are None elements!
# let's verify 
print(sum([l is None for l in link_list]))

2


In [37]:
# let's filter those objects out in the for loop
external_links = []

# write a loop to filter the links
# if it is not None and starts with 'http' it is ok
for l in link_list:
    # the None check has to come first so that a None entry is never sliced
    if l is not None and l[:4] == 'http':
        external_links.append(l)

external_links

['https://code.launchpad.net/beautifulsoup',
 'https://bazaar.launchpad.net/%7Eleonardr/beautifulsoup/bs4/view/head:/CHANGELOG',
 'https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup',
 'http://lxml.de/',
 'http://code.google.com/p/html5lib/',
 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=enterprise',
 'https://groups.google.com/forum/?fromgroups#!forum/beautifulsoup',
 'https://bugs.launchpad.net/beautifulsoup/',
 'https://tidelift.com/security',
 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=website',
 'http://www.crummy.com/software/BeautifulSoup/bs3/documentation.html',
 'https://tidelift.com/subscription/pkg/pypi-beautifulsoup?utm_source=pypi-beautifulsoup&utm_medium=referral&utm_campaign=website',
 'http://www.nytimes.com/2007/10/25/arts/design/25vide.html',
 'https://github.com/BlankerL/DXY-COVID-19-Crawler',
 'ht

**Note**: The above `if` condition works because of short-circuit (lazy) evaluation in Python. An `and` expression is `False` as soon as its first part is `False`, so the second part is never evaluated. Thus a `None` entry in the list is never asked for its first four characters.

## Parsing the tree

In [38]:
# defining the HTML string 's' without any line breaks
s = """<!DOCTYPE html><html><head><title>This is a title!</title></head><body><h3>Test!</h3><p>Hello World!</p></body></html>"""

In [39]:
# get bs4 object; name the parser explicitly so the tree is built the same way
# on every machine (otherwise bs4 picks whichever parser happens to be installed)
tree = bs4.BeautifulSoup(s, 'html.parser')

# get html root node
root_node = tree.html

# get head from root using contents (first child of <html>)
head = root_node.contents[0]

# get body from root (second child of <html>)
body = root_node.contents[1]

# could directly access head and body
print(tree.head)
print(tree.body)

<head><title>This is a title!</title></head>
<body><h3>Test!</h3><p>Hello World!</p></body>


## Questions
- find the h3 tag by parsing the tree starting at body
- create a list of all **Hall of Fame** entries listed on the BeautifulSoup webpage 
    - hint: it is the only unordered list in the page (tag ul)

In [40]:
# get h3 tag from body
body.contents[0]

<h3>Test!</h3>

In [41]:
# use ul as entry point 
entry_point = soup.find('ul')
print(entry_point)

<ul>
<li><a href="http://www.nytimes.com/2007/10/25/arts/design/25vide.html">"Movable
 Type"</a>, a work of digital art on display in the lobby of the New
 York Times building, uses Beautiful Soup to scrape news feeds.

<li>Jiabao Lin's <a href="https://github.com/BlankerL/DXY-COVID-19-Crawler">DXY-COVID-19-Crawler</a>
uses Beautiful Soup to scrape a Chinese medical site for information
about COVID-19, making it easier for researchers to track the spread
of the virus. (Source: <a href="https://blog.tidelift.com/how-open-source-software-is-fighting-covid-19">"How open source software is fighting COVID-19"</a>)

<li>Reddit uses Beautiful Soup to <a href="https://github.com/reddit/reddit/blob/85f9cff3e2ab9bb8f19b96acd8da4ebacc079f04/r2/r2/lib/media.py">parse
a page that's been linked to and find a representative image</a>.

<li>Alexander Harrowell uses Beautiful Soup to <a href="http://www.harrowell.org.uk/viktormap.html">track the business
 activities</a> of an arms merchant.

<li>The de

In [42]:
# get hall of fame list from entry point 
# skip the first entry
hall_of_fame_list = entry_point.contents[1:]
print(hall_of_fame_list)

[<li><a href="http://www.nytimes.com/2007/10/25/arts/design/25vide.html">"Movable
 Type"</a>, a work of digital art on display in the lobby of the New
 York Times building, uses Beautiful Soup to scrape news feeds.

<li>Jiabao Lin's <a href="https://github.com/BlankerL/DXY-COVID-19-Crawler">DXY-COVID-19-Crawler</a>
uses Beautiful Soup to scrape a Chinese medical site for information
about COVID-19, making it easier for researchers to track the spread
of the virus. (Source: <a href="https://blog.tidelift.com/how-open-source-software-is-fighting-covid-19">"How open source software is fighting COVID-19"</a>)

<li>Reddit uses Beautiful Soup to <a href="https://github.com/reddit/reddit/blob/85f9cff3e2ab9bb8f19b96acd8da4ebacc079f04/r2/r2/lib/media.py">parse
a page that's been linked to and find a representative image</a>.

<li>Alexander Harrowell uses Beautiful Soup to <a href="http://www.harrowell.org.uk/viktormap.html">track the business
 activities</a> of an arms merchant.

<li>The develo

In [43]:
# reformat into a list that holds the children (.contents) of every entry
tmp = []
for li in hall_of_fame_list:
    tmp.append(li.contents)

`tmp` is now actually a list of lists, containing the hall of fame entries. Here is some more advanced Python that prints just one entry per list item:

The cool things about this are:
- the use of "" to just access the `join` function of strings
- the `join` function itself
- that you can actually have 2 nested `for` loops in a list comprehension 

In [44]:
test = ["".join(str(a) for a in sublist) for sublist in tmp]
print('\n'.join(test))

<a href="http://www.nytimes.com/2007/10/25/arts/design/25vide.html">"Movable
 Type"</a>, a work of digital art on display in the lobby of the New
 York Times building, uses Beautiful Soup to scrape news feeds.

<li>Jiabao Lin's <a href="https://github.com/BlankerL/DXY-COVID-19-Crawler">DXY-COVID-19-Crawler</a>
uses Beautiful Soup to scrape a Chinese medical site for information
about COVID-19, making it easier for researchers to track the spread
of the virus. (Source: <a href="https://blog.tidelift.com/how-open-source-software-is-fighting-covid-19">"How open source software is fighting COVID-19"</a>)

<li>Reddit uses Beautiful Soup to <a href="https://github.com/reddit/reddit/blob/85f9cff3e2ab9bb8f19b96acd8da4ebacc079f04/r2/r2/lib/media.py">parse
a page that's been linked to and find a representative image</a>.

<li>Alexander Harrowell uses Beautiful Soup to <a href="http://www.harrowell.org.uk/viktormap.html">track the business
 activities</a> of an arms merchant.

<li>The developers 

## Advanced Example

We want to scrape the information from advertisements for data scientists from 'indeed.com'. Let's go!

In [45]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd 
import time

In [46]:
URL = 'https://jp.indeed.com/jobs?q=%E3%83%87%E3%83%BC%E3%82%BF%E3%82%B5%E3%82%A4%E3%82%A8%E3%83%B3%E3%83%86%E3%82%A3%E3%82%B9%E3%83%88'

# conducting a request of the stated URL above;
# the timeout keeps the cell from hanging forever on an unresponsive server
page = requests.get(URL, timeout=30)

# fail loudly on an HTTP error (e.g. 403/404) instead of silently scraping
# an error page in the cells below
page.raise_for_status()

In [47]:
# specifying a desired format of 'page' using
# the html parser. This allows Python to read
# the various components of the page, rather
# than treating it as one long string
# NOTE: this rebinds 'soup', which held the Beautiful Soup homepage until now
soup = BeautifulSoup(page.text, 'html.parser')
print(soup.prettify())

ited{color:#fff}.icl-Button--primary:active,.icl-Button--primary:focus,.icl-Button--primary:hover,.icl-Button:active,.icl-Button:focus,.icl-Button:hover{color:#fff;text-decoration:none}[dir] .icl-Button--primary:active,[dir] .icl-Button--primary:focus,[dir] .icl-Button--primary:hover,[dir] .icl-Button:active,[dir] .icl-Button:focus,[dir] .icl-Button:hover{background-color:#1497ff;border-color:#1497ff}.icl-Button--primary:focus,.icl-Button:focus{-webkit-box-shadow:0 0 0 2px #fff,0 0 0 4px #085ff7;outline:0}[dir] .icl-Button--primary:focus,[dir] .icl-Button:focus{box-shadow:0 0 0 2px #fff,0 0 0 4px #085ff7}.icl-Button--secondary{box-sizing:border-box;display:inline-block;color:#085ff7;-webkit-font-smoothing:antialiased;overflow:hidden;text-overflow:ellipsis;text-decoration:none;white-space:nowrap;-webkit-highlight:none;-webkit-tap-highlight-color:rgba(0,0,0,0);-webkit-touch-callout:none;-webkit-appearance:none;-moz-appearance:none;appearance:none;-webkit-user-select:none;-moz-user-select

### Withdrawing basic elements of data

In [48]:
def extract_job_title_from_result(soup):
    """Return the job titles found on a parsed search-result page.

    Every ``<div class="row">`` is searched for ``<a>`` tags whose
    ``data-tn-element`` attribute equals ``"jobTitle"``; the ``title``
    attribute of each such link is collected in the order encountered.

    Parameters
    ----------
    soup : object offering ``find_all(name=..., attrs=...)``
        The parsed page (a BeautifulSoup object in this notebook).

    Returns
    -------
    list
        One entry per matching link; empty if nothing matches.

    Notes
    -----
    The attribute is read by subscription (``link["title"]``), so a matching
    link without a ``title`` attribute makes the lookup fail.
    """
    return [
        link["title"]
        for row in soup.find_all(name='div', attrs={"class": "row"})
        for link in row.find_all(name="a", attrs={"data-tn-element": "jobTitle"})
    ]

extract_job_title_from_result(soup)

['データサイエンティスト / Data Labs',
 'データサイエンティスト(2022年度新卒採用)\u3000',
 'データサイエンティスト',
 'データサイエンティスト / データ分析 / データ・ドリブン施策提案',
 '研究開発分野 募集ポジション',
 'データサイエンティスト・データアナリスト',
 'データサイエンティスト(データベース・スペシャリスト) / Data Scientist (Database Specialist)',
 'データサイエンティスト/データアナリスト(データ分析)',
 'データサイエンティスト',
 'データサイエンティスト',
 'AI/データ領域のソリューション営業',
 'データサイエンティスト',
 'データサイエンティスト',
 'データサイエンティスト\xa0',
 'データサイエンティスト、データ分析官(キャリアチェンジ)',
 '未経験歓迎lデータサイエンティスト']

Perfect! All 16 jobs are listed here.

### Company name
Company names are a bit tricky, as most appear in `<span>` tags with "class":"company". Rarely, however, they are housed in `<span>` tags with "class":"result-link-source".

The `if/else` statement helps to extract the company info from either of these places. Company names are output with a lot of white space around them, so adding `.strip()` at the end helps to remove this when extracting the info.

In [49]:
def extract_company_from_result(soup):
    """Return the company names found on a parsed search-result page.

    For each ``<div class="row">`` the ``<span class="company">`` tags are
    used; only when a row has none of those are its
    ``<span class="result-link-source">`` tags used instead. The text of each
    chosen span is stripped of surrounding whitespace.

    Parameters
    ----------
    soup : object offering ``find_all(name=..., attrs=...)``
        The parsed page (a BeautifulSoup object in this notebook).

    Returns
    -------
    list of str
        Stripped span texts in the order encountered; a row matching neither
        class contributes nothing.
    """
    names = []
    for row in soup.find_all(name="div", attrs={"class": "row"}):
        spans = row.find_all(name="span", attrs={"class": "company"})
        if not spans:
            # fall back to the rarer markup variant
            spans = row.find_all(name="span", attrs={"class": "result-link-source"})
        names.extend(span.text.strip() for span in spans)
    return names

extract_company_from_result(soup)

['LINE株式会社',
 'パーソルキャリア株式会社',
 '株式会社ブレインパッド',
 '株式会社レッジ',
 '株式会社資生堂',
 'L2株式会社',
 '中外製薬グループ',
 'コグラフ株式会社',
 '株式会社ジーユーエヌ',
 '社会福祉法人善光会',
 '株式会社エイアイ・フィールド(AIフィールド)',
 '合同会社DILIGENCE',
 'キヤノンITソリューションズ株式会社',
 '株式会社aiforce solutions',
 '株式会社エイアイ・フィールド(AIフィールド)',
 '株式会社エイアイ・フィールド(AIフィールド)']

### Location
Locations are located under the `<span>` tags. Span tags are sometimes nested within each other, such that the location text may sometimes be within "class":"location" attributes, or nested in "itemprop":"addressLocality".

In [50]:
def extract_location_from_result(soup):
    """Return the location texts found on a parsed search-result page.

    Both ``<div class="location">`` and ``<span class="location">`` tags are
    collected with a single search, so the result follows the order in which
    the tags appear on the page. (Collecting all divs first and all spans
    afterwards would shuffle the entries whenever a page mixes both kinds,
    and the list could then no longer be matched up with the other per-job
    lists.)

    Parameters
    ----------
    soup : object offering ``find_all(name, attrs=...)``
        The parsed page (a BeautifulSoup object in this notebook).

    Returns
    -------
    list of str
        The ``.text`` of every matching tag, unmodified; empty if none match.
    """
    # passing a list of tag names matches any of them in one pass
    tags = soup.find_all(['div', 'span'], attrs={'class': 'location'})
    return [tag.text for tag in tags]

extract_location_from_result(soup)

['新宿区 新宿',
 '千代田区 大手町',
 '港区 白金台',
 '品川区 五反田駅',
 '千代田区',
 '千代田区',
 '横浜市 西区',
 '港区 表参道駅',
 '中央区',
 '品川区 五反田駅',
 '新宿区 新宿',
 '港区',
 '品川区 五反田駅',
 '渋谷区 神宮前',
 '港区 三田',
 '千代田区 大手町']

### Salary
It is difficult to extract 'Salary' info from job postings. Most postings don't contain any salary info at all.

Some salaries are housed under the `<nobr>` tags, while others are under `<div>` tags, "class":"sjcl" and are under separate nested `<div>` tags with no attributes. `try/except` statements are particularly helpful in withdrawing this info:

In [51]:
def extract_salary_from_result(soup):
    """Return the salary texts found on a parsed search-result page.

    Collects the text of every ``<span class="salaryText">`` tag, stripped of
    surrounding whitespace. Uses ``find_all`` (the current Beautiful Soup
    spelling, as in the other extractors) rather than the old ``findAll``
    alias.

    Parameters
    ----------
    soup : object offering ``find_all(name, attrs=...)``
        The parsed page (a BeautifulSoup object in this notebook).

    Returns
    -------
    list of str
        One entry per salary span found. Postings without such a span add
        nothing, so the list can be shorter than the list of job titles.
    """
    spans = soup.find_all('span', attrs={'class': 'salaryText'})
    return [span.text.strip() for span in spans]

extract_salary_from_result(soup)

['年収 350万 ~ 450万円',
 '月給 34万 ~ 84万円',
 '月給 42万 ~ 84万円',
 '月給 25万 ~ 60万円',
 '年収 350万 ~ 800万円',
 '月給 27万 ~ 80万円',
 '年収 400万 ~ 700万円',
 '年収 350万 ~ 700万円']

### Summary

In [52]:
def extract_summary_from_result(soup):
    """Return the job summaries found on a parsed search-result page.

    Collects the text of every ``<div class="summary">`` tag with all newline
    characters removed. Uses ``find_all`` (the current Beautiful Soup
    spelling, as in the other extractors) rather than the old ``findAll``
    alias.

    Parameters
    ----------
    soup : object offering ``find_all(name, attrs=...)``
        The parsed page (a BeautifulSoup object in this notebook).

    Returns
    -------
    list of str
        One single-line string per summary div; empty if none match.
    """
    divs = soup.find_all('div', attrs={'class': 'summary'})
    return [div.text.replace('\n', '') for div in divs]

extract_summary_from_result(soup)

['や利用動向などのデータを横断的に処理し、 より精密なデータ分析と情報フィルタリングを提供し、全サービスのデータの効率的な... 価値の最大化をデータ分析によって実現できるデータ分析者を募集...',
 'データツールの開発 * 分析プラットフォーム開発 データ... 関連キーワード: データアナリスト・データサイエンティスト・データ分析エンジニア・ビッグデータ解析データスペシャリスト...',
 '【職種名】 データサイエンティスト 【勤務地】 白金台... ます。 深層学習や機械学習関連の文献調査・実装を行い、データサイエンティストや コンサルタントと連携しながら、ソリューシ...',
 'データ・ドリブン施策提案などの業務をお願いいたします。 ・Webログデータ、事業データのデータ分析業務 ・データ/ドリブ... 経験 ・データアナリスト、データサイエンティスト、データエン...',
 '心理生理学研究 • 美容機器開発 • 化粧品容器・用具開発 • AI技術者 • データサイエンティスト • Smartphone向けアプリエンジニア 化粧品の安全性評価 全身毒性...',
 '析に必要なデータ集計及び最適なツールの運用 ・広告効果や位置情報などのデータ解析や仮説立案 ・データ解析を軸としたシステ... データ分析、データマイニング業務経験 ・顧客情報のデータ管理...',
 '提供する。 ・データベース上にあるデータに対して、解析技術者とともに必要なデータセットを定義し、解析技術者が必要なデータ... 扱うデータもオミクスデータ・製造データ、リアルワールドデータ...',
 '日時など分析に必要なデータを確認） ③データ抽出作業（SQL... 伴い、データサイエンティスト／データアナリストとして 一緒に会社を成長させていく仲間を募集しています。 配属部署 データ...',
 'データサイエンティスト 職務内容 データの分析業務 求めるスキル データ分析会社、シンクタンク、事業会社でのデータ分析業務経験 データ分析関連知識 Python、R、SQL、SAS...',
 '【選考方法】 書類選考→面接(1~2回)',
 'リューション営業/AI/データ/機械学習/データ分析/DMP/提案営業 一言 営業職の募集です。 AI・データサイエンス... 案内容：AI