## WebScraping II: the reckoning

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import math

### Requests

In [2]:
import requests

google = requests.get("https://developers.google.com")
print("Google:", google.status_code)

NBA = requests.post("https://api.sportsdata.io/api/nba/fantasy/json/CurrentSeason", data={})
print("NBA:", NBA.status_code)

rotten_tomato = requests.get("http://api.rottentomatoes.com/api/public/v1.0/lists/movies/box_office.json")
print("Rotten Tomatoes:", rotten_tomato.status_code)

Google: 200
NBA: 401
Rotten Tomatoes: 403


#### Status Codes


* 200: Everything went okay and the result has been returned (if any).
* 301: The server is redirecting you to a different endpoint. This can happen when a company switches domain names, or an endpoint name is changed.
* 400: The server thinks you made a bad request. This happens when you don’t send along the right data, among other things.
* 401: You are not properly authenticated.
* 403: The resource you’re trying to access is forbidden: you don’t have the right permissions to get it.
* 404: The resource you tried to access doesn't exist.
* 503: The server can't handle the request.


In [3]:
r = requests.get('https://www.imdb.com/search/title/?title_type=feature&release_date=2021-01-01,&user_rating=6.5,&num_votes=100,')
r.status_code

200

In [4]:
r.headers

{'Content-Type': 'text/html;charset=UTF-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Server': 'Server', 'Date': 'Thu, 07 Oct 2021 13:04:19 GMT', 'x-amz-rid': 'BQ4SXJXN8G9FXE2B76CA', 'Set-Cookie': 'uu=eyJpZCI6InV1NzhmZmUyMzg0NDZkNDFlY2EwZmMiLCJwcmVmZXJlbmNlcyI6eyJmaW5kX2luY2x1ZGVfYWR1bHQiOmZhbHNlfX0=; Domain=.imdb.com; Expires=Tue, 25-Oct-2089 16:18:26 GMT; Path=/; Secure, session-id=000-0000000-0000000; Domain=.imdb.com; Expires=Tue, 25-Oct-2089 16:18:26 GMT; Path=/; Secure, session-id-time=2264331858; Domain=.imdb.com; Expires=Tue, 25-Oct-2089 16:18:26 GMT; Path=/; Secure', 'X-Frame-Options': 'SAMEORIGIN', 'Content-Security-Policy': "frame-ancestors 'self' imdb.com *.imdb.com *.media-imdb.com withoutabox.com *.withoutabox.com amazon.com *.amazon.com amazon.co.uk *.amazon.co.uk amazon.de *.amazon.de translate.google.com images.google.com www.google.com www.google.co.uk search.aol.com bing.com www.bing.com", 'Content-Language': 'en-US', 'Strict-Transport-Security': '

In [5]:
r.cookies

<RequestsCookieJar[Cookie(version=0, name='session-id', value='000-0000000-0000000', port=None, port_specified=False, domain='.imdb.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=True, expires=3781095506, discard=False, comment=None, comment_url=None, rest={}, rfc2109=False), Cookie(version=0, name='session-id-time', value='2264331858', port=None, port_specified=False, domain='.imdb.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=True, expires=3781095506, discard=False, comment=None, comment_url=None, rest={}, rfc2109=False), Cookie(version=0, name='uu', value='eyJpZCI6InV1NzhmZmUyMzg0NDZkNDFlY2EwZmMiLCJwcmVmZXJlbmNlcyI6eyJmaW5kX2luY2x1ZGVfYWR1bHQiOmZhbHNlfX0=', port=None, port_specified=False, domain='.imdb.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=True, expires=3781095506, discard=False, comment=None, comment_url=None, rest={}, rfc2109=False)]>

/search/title/?title_type=feature&release_date=2021-01-01,2021-12-31&user_rating=6.5,&num_votes=100,&start=151&ref_=adv_nxt

In [6]:
# firt getting the number of total results

soup = BeautifulSoup(r.content, 'html.parser')

soup.find('div', attrs={'class': 'desc'}).find('span').get_text().split()[-2]

'517'

In [7]:
total_results = int(soup.find('div', attrs={'class': 'desc'}).find('span').get_text().split()[-2])
total_results

517

In [8]:
starts = range(1, total_results, 50) # because, 50 items per page
list(starts)

[1, 51, 101, 151, 201, 251, 301, 351, 401, 451, 501]

In [9]:
# To get all the pages:

movies = []

for start in starts:
    try:
        r = requests.get(f'https://www.imdb.com/search/title/?title_type=feature&release_date=2021-01-01,2021-12-31&user_rating=6.5,&num_votes=100,&start={start}&ref_=adv_nxt')
        soup = BeautifulSoup(r.content, 'html.parser')
        movies += soup.find_all('div', attrs={'class': 'lister-item-content'})
    except:
        print(f'Error on page {start}.')
        continue
    
len(movies)


# To get all the pages:

movies = []

for start in starts:
    
    r = requests.get(f'https://www.imdb.com/search/title/?title_type=feature&release_date=2021-01-01,2021-12-31&user_rating=6.5,&num_votes=100,&start={start}&ref_=adv_nxt')
    
    if r.status_code == 200:
        soup = BeautifulSoup(r.content, 'html.parser')
        movies += soup.find_all('div', attrs={'class': 'lister-item-content'})
    else:
        print(f'Error on page {start}.')
        continue
    
len(movies)

516

In [10]:
movies[0]

<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt2382320/">No Time to Die</a>
<span class="lister-item-year text-muted unbold">(2021)</span>
</h3>
<p class="text-muted">
<span class="certificate">UA</span>
<span class="ghost">|</span>
<span class="runtime">163 min</span>
<span class="ghost">|</span>
<span class="genre">
Action, Adventure, Thriller            </span>
</p>
<div class="ratings-bar">
<div class="inline-block ratings-imdb-rating" data-value="7.6" name="ir">
<span class="global-sprite rating-star imdb-rating"></span>
<strong>7.6</strong>
</div>
<div class="inline-block ratings-user-rating">
<span class="userRatingValue" data-tconst="tt2382320" id="urv_tt2382320">
<span class="global-sprite rating-star no-rating"></span>
<span class="rate" data-no-rating="Rate this" data-value="0" name="ur">Rate this</span>
</span>
<div class="starBarWidget" id="sb_tt2382320">
<div class="rating r

In [13]:
titles = []
ratings = []
genres = []
runtimes = []
links = []

for movie in movies:
    titles.append(movie.find('h3').find('a').get_text())
    ratings.append(movie.find('strong').get_text())
    genres.append(movie.find('span', attrs={'class': 'genre'}).get_text(strip=True))
    links.append('http://www.imdb.com' + movie.find('h3', attrs={'class': 'lister-item-header'}).find('a').get('href'))
    if movie.find('span', attrs={'class': 'runtime'}) != None:
        runtimes.append(movie.find('span', attrs={'class': 'runtime'}).get_text())
        print(titles[-1])
    else:
        print(movie.find('span', attrs={'class': 'runtime'}))
        runtimes.append('Not informed.')

No Time to Die
Dune
Free Guy
The Many Saints of Newark
Venom: Let There Be Carnage
Shang-Chi and the Legend of the Ten Rings
The Last Duel
The Suicide Squad
Cruella
Halloween Kills
The Green Knight
My Little Pony: A New Generation
Titane
The French Dispatch
Black Widow
Jungle Cruise
Pig
Last Night in Soho
Wrath of Man
Copshop
The Eyes of Tammy Faye
The Tomorrow War
Nobody
Zack Snyder's Justice League
The Power of the Dog
CODA
Lamb
Shershaah
The Electrical Life of Louis Wain
Luca
Stillwater
Chehre
Raya and the Last Dragon
Benedetta
Respect
Bhoot Police
Toma
Old Henry
Vivo
The Medium
The Witcher: Nightmare of the Wolf
The Mauritanian
No Sudden Move
In the Heights
Love Story
Ich bin dein Mensch
Shiddat
Blue Bayou
Le bal des folles
Fear Street: 1978
The Last Letter from Your Lover
Belfast
Judas and the Black Messiah
Mass
Midnight
Boku no Hero Academia: World Heroes Mission
The Mitchells vs the Machines
Cherry
Together
The Dig
Verdens verste menneske
Fear Street: 1666
Oxygène
Babardeala cu 

AttributeError: 'NoneType' object has no attribute 'get_text'

In [18]:
x = min(len(titles), len(ratings), len(genres), len(runtimes))

In [19]:
dct = {'title': titles[:x], 'rating': ratings[:x], 'genre': genres[:x], 'runtime': runtimes[:x]}

df = pd.DataFrame.from_dict(dct)
df

Unnamed: 0,title,rating,genre,runtime
0,No Time to Die,7.6,"Action, Adventure, Thriller",163 min
1,Dune,8.4,"Action, Adventure, Drama",155 min
2,Free Guy,7.3,"Action, Adventure, Comedy",115 min
3,The Many Saints of Newark,6.5,"Crime, Drama",120 min
4,Venom: Let There Be Carnage,6.6,"Action, Adventure, Sci-Fi",97 min
...,...,...,...,...
333,Aarkkariyam,6.9,"Crime, Drama",126 min
334,Naandhi,8.3,"Crime, Drama, Thriller",146 min
335,Zombie Reddy,6.9,"Action, Comedy, Horror",130 min
336,Flight,8.3,"Action, Thriller",116 min


In [23]:
links[:1]

['http://www.imdb.com/title/tt2382320/']

In [None]:
<ul class="ipc-inline-list ipc-inline-list--show-dividers ipc-inline-list--inline ipc-metadata-list-item__list-content baseAlt" role="presentation"><li role="presentation" class="ipc-inline-list__item"><a class="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link" rel="" href="/name/nm1560977/?ref_=tt_ov_dr">Cary Joji Fukunaga</a></li></ul>

In [37]:
for link in links[:2]:
    r = requests.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')
    soup1 = soup.find('li', attrs={'class':'ipc-metadata-list__item'})
    directors = [element.get_text() for element in soup1.find_all('a', attrs={'class':'ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link'})]
#     directors = [element.get_text() for element in soup.find_all('a', attrs={'class':'ipc-metadata-list__item'})]
#     directors = [element.get_text() for element in soup.find('div', attrs={'class': 'credit_summary_item'}).find_all('a')]
#     print(soup.find('a', attrs={'class':'ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link'}).get_text())
    print(directors)

['Cary Joji Fukunaga']
['Denis Villeneuve']


In [38]:
directors = []
for movie in movies[:9]:
    directors.append([element.strip().replace(',', '') for element in movie.find('p', attrs={'class':''}).get_text().strip().split('|')[0].split('\n')[1:-1]])

In [39]:
directors

[['Cary Joji Fukunaga'],
 ['Denis Villeneuve'],
 ['Shawn Levy'],
 ['Alan Taylor'],
 ['Andy Serkis'],
 ['Destin Daniel Cretton'],
 ['Ridley Scott'],
 ['James Gunn'],
 ['Craig Gillespie']]

In [40]:
movie.find('p', attrs={'class':''}).get_text().strip().split('|')[0].split('\n')[1:-1]

['Craig Gillespie']