# Intro to Webscraping with Python

## Load Packages

In [2]:
import requests
import os
from bs4 import BeautifulSoup
import json #, jsonlines
import pandas as pd
import re

os.getcwd()
ROOT_DIR = os.getcwd()
ROOT_DIR

'/Users/yxu6/Alego/CSSR-Workshop-Scrapy'

## Example 1
## Scrape Classical Quotes from GoodReads Page

In [3]:
url = 'https://www.goodreads.com/quotes/tag/classic-literature'
response = requests.get(url)

In [None]:
## Example 2
## Examine HTTP Response

In [4]:
response.status_code

200

In [5]:
response.headers

{'Server': 'Server', 'Date': 'Thu, 15 Feb 2024 19:10:56 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'x-amz-rid': 'FBD32RBRR4S6FBMCZP27', 'Set-Cookie': 'ccsid=629-9055436-2402078; path=/; expires=Mon, 15 Feb 2044 19:10:56 -0000, locale=en; path=/, _session_id2=5187483e79d47e48ea21944ed80b0bec; path=/; expires=Fri, 16 Feb 2024 01:10:56 -0000; HttpOnly', 'X-Frame-Options': 'SAMEORIGIN', 'X-XSS-Protection': '1; mode=block', 'X-Content-Type-Options': 'nosniff, nosniff', 'Strict-Transport-Security': 'max-age=63072000; includeSubDomains; preload', 'ETag': 'W/"2401fa3a160544496b4ff08d8829412c"', 'Cache-Control': 'max-age=0, private, must-revalidate', 'X-Request-Id': 'FBD32RBRR4S6FBMCZP27', 'X-Runtime': '0.263834', 'Content-Encoding': 'gzip', 'Vary': 'User-Agent,Content-Type,Accept-Encoding,User-Agent'}

In [6]:
response.text

'<!DOCTYPE html>\n<html class="desktop withSiteHeaderTopFullImage\n">\n<head>\n  <title>Classic Literature Quotes (174 quotes)</title>\n\n<meta content=\'174 quotes have been tagged as classic-literature: Yukio Mishima: ‘Young people get the foolish idea that what is new for them must be new for everybody ...\' name=\'description\'>\n<meta content=\'telephone=no\' name=\'format-detection\'>\n<link href=\'https://www.goodreads.com/quotes/tag/classic-literature\' rel=\'canonical\'>\n\n\n\n    <script type="text/javascript"> var ue_t0=window.ue_t0||+new Date();\n </script>\n  <script type="text/javascript">\n    var ue_mid = "A1PQBFHBHS6YH1";\n    var ue_sn = "www.goodreads.com";\n    var ue_furl = "fls-na.amazon.com";\n    var ue_sid = "629-9055436-2402078";\n    var ue_id = "FBD32RBRR4S6FBMCZP27";\n\n    (function(e){var c=e;var a=c.ue||{};a.main_scope="mainscopecsm";a.q=[];a.t0=c.ue_t0||+new Date();a.d=g;function g(h){return +new Date()-(h?0:a.t0)}function d(h){return function(){a.q.pu

In [7]:
response.url

'https://www.goodreads.com/quotes/tag/classic-literature'

## Parsing html with BeautifuSoup

In [8]:
# option1:
all_text = BeautifulSoup(response.text, 'lxml')
# option2:
all_text = BeautifulSoup(response.text, 'html.parser')
all_text = "".join(all_text.get_text()).strip()
all_text

'Classic Literature Quotes (174 quotes)\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHomeMy BooksBrowse ▾RecommendationsChoice AwardsGenresGiveawaysNew ReleasesListsExploreNews & InterviewsGenresArtBiographyBusinessChildren\'sChristianClassicsComicsCookbooksEbooksFantasyFictionGraphic NovelsHistorical FictionHistoryHorrorMemoirMusicMysteryNonfictionPoetryPsychologyRomanceScienceScience FictionSelf HelpSportsThrillerTravelYoung AdultMore GenresCommunity ▾GroupsQuotesAsk the AuthorSign InJoinSign upView profileProfileFriendsGroupsDiscussionsCommentsReading ChallengeKindle Notes & HighlightsQuotesFavorite genresFriends’ recommendationsAccount settingsHelpSign outHomeMy BooksBrowse ▾RecommendationsChoice AwardsGenresGiveawaysNew ReleasesListsExploreNews & InterviewsGenresArtBiographyBusinessChildren\'sChristianClassicsComicsCookbooksEbooksFantasyFictionGraphic NovelsHistorical FictionHistoryHorrorMemoirMusicMysteryNonfictionPoetryPsychologyRomanceScien

In [None]:
## Structural Parsing

In [9]:
soup = BeautifulSoup(response.text, 'html.parser')
title = soup.find("h1").text
title

'Classic Literature Quotes'

In [10]:
quote_elements = soup.find_all("div", class_='quoteText')
quote_elements

[<div class="quoteText">
       “Young people get the foolish idea that what is new for them must be new for everybody else too. No matter how unconventional they get, they're just repeating what others before them have done.”
     <br/>
   ―
   <span class="authorOrTitle">
     Yukio Mishima,
   </span>
 <span id="quote_book_link_62823">
 <a class="authorOrTitle" href="/work/quotes/1343497">After the Banquet</a>
 </span>
 </div>,
 <div class="quoteText">
       “You are a wonderful creation. You know more than you think you know, just as you know less than you want to know.”
     <br/>
   ―
   <span class="authorOrTitle">
     Oscar Wilde,
   </span>
 <span id="quote_book_link_5297">
 <a class="authorOrTitle" href="/work/quotes/1858012">The Picture of Dorian Gray</a>
 </span>
 </div>,
 <div class="quoteText">
       “LONDON. Michaelmas Term lately over, and the Lord Chancellor sitting in Lincoln’s Inn Hall. Implacable November weather. As much mud in the streets as if the waters had b

In [12]:
quote_text = [i.text for i in quote_elements]
quote_text
len(quote_text)

30

## Convert List to Dataframe

In [13]:
quote_df = pd.DataFrame(quote_text,columns=['text'])
quote_df

Unnamed: 0,text
0,\n “Young people get the foolish idea tha...
1,\n “You are a wonderful creation. You kno...
2,"\n “LONDON. Michaelmas Term lately over, ..."
3,"\n “There is one thing, Emma, which a man..."
4,"\n “When you reread a classic, you do not..."
5,\n “Seaward ho! Hang the treasure! It's t...
6,"\n “At the moment when her eyes closed, w..."
7,\n “To HelenI saw thee once-once only-yea...
8,"\n “Say, you told me you thought Les Mise..."
9,"\n “Once upon a time, they say, there was..."


In [None]:
## Clean Text

In [14]:
quote_df['text'] = quote_df.text.str.replace('\n','', regex=True).str.replace('\s+',' ', regex=True)
quote_df

Unnamed: 0,text
0,“Young people get the foolish idea that what ...
1,“You are a wonderful creation. You know more ...
2,"“LONDON. Michaelmas Term lately over, and the..."
3,"“There is one thing, Emma, which a man can al..."
4,"“When you reread a classic, you do not see mo..."
5,“Seaward ho! Hang the treasure! It's the glor...
6,"“At the moment when her eyes closed, when all..."
7,“To HelenI saw thee once-once only-years ago;...
8,"“Say, you told me you thought Les Miserables ..."
9,"“Once upon a time, they say, there was a girl..."


## Exercise
#### In the webpage of the example 1, each quote receives "likes" from users.
#### Try to adpat the code above and extract the "likes", save it as dataframe

In [16]:
quote_likes1 = soup.find_all("div", class_='quoteFooter')
quote_likes2 = [t.find("a", class_='smallText').text for t in quote_likes1]
pd.DataFrame(quote_likes2,columns=['like_counts'])

Unnamed: 0,like_counts
0,242 likes
1,176 likes
2,119 likes
3,100 likes
4,90 likes
5,69 likes
6,57 likes
7,53 likes
8,39 likes
9,39 likes


In [21]:
quote_df = quote_df.assign(like_counts=quote_likes2)
# quote_df = quote_df.assign(like_num = quote_df.like_counts.str.split(' '))
quote_df[['like_num','like_content']] = quote_df.like_counts.str.split(' ', expand=True)
quote_df

Unnamed: 0,text,like_counts,like_num,like_content
0,“Young people get the foolish idea that what ...,242 likes,242,likes
1,“You are a wonderful creation. You know more ...,176 likes,176,likes
2,"“LONDON. Michaelmas Term lately over, and the...",119 likes,119,likes
3,"“There is one thing, Emma, which a man can al...",100 likes,100,likes
4,"“When you reread a classic, you do not see mo...",90 likes,90,likes
5,“Seaward ho! Hang the treasure! It's the glor...,69 likes,69,likes
6,"“At the moment when her eyes closed, when all...",57 likes,57,likes
7,“To HelenI saw thee once-once only-years ago;...,53 likes,53,likes
8,"“Say, you told me you thought Les Miserables ...",39 likes,39,likes
9,"“Once upon a time, they say, there was a girl...",39 likes,39,likes


In [24]:
soup = BeautifulSoup(response.content, 'html.parser')
link_to_book = soup.find_all('a', class_='authorOrTitle')
links = [(x.get('href'), x.string) for x in link_to_book]
links_df = pd.DataFrame(links,columns=['url','book_name'])
links_df = links_df.assign(url='https://www.goodreads.com'+links_df.url)
links_df


Unnamed: 0,url,book_name
0,https://www.goodreads.com/work/quotes/1343497,After the Banquet
1,https://www.goodreads.com/work/quotes/1858012,The Picture of Dorian Gray
2,https://www.goodreads.com/work/quotes/2960365,Bleak House
3,https://www.goodreads.com/work/quotes/3360164,Emma
4,https://www.goodreads.com/work/quotes/3077988,Treasure Island
5,https://www.goodreads.com/work/quotes/3043569,The Hunchback of Notre-Dame
6,https://www.goodreads.com/work/quotes/10920,Betsy and Joe
7,https://www.goodreads.com/work/quotes/755652,Dear and Glorious Physician
8,https://www.goodreads.com/work/quotes/1296784,It Can't Happen Here
9,https://www.goodreads.com/work/quotes/288738,The Aeneid


## Scrape URLs inside the page

In [None]:
soup = BeautifulSoup(response.content, 'html.parser')
link_to_book = soup.find_all('a', class_='authorOrTitle')
links = [(x.get('href'), x.string) for x in link_to_book]
links_df = pd.DataFrame(links,columns=['url','book_name'])
links_df = links_df.assign(url='https://www.goodreads.com/'+links_df.url)

## Example2
## Download PDF File

In [12]:
url = "https://files.wri.org/d8/s3fs-public/2021-10/transformations-equitable-sustainable-cities.pdf?VersionId=eDW1GgIXYhwUmdNluUMul9LuMsns3MRX"

In [14]:
response = requests.get(url)
with open(ROOT_DIR+'/pdf_test.pdf', 'wb') as fd:
    fd.write(response.content)