In [1]:
import pandas as pd
import numpy as np
import sqlite3 as sql
import requests
import json
import re
import pprint
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import time
import os
from bs4 import BeautifulSoup as bs
from datetime import datetime

pp = pprint.PrettyPrinter()

In [97]:
def get_review_info(review_id, review_link, http):
    """
    some text
    """
    df = pd.DataFrame([])
    
    #pattern = r"(?<=albums/)([0-9]*)" #thought that was review id but it seems like they got rid of it... unfortunate
    #m = re.search(pattern, review_link)[0]

    response = http.get(review_link)
    soup = bs(response.text, "html.parser")
    
    df.loc[review_id, "album"] = soup.find("h1", class_="single-album-tombstone__review-title").text
    df.loc[review_id, "artist"] = ", ".join([a.text for a in soup.find("ul", class_="single-album-tombstone__artist-links")])
    df.loc[review_id, "score"] = float(soup.find("span", class_="score").contents[0])
    df.loc[review_id, "author"] = soup.find("a", class_="authors-detail__display-name").text
    if soup.find("span", class_="authors-detail__title") != None:
        df.loc[review_id, "author_type"] = soup.find("span", class_="authors-detail__title").text
    df.loc[review_id, "genre"] = ", ".join([a.text for a in soup.find_all("a", class_="genre-list__link")])
    df.loc[review_id, "review_date"] = datetime.strptime(soup.find("time", class_="pub-date")["datetime"], "%Y-%m-%dT%H:%M:%S")
    if soup.find("li", class_="labels-list__item") != None:
        df.loc[review_id, "label"] = soup.find("li", class_="labels-list__item").contents
    df.loc[review_id, "release_year"] = soup.find("span", "single-album-tombstone__meta-year").contents[-1]
    if soup.find("p", "bnm-txt") != None:
        df.loc[review_id, "special_label"] = soup.find("p", "bnm-txt").text
    df.loc[review_id, "content"] = "\n".join([a.text for a in soup.find("div", class_="review-detail__article-content").find_all("p")])
    df.loc[review_id, "url"] = review_link
    return df
    
def find_albums(url, http):
    """
    some text
    """
    reviews = []
    response = http.get(url)
    soup = bs(response.text, "html.parser")
    spoon = soup.find_all("a", class_="review__link")
    for element in spoon:
        review_link = "https://pitchfork.com" + element["href"]  
        reviews.append(review_link)
    return reviews

def scrape_pitchfork(verbose=True, number_of_pages = 5):
    """
    some text
    """
    
    time_before = time.time()
    
    
    #### setup    
    base_url = "https://pitchfork.com/reviews/albums/?page="
        
    #### retry strategy
    retry_strategy = Retry(
        total=3,
        status_forcelist=[429, 500, 502, 503, 504],
        method_whitelist=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    http = requests.Session()
    http.mount("https://", adapter)
    http.mount("http://", adapter)
    
    #### actual implementation
    df = pd.DataFrame([])
    reviews = []
    review_id=1
    for i in range(1, number_of_pages+1):
        url = base_url + str(i)
        review_links = find_albums(url, http)
        for link in review_links:
            df = df.append(get_review_info(review_id, link, http))
            review_id += 1
    
            if verbose:
                print (f"working on page number {i} of {number_of_pages} dealing with review number {review_id}", end="\r")

        if i%50 == 0:
            ### Save at every 50th step because sometimes the algorithm breaks
            df.to_csv(f'1 to {i}.csv')
            print(f"last album was {review_links[-1]}")
            
    #### timer for final print
    elapsed_time = time.time() - time_before
    print(f"this approach took {elapsed_time} seconds for a total of {review_id} reviews")
    
    return df


The last page on pitchfork as og August 19th 2020 is 1900.

That's about 22800 reviews and can take up to 6/7 hours

In [98]:
df = scrape_pitchfork(number_of_pages = 1900)

last album was https://pitchfork.com/reviews/albums/449-automatic-writing/
last album was https://pitchfork.com/reviews/albums/8519-lovers-lead-the-way/
last album was https://pitchfork.com/reviews/albums/967-more-nipples/
last album was https://pitchfork.com/reviews/albums/4631-light-magic/
last album was https://pitchfork.com/reviews/albums/4447-versus/1599
last album was https://pitchfork.com/reviews/albums/3702-neil-michael-hagerty/
last album was https://pitchfork.com/reviews/albums/3723-left-for-dead-in-malaysia/
this approach took 4428.469939947128 seconds for a total of 22788 reviews


In [None]:
#df.to_csv(f'complete list.csv')