# Web Scraping: EveryNoise.com

Source: <http://everynoise.com/everynoise1d.cgi?scope=all>



In [1]:
from bs4 import BeautifulSoup
import requests, sys
from collections import defaultdict
from multiprocessing.pool import ThreadPool
from tqdm import tqdm
import json

In [2]:
## Request HTML

# Fetch the genre listing page. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() stops here on an HTTP error
# instead of letting the next cell parse an error page.
r = requests.get('http://everynoise.com/everynoise1d.cgi?vector=popularity&scope=all', timeout=30)
r.raise_for_status()

In [3]:
## Parse Genre HTML Elements
# Build the parse tree from the response text and collect every <td class="note">
# cell from the first table in the document body.
soup = BeautifulSoup(r.text, "html.parser")
table = soup.body.table
elements = table.find_all('td', 'note')
# Only the odd-indexed cells are read; the value kept for each one is the
# first child of the cell's first child.
genres = [cell.contents[0].contents[0] for cell in elements[1::2]]
print("Total genres:", len(genres))
print("First ten genres:\n", genres[:10])

Total genres: 5071
First ten genres:
 ['pop', 'dance pop', 'rap', 'pop dance', 'pop rap', 'rock', 'post-teen pop', 'latin', 'hip hop', 'trap']


In [4]:
## Request Playlist HTMLs and extract link

def requestPlaylists(genres, links, threadName):
    """Fetch the Spotify playlist link for each genre and merge them into ``links``.

    For every genre, the genre's everynoise1d.cgi page is requested and the
    ``href`` of the element with id ``spotifylink`` is recorded.

    Args:
        genres: Sequence of genre names to look up.
        links: Dict updated in place with ``{genre: href}`` once every genre
            in this call has been processed.
        threadName: Label used for the progress bar and the completion message.

    Genres whose page cannot be fetched, or whose page has no ``spotifylink``
    href, are reported and skipped so that one bad genre does not discard the
    results of the whole batch.
    """
    spotifyLinks = dict()
    total = len(genres)
    with tqdm(total=total, position=1, desc=threadName) as pbar:
        for done, genre in enumerate(genres, start=1):
            try:
                # Pass the genre via `params` so it is URL-encoded; names that
                # contain '&' (e.g. "r&b") would otherwise cut the `root`
                # value short and query the wrong genre.
                playlist_r = requests.get(
                    'http://everynoise.com/everynoise1d.cgi',
                    params={'root': genre, 'scope': 'all'},
                    timeout=30,
                )
                playlist_r.raise_for_status()
            except requests.RequestException as err:
                tqdm.write("{}: request for genre {!r} failed: {}".format(threadName, genre, err))
            else:
                anchor = BeautifulSoup(playlist_r.text, "html.parser").find(id='spotifylink')
                href = anchor.get('href') if anchor is not None else None
                if href:
                    spotifyLinks[genre] = href
                else:
                    tqdm.write("{}: no spotifylink found for genre {!r}".format(threadName, genre))
            # Advance the bar in steps of 50, plus one final step for the remainder.
            if done % 50 == 0 or done == total:
                pbar.update(done - pbar.n)
    links.update(spotifyLinks)
    print(threadName, ' completed.')

In [5]:
## Split genres into chunks

# Maximum number of genres handed to a single worker.
CHUNK_SIZE = 1000

# Half-open (start, end) index pairs covering `genres`; the last pair is
# clamped to len(genres). Stepping through range() directly avoids the empty
# trailing chunk that appeared whenever len(genres) was an exact multiple of
# the chunk size.
chunk_indices = [
    (start, min(start + CHUNK_SIZE, len(genres)))
    for start in range(0, len(genres), CHUNK_SIZE)
]

# Keep a single (empty) chunk when there are no genres at all, so that a pool
# sized by len(chunk_indices) is never asked for zero workers.
if not chunk_indices:
    chunk_indices.append((0, 0))


In [6]:
## Create a thread for each chunk to fetch links

# Shared result dict; each worker merges its own {genre: link} mapping into it.
links = dict()

# One worker thread per chunk.
pool = ThreadPool(len(chunk_indices))

# Keep the AsyncResult handles: apply_async() stores any exception raised in
# a worker on the handle, and it is only surfaced when .get() is called.
pending = [
    pool.apply_async(requestPlaylists, args=(genres[start:end], links, "Thread {}".format(t + 1)))
    for t, (start, end) in enumerate(chunk_indices)
]

pool.close()
pool.join()

# Re-raise the first worker failure, if any, instead of silently ending up
# with an incomplete `links` dict.
for result in pending:
    result.get()

## Been trying to make each progress bar stay on its line
# NOTE(review): every worker creates its bar with the same tqdm `position`,
# which is likely why the bars do not stay on separate lines - confirm.


Thread 2:   0%|          | 0/1000 [00:00<?, ?it/s][A
Thread 1:   0%|          | 0/1000 [00:00<?, ?it/s][A
Thread 3:   0%|          | 0/1000 [00:00<?, ?it/s][A
Thread 6:   0%|          | 0/71 [00:00<?, ?it/s][A
Thread 4:   0%|          | 0/1000 [00:00<?, ?it/s][A
Thread 5:   0%|          | 0/1000 [00:00<?, ?it/s][A
Thread 3:   5%|▌         | 50/1000 [06:35<2:05:13,  7.91s/it][A
Thread 4:   5%|▌         | 50/1000 [06:52<2:10:38,  8.25s/it][A
Thread 5:   5%|▌         | 50/1000 [07:10<2:16:26,  8.62s/it][A
Thread 6:  70%|███████   | 50/71 [07:11<03:01,  8.63s/it][A
Thread 2:   5%|▌         | 50/1000 [07:14<2:17:28,  8.68s/it][A
Thread 1:   5%|▌         | 50/1000 [07:26<2:21:16,  8.92s/it][A
Thread 6: 100%|██████████| 71/71 [10:13<00:00,  8.64s/it]
Thread 6  completed.

Thread 3:  10%|█         | 100/1000 [12:53<1:57:06,  7.81s/it][A
Thread 4:  10%|█         | 100/1000 [13:14<2:00:59,  8.07s/it][A
Thread 5:  10%|█         | 100/1000 [13:33<2:04:54,  8.33s/it][A
Thread 2:  10

In [7]:
# Sanity check: number of genres for which a playlist link was collected.
link_total = len(links)
print(link_total)

5071


In [10]:
# Persist the genre -> playlist link mapping as indented JSON.
serialized_links = json.dumps(links, indent=4)
with open('../data/spotify_links.json', 'w') as outfile:
    outfile.write(serialized_links)