In [5]:
# Chart page for the year 2000 (the first of the yearly pages).
url = (
    "https://www.uk-charts.top-source.info"
    "/top-100-2000.shtml"
)

In [6]:
from bs4 import BeautifulSoup
import requests

In [7]:
# Fetch the page. The timeout stops the cell from hanging forever if the
# server never answers (requests waits indefinitely when no timeout is given).
response = requests.get(url, timeout=10)
# Display the HTTP status code (200 means the page was retrieved).
response.status_code

200

In [8]:
# Parse the raw bytes of the response with the "html.parser" backend.
soup = BeautifulSoup(response.content, features="html.parser")

#### how to scrape multiple pages ? 

Looking at the site's URLs, there is a separate page for each year:
2000, 2001, 2002, 2003 ... 2009.

So we will define a start variable that goes into the URL
(having looked at what the next page's URL looks like).

example from last time : 
    
- iterations = range(1,502,50)


In [11]:
# Years 2000 through 2009 inclusive (the stop value 2010 is excluded).
iterations = range(2000, 2010)

In [12]:
# Sanity check: print every value the range yields, one per line.
for i in iterations:
    print(i)

2000
2001
2002
2003
2004
2005
2006
2007
2008
2009


In [15]:
# Build the full URL for one year and display it to check the pattern.
# The year is taken explicitly from the range (its last value) instead of
# relying on a loop variable left over from an earlier cell, so this cell
# gives the same result regardless of which cells ran before it.
i = iterations[-1]
f_url = f"https://www.uk-charts.top-source.info/top-100-{i}.shtml"
f_url

'https://www.uk-charts.top-source.info/top-100-2009.shtml'

In [16]:
# Materialise the range as a list to see all the years at once.
list(iterations)

[2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009]

In [17]:
# One URL per chart year, built by filling the year into the page pattern.
url_lst = [
    f"https://www.uk-charts.top-source.info/top-100-{year}.shtml"
    for year in iterations
]

In [18]:
# Display the generated URLs to confirm the pattern yields one page per year.
url_lst

['https://www.uk-charts.top-source.info/top-100-2000.shtml',
 'https://www.uk-charts.top-source.info/top-100-2001.shtml',
 'https://www.uk-charts.top-source.info/top-100-2002.shtml',
 'https://www.uk-charts.top-source.info/top-100-2003.shtml',
 'https://www.uk-charts.top-source.info/top-100-2004.shtml',
 'https://www.uk-charts.top-source.info/top-100-2005.shtml',
 'https://www.uk-charts.top-source.info/top-100-2006.shtml',
 'https://www.uk-charts.top-source.info/top-100-2007.shtml',
 'https://www.uk-charts.top-source.info/top-100-2008.shtml',
 'https://www.uk-charts.top-source.info/top-100-2009.shtml']

### respectful scraping 

In [19]:
from time import sleep 

In [20]:
#simple example 
#for i in range(5):
    #print(i)
    #sleep(3)

In [23]:
#enhance with random integers 
from random import randint
#for i in range(5):
    #print(i)
    #wait_time=randint(1,4)
    #print("i will sleep for "+ str(wait_time) +"seconds")
    #sleep(wait_time)

### putting everything together 

In [25]:
# Download every yearly chart page and keep the raw responses for later parsing.

pages = []

for i in iterations:
    # String form of the year that is inserted into the URL.
    start_at = str(i)
    # Define the URL for this year.
    url = f"https://www.uk-charts.top-source.info/top-100-{start_at}.shtml"
    # Download the page. The timeout keeps one unresponsive request from
    # hanging the whole loop (requests waits indefinitely by default).
    response = requests.get(url, timeout=10)
    # Report the HTTP status of each request.
    print("Status code: " + str(response.status_code))
    # Stop on a 4xx/5xx answer so an error page is never stored and later
    # parsed as if it were chart data.
    response.raise_for_status()
    # Capture the response so it can be parsed in a later cell.
    pages.append(response)
    # Respectful nap of 1 to 3 seconds before the next request.
    wait_time = randint(1, 3)
    sleep(wait_time)

Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200


In [26]:
# Parse the first downloaded page (index 0) to explore its structure.
soup = BeautifulSoup(pages[0].content, features="html.parser")

### next steps - tidying up 
The next thing to think about is whether we can reduce how much useless HTML we have scraped - maybe it's not necessary to store it all in the list?

what we want eventually is song name, artist, year, index

- by inspecting the webpage- identify from the html how to get the information we want for the soup

- paste selector from a chosen song

`#ContentColumn > div > table > tbody > tr:nth-child(1) > td:nth-child(3)`

- same process for artist

`#ContentColumn > div > table > tbody > tr:nth-child(1) > td:nth-child(2)`

In [31]:
# Full selector copied from the browser for the song cell of the first row;
# changing the tr:nth-child number moves down the rows on the page.
first_song_selector = "#ContentColumn > div > table > tbody > tr:nth-child(1) > td:nth-child(3)"
soup.select(first_song_selector)

[<td>Who Let The Dogs Out</td>]

In [45]:
# A trimmed selector reaches the same cells; get_text() returns only the text.
# Index 0 is the first song - changing the index goes to the next song.
soup.select("td:nth-child(3)")[0].get_text()

'Who Let The Dogs Out'

In [None]:
# Without an index, the trimmed selector returns every third-column cell (the songs).
soup.select("td:nth-child(3)")

In [None]:
# Same approach for the singers: every second-column cell.
soup.select("td:nth-child(2)")

### putting it all together 
this is what we need 

- soup.select("td:nth-child(3)")
- soup.select("td:nth-child(2)")
- soup=BeautifulSoup(pages[0].content, "html.parser")

In [57]:
# Plus the list of downloaded responses, displayed here as a reminder.
pages
# Plan for the next cell:
#   1. for every page, make a soup
#   2. apply the select calls
#   3. extract the text from the results
#   4. append each song and singer to its list

[<Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>]

In [58]:
# Parse every downloaded page and collect the song title and singer of each row.
song = []
singer = []
pages_parsed = []

for page in pages:
    # Parse once and keep the soup so it can be inspected later.
    parsed = BeautifulSoup(page.content, "html.parser")
    pages_parsed.append(parsed)

    songs_html = parsed.select("td:nth-child(3)")    # third column: song
    singers_html = parsed.select("td:nth-child(2)")  # second column: singer

    # Cells are paired by position, so differing counts would attach songs to
    # the wrong singers; fail with a clear message instead of misaligning rows.
    if len(songs_html) != len(singers_html):
        raise ValueError(
            f"page {len(pages_parsed) - 1}: found {len(songs_html)} song cells "
            f"but {len(singers_html)} singer cells"
        )

    for song_cell, singer_cell in zip(songs_html, singers_html):
        song.append(song_cell.get_text())
        singer.append(singer_cell.get_text())
    

In [59]:
# Sanity check of the totals, in the order (songs, parsed pages, singers).
tuple(len(collected) for collected in (song, pages_parsed, singer))

(1000, 10, 1000)

### Next step 
Now that we have our for loop to collect the information, let's complete the task by converting our lists to a DataFrame.

In [71]:
# Combine the two collected lists into one pandas DataFrame,
# with one column per list, and preview the first 200 rows.

import pandas as pd
songs = pd.DataFrame(
    data={'song': song, 'singer': singer},
)
songs.head(n=200)

Unnamed: 0,song,singer
0,Who Let The Dogs Out,Baha Men
1,It Feels So Good,Sonique
2,The Real Slim Shady,Eminem
3,Rise,Gabrielle
4,Pure Shores,All Saints
...,...,...
195,19 / 2000,Gorillaz
196,Ain't It Funny,Jennifer Lopez
197,Things I've Seen,Spooks
198,Electric Avenue,Eddy Grant
