In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from random import randint
from time import sleep

![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# 6.01 Lab | Web Scraping Single Page

#### Business goal:

- Check the `case_study_gnod.md` file.
- Make sure you've understood the big picture of your project:

  - the goal of the company (`Gnod`),
  - their current product (`Gnoosic`),
  - their strategy, and
  - how your project fits into this context.

  Re-read the business case and the e-mail from the CTO, take a look at the flowchart and create an initial Trello board with the tasks you think you'll have to accomplish.

#### Instructions - Scraping popular songs

Your product will take a song as an input from the user and will output another song (the recommendation). In most cases, the recommended song will have to be similar to the inputted song, but the CTO thinks that if the song is on the top charts at the moment, the user will enjoy more a recommendation of a song that's also popular at the moment.

You have find data on the internet about currently popular songs. Billboard maintains a weekly Top 100 of "hot" songs here: [https://www.billboard.com/charts/hot-100](https://www.billboard.com/charts/hot-100).

It's a good place to start! Scrape the current top 100 songs and their respective artists, and put the information into a pandas dataframe.


In [2]:
url = "https://www.popvortex.com/music/charts/top-100-songs.php"

In [3]:
response = requests.get(url)
response.status_code

200

In [4]:
soup = BeautifulSoup(response.content, "html.parser")

In [None]:
soup

In [6]:
artist = []
song = []
genre = []
year = []

num_iter = len("body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper > div.feed-item")

songart = soup.select("body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper > div.feed-item")
genlist = soup.select("body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper > div.feed-item > div.chart-content")
yearlist = soup.select("body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper > div.feed-item > div.chart-content > ul > li:nth-child(2)")

for i in range(num_iter):
    artist.append(songart[i].em.get_text())
    song.append(songart[i].cite.get_text())
    year.append(yearlist[i].get_text())    
    try:
        genre.append(genlist[i].ul.li.a.get_text())
    except:
        genre.append('Unknown')

    

In [7]:
top100 = pd.DataFrame({'artist':artist
                    ,'track':song
                    ,'genre':genre
                    ,'year':year})

In [8]:
top100['genre'].value_counts()


Pop              43
Country          22
New Release      13
Hip-Hop / Rap     8
Rock              3
Unknown           3
Dance             2
Latin             2
Heavy Metal       1
R&B / Soul        1
Soundtrack        1
Alternative       1
Name: genre, dtype: int64

In [9]:
top100

Unnamed: 0,artist,track,genre,year
0,Lizzo,About Damn Time,Pop,"Release Date: April 14, 2022"
1,Kate Bush,Running Up That Hill (A Deal with God),Pop,"Release Date: August 5, 1985"
2,P!nk,Irrelevant,New Release,Genre: Pop
3,Slipknot,The Dying Song (Time To Sing),New Release,Genre: Heavy Metal
4,Walker Hayes,Y'all Life,New Release,Genre: Country
...,...,...,...,...
95,Dua Lipa,Levitating (feat. DaBaby),Pop,"Release Date: October 1, 2020"
96,Journey & Steve Perry,Separate Ways (Worlds Apart) [Steve Perry & Br...,Rock,"Release Date: July 3, 2022"
97,Bad Bunny & Chencho Corleone,Me Porto Bonito,Latin,"Release Date: May 6, 2022"
98,Bruno Mars,When I Was Your Man,Pop,"Release Date: December 7, 2012"


In [10]:
def genresplit(value):
    if 'Genre:' in value:
        name = value.split('Genre: ')[1]
        
        return name
    else:
        return value

In [11]:
top100['genre'] = top100['genre'].apply(genresplit)
top100['year'] = top100['year'].apply(genresplit)

In [12]:
def tidy(value):

    type1 = value['genre']
    type2 = value['year']
    if str(type1) == 'New Release':
        return type2
    else:
        return type1

    
top100['genre'] = top100.apply(tidy,axis=1)

In [13]:
top100['year'] = top100['year'].str.split(", ", n = 1, expand = True)[1]
top100['year'] = top100['year'].fillna(2022)

In [14]:
top100

Unnamed: 0,artist,track,genre,year
0,Lizzo,About Damn Time,Pop,2022
1,Kate Bush,Running Up That Hill (A Deal with God),Pop,1985
2,P!nk,Irrelevant,Pop,2022
3,Slipknot,The Dying Song (Time To Sing),Heavy Metal,2022
4,Walker Hayes,Y'all Life,Country,2022
...,...,...,...,...
95,Dua Lipa,Levitating (feat. DaBaby),Pop,2020
96,Journey & Steve Perry,Separate Ways (Worlds Apart) [Steve Perry & Br...,Rock,2022
97,Bad Bunny & Chencho Corleone,Me Porto Bonito,Latin,2022
98,Bruno Mars,When I Was Your Man,Pop,2012


![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# 6.02 Lab | Web Scraping Multiple Pages

#### Business goal:

- Check the `case_study_gnod.md` file.
- Make sure you've understood the big picture of your project:

  - the goal of the company (`Gnod`),
  - their current product (`Gnoosic`),
  - their strategy, and
  - how your project fits into this context.

  Re-read the business case and the e-mail from the CTO, take a look at the flowchart and create an initial Trello board with the tasks you think you'll have to accomplish.

#### Instructions 

#### Prioritize the MVP

In the previous lab, you had to scrape data about "hot songs". It's critical to be on track with that part, as it was part of the request from the CTO.

If you couldn't finish the first lab, use this time to go back there.

#### Expand the project

If you're done, you can try to expand the project on your own. Here are a few suggestions:

- Find other lists of hot songs on the internet and scrape them too: having a bigger pool of songs will be awesome!
- Apply the same logic to other "groups" of songs: the best songs from a decade or from a country / culture / language / genre.
- Wikipedia maintains a large collection of lists of songs: https://en.wikipedia.org/wiki/Lists_of_songs

#### Practice web scraping

As you've seen, scraping the internet is a skill that can get you all sorts of information. Here are some little challenges that you can try to gain more experience in the field:

- Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: `url ='https://en.wikipedia.org/wiki/Python'`
- Find the number of titles that have changed in the United States Code since its last release point: `url = 'http://uscode.house.gov/download/download.shtml'`
- Create a Python list with the top ten FBI's Most Wanted names: `url = 'https://www.fbi.gov/wanted/topten'`
- Display the 20 latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC as a pandas dataframe: `url = 'https://www.emsc-csem.org/Earthquake/'`
- List all language names and number of related articles in the order they appear in [wikipedia.org](wikipedia.org): `url = 'https://www.wikipedia.org/'`
- A list with the different kind of datasets available in [data.gov.uk](data.gov.uk): `url = 'https://data.gov.uk/'`
- Display the top 10 languages by number of native speakers stored in a pandas dataframe: `url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'`

In [15]:
urls = "https://spinditty.com/playlists/100songs"

In [16]:
responses = requests.get(urls)
responses.status_code

200

In [17]:
sopa = BeautifulSoup(responses.content, "html.parser")

In [18]:
songartist2 = []
genre2 = []
years2 = []
num_iter = len("body > div.container > div:nth-child(4) > div.col-xs-12.col-md-8 > div.chart-wrapper > div.feed-item")

songarts = sopa.select("body > phoenix-page > div > div > div.m-page > section > article > div > div > section > div > div > h2")
genres = sopa.select("body > phoenix-page > div > div > div.m-page > section > article > div > div > section > div > div > p")
years = sopa.select("body > phoenix-page > div > div > div.m-page > section > article > div > div > section > div > div > h3")
for i in range(len(songarts)):
    songartist2.append(songarts[i].get_text())
    
for i in range(len(genres)):
    genre2.append(genres[i].get_text())
        
for i in range(len(years)):
    years2.append(years[i].get_text())


In [19]:
len(years2)

100

In [20]:
genrefin = []
for i in genre2:
    if 'Genre:' in i:
        genrefin.append(i)
    elif 'GENRE:' in i:
        genrefin.append(i)
        
songartfin = []        
for j in songartist2:
    if '#' in j:
        songartfin.append(j)
        
        
lap = []
for i in years2:
    if len(i) > 4:
        i = re.findall('\d{4}',i)
        lap.append(i)
    else:
        lap.append(i)
years2 = lap
        
    
    


In [21]:
top100s = pd.DataFrame({'go':songartfin})

In [22]:
top100s = top100s.drop_duplicates().reset_index()


In [23]:
top100s = pd.DataFrame({'go': top100s['go'], 'genre':genrefin, 'year':years2})


In [24]:
top100s

Unnamed: 0,go,genre,year
0,"#100 - ""Bitter Sweet Symphony"" by The Verve",Genre: Britpop,[1997]
1,"#99: ""For What It's Worth"" by Buffalo Springfield",Genre: Folk Rock,1967
2,"#98 ""Fire and Rain"" by James Taylor",Genre: Folk Rock,[1970]
3,"#97: ""Rolling in the Deep"" by Adele",Genre: Pop,[2011]
4,"#96 ""(We're Gonna) Rock Around the Clock"" by B...",Genre: Rock & Roll,1954
...,...,...,...
95,"#5: ""Stairway to Heaven""",Genre: Rock,[1971]
96,"#4: ""Imagine"" by John Lennon",Genre: Rock,[1971]
97,"#3: ""Like a Rolling Stone"" by Bob Dylan",Genre: Folk Rock,[1965]
98,"#2: ""Yesterday"" by The Beatles",Genre: British Invasion,[1965]


In [25]:
top100s['track'] = top100s['go'].str.split("by", n = 1, expand = True)[0]
top100s['artist'] = top100s['go'].str.split("by", n = 1, expand = True)[1]



In [26]:
top100s['artist'].value_counts()

 The Beatles                   6
 Elvis Presley                 4
 The Beach Boys                3
 The Rolling Stones            3
 The Who                       2
                              ..
 Black Sabbath                 1
 R.E.M.                        1
 Martha and the Vandellas      1
 Bob Marley and The Wailers    1
 John Lennon                   1
Name: artist, Length: 78, dtype: int64

In [27]:
top100s = top100s[['track','artist','genre','year']]

In [28]:
top100s

Unnamed: 0,track,artist,genre,year
0,"#100 - ""Bitter Sweet Symphony""",The Verve,Genre: Britpop,[1997]
1,"#99: ""For What It's Worth""",Buffalo Springfield,Genre: Folk Rock,1967
2,"#98 ""Fire and Rain""",James Taylor,Genre: Folk Rock,[1970]
3,"#97: ""Rolling in the Deep""",Adele,Genre: Pop,[2011]
4,"#96 ""(We're Gonna) Rock Around the Clock""",Bill Haley and His Comets,Genre: Rock & Roll,1954
...,...,...,...,...
95,"#5: ""Stairway to Heaven""",,Genre: Rock,[1971]
96,"#4: ""Imagine""",John Lennon,Genre: Rock,[1971]
97,"#3: ""Like a Rolling Stone""",Bob Dylan,Genre: Folk Rock,[1965]
98,"#2: ""Yesterday""",The Beatles,Genre: British Invasion,[1965]


In [29]:
top100s['track'] = top100s['track'].str.split('"', n = 1, expand = True)[1]
top100s['genre'] = top100s['genre'].str.split(': ', n = 1, expand = True)[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top100s['track'] = top100s['track'].str.split('"', n = 1, expand = True)[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top100s['genre'] = top100s['genre'].str.split(': ', n = 1, expand = True)[1]


In [30]:
#top100s['year'].str.split("(", n = 1, expand = True)[1]

In [31]:
top100s

Unnamed: 0,track,artist,genre,year
0,"Bitter Sweet Symphony""",The Verve,Britpop,[1997]
1,"For What It's Worth""",Buffalo Springfield,Folk Rock,1967
2,"Fire and Rain""",James Taylor,Folk Rock,[1970]
3,"Rolling in the Deep""",Adele,Pop,[2011]
4,"(We're Gonna) Rock Around the Clock""",Bill Haley and His Comets,Rock & Roll,1954
...,...,...,...,...
95,"Stairway to Heaven""",,Rock,[1971]
96,"Imagine""",John Lennon,Rock,[1971]
97,"Like a Rolling Stone""",Bob Dylan,Folk Rock,[1965]
98,"Yesterday""",The Beatles,British Invasion,[1965]


In [32]:
#for i in top100s['year']:
#         int(i)

In [33]:
top100s['track'] = top100s['track'].str.replace('"','')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top100s['track'] = top100s['track'].str.replace('"','')


In [34]:
top100s

Unnamed: 0,track,artist,genre,year
0,Bitter Sweet Symphony,The Verve,Britpop,[1997]
1,For What It's Worth,Buffalo Springfield,Folk Rock,1967
2,Fire and Rain,James Taylor,Folk Rock,[1970]
3,Rolling in the Deep,Adele,Pop,[2011]
4,(We're Gonna) Rock Around the Clock,Bill Haley and His Comets,Rock & Roll,1954
...,...,...,...,...
95,Stairway to Heaven,,Rock,[1971]
96,Imagine,John Lennon,Rock,[1971]
97,Like a Rolling Stone,Bob Dylan,Folk Rock,[1965]
98,Yesterday,The Beatles,British Invasion,[1965]


In [35]:
yearconvert = []

#if it's a list:
for i in top100s['year']:
    if isinstance(i,list):
        yearconvert.append(i[0])
    else:
        yearconvert.append(i)

top100s['year'] = yearconvert

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top100s['year'] = yearconvert


In [36]:
top100s
        

Unnamed: 0,track,artist,genre,year
0,Bitter Sweet Symphony,The Verve,Britpop,1997
1,For What It's Worth,Buffalo Springfield,Folk Rock,1967
2,Fire and Rain,James Taylor,Folk Rock,1970
3,Rolling in the Deep,Adele,Pop,2011
4,(We're Gonna) Rock Around the Clock,Bill Haley and His Comets,Rock & Roll,1954
...,...,...,...,...
95,Stairway to Heaven,,Rock,1971
96,Imagine,John Lennon,Rock,1971
97,Like a Rolling Stone,Bob Dylan,Folk Rock,1965
98,Yesterday,The Beatles,British Invasion,1965


In [37]:
top100s = top100s[['track','artist','genre','year']]
top100 = top100[['track','artist','genre','year']]
top = pd.concat([top100,top100s], axis = 0).reset_index()
top = top.drop(columns='index')

In [38]:
top

Unnamed: 0,track,artist,genre,year
0,About Damn Time,Lizzo,Pop,2022
1,Running Up That Hill (A Deal with God),Kate Bush,Pop,1985
2,Irrelevant,P!nk,Pop,2022
3,The Dying Song (Time To Sing),Slipknot,Heavy Metal,2022
4,Y'all Life,Walker Hayes,Country,2022
...,...,...,...,...
195,Stairway to Heaven,,Rock,1971
196,Imagine,John Lennon,Rock,1971
197,Like a Rolling Stone,Bob Dylan,Folk Rock,1965
198,Yesterday,The Beatles,British Invasion,1965


# Lab 6.03 

In [39]:
import random
def recommend():
    song= input('input a song ')
    if song in list(top['track']):
        print(top['track'].iloc[random.randint(0,200)])
    elif song not in list(top['track']):
        print("No recommendation")
recommend()

input a song Names
Get Up, Stand UP 


# Lab 6.04

In [47]:
secrets_file = open("secrets.txt","r")

In [None]:
string = secrets_file.read()
string.split('\n')

In [49]:
secrets_dict={}
for line in string.split('\n'):
    if len(line) > 0:
        secrets_dict[line.split(':')[0]]=line.split(':')[1]

In [50]:

#InitializeSpotiPy with user credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=secrets_dict['cid'],
                                                           client_secret=secrets_dict['csecret']))

In [51]:
def get_playlist_tracks(playlist_id):
    results = sp.user_playlist_tracks("spotify",playlist_id)
    tracks = results['items']
    while results['next']!=None:
        results = sp.next(results)
        tracks = tracks + results['items']
        sleep(randint(1,3))
    return tracks

In [52]:
playlist1 = get_playlist_tracks("6tIxyT1Gq6O7DK7rIEUEZo")
playlist2 = get_playlist_tracks("4pbDDX7np7Q1H0ghL7U9o7")


In [54]:
print('no. 1    ',len(playlist1))
print('no. 2    ',len(playlist2))
print('total is',len(playlist1)+len(playlist2))

no. 1     9999
no. 2     3878
total is 13877
