In [2]:
import requests
import csv
from bs4 import BeautifulSoup
import pandas as pd
import re
import urllib

In [3]:
class IronhackSpider:
    """
    This is the constructor class to which you can pass a bunch of parameters. 
    These parameters are stored to the class instance variables so that the
    class functions can access them later.
    
    url_pattern: the regex pattern of the web urls to scape
    pages_to_scrape: how many pages to scrape
    sleep_interval: the time interval in seconds to delay between requests. If <0, requests will not be delayed.
    content_parser: a function reference that will extract the intended info from the scraped content.
    """
    def __init__(self, url_pattern, pages_to_scrape=10, sleep_interval=-1, content_parser=None):
        self.url_pattern = url_pattern
        self.pages_to_scrape = pages_to_scrape
        self.sleep_interval = sleep_interval
        self.content_parser = content_parser
    
    """
    Scrape the content of a single url.
    """
    def scrape_url(self, url):
        try:
            response = requests.get(url, timeout=10)
            content = self.get_response_content(response)
            if not self.content_parser is None:
                result = self.content_parser(content)
            else:
                result = content
        except:
            result = None
        self.output_results(result)
        
    def get_response_content(self, r):
        if (r.status_code == 200):
            return r.content
        return False
    
    """
    Export the scraped content. Right now it simply print out the results.
    But in the future you can export the results into a text file or database.
    """
    def output_results(self, r):
        for v in r:
            with open('rank.csv','a',newline='', encoding='utf-8') as fichiercsv:
                writer=csv.writer(fichiercsv)
                writer.writerow(v)
#         print(r)
    
    """
    After the class is instantiated, call this function to start the scraping jobs.
    This function uses a FOR loop to call `scrape_url()` for each url to scrape.
    """
    def kickstart(self):
        global weeks
        for i in range(1, self.pages_to_scrape+1):
            try:
                if i < 10:
                        self.scrape_url(self.url_pattern[0] % i)
                else:
                        self.scrape_url(self.url_pattern[1] % i)
            except:
                weeks += 1
                pass
            if self.sleep_interval > 0:
                time.sleep(self.sleep_interval)


URL_PATTERN = ['http://www.chartsinfrance.net/charts/200%s/singles.php', 'http://www.chartsinfrance.net/charts/20%s/singles.php'] # regex pattern for the urls to scrape
PAGES_TO_SCRAPE = 52 # how many webpages to scrape
weeks = 1

"""
This is a custom parser function you will complete in the challenge.
Right now it simply returns the string passed to it. But in this lab
you will complete this function so that it extracts the quotes.
This function will be passed to the IronhackSpider class.
"""
def quotes_parser(content):
    soup = BeautifulSoup(content, "html")
    songs = [[elements.find_all('a')[0].text, elements.find_all('a')[1].text] for elements in soup.find_all('div',{'class':'b572'})]
    global weeks
    results = [[str(weeks), str(i + 1), songs[i][0], songs[i][1]] for i in range(50)]    
    weeks += 1
    return results

# Instantiate the IronhackSpider class
my_spider = IronhackSpider(URL_PATTERN, PAGES_TO_SCRAPE, content_parser=quotes_parser)

# Start scraping jobs

entete = ['Week', 'Top', 'Artist', 'Track']
f = open('rank.csv', 'w')
ligneEntete = ",".join(entete) + "\n"
f.write(ligneEntete)
f.close()

my_spider.kickstart()

In [4]:
top_fd = pd.read_csv('rank.csv')

In [5]:
temp_df = pd.read_csv('temperature-quotidienne-departementale.csv', sep=';')

In [6]:
temp_df['date_obs'] = pd.to_datetime(temp_df['date_obs'])

In [7]:
temp_df2 = temp_df[(temp_df['date_obs'] >= '27-12-2019') & (temp_df['date_obs'] <= '17-12-2020')]

In [8]:
temp_df2.sort_values(by='date_obs', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df2.sort_values(by='date_obs', inplace=True)


In [9]:
mean_temp_week = temp_df2.groupby(temp_df2.date_obs.dt.strftime('%W')).tmoy.mean()

In [10]:
mtw_df = pd.DataFrame([[i + 1, mean_temp_week[i]] for i in range(len(mean_temp_week))], columns = ['Week', 'Mean Temperature'])

In [None]:
top_tracks = pd.read_csv('rank.csv')
tracks = [tracks for tracks in top_tracks['Track']]

In [12]:
f = open('spotify_token.txt','r')
spotify_token = f.read()

In [13]:
def search_ids(track):
    track = urllib.parse.quote(track)
    try:
        search_track_json = requests.get(f'https://api.spotify.com/v1/search?q={track}&type=track&market=FR&limit=1', 
                                headers={'Accept': 'application/json', 
                                         "Content-Type": "application/json",
                                        'Authorization': f'Bearer {spotify_token}'}).json()
        return search_track_json['tracks']['items'][0]['id']
    except:
        pass

In [14]:
def find_features(ids):
    try:
        tracks_features_json = requests.get(f'https://api.spotify.com/v1/audio-features?ids={ids}', 
                                headers={'Accept': 'application/json', 
                                         "Content-Type": "application/json",
                                        'Authorization': f'Bearer {spotify_token}'}).json()
        return tracks_features_json['audio_features'][0]['valence']
    except:
        pass

In [15]:
track_id_val = [[track, search_ids(track), find_features(search_ids(track))] for track in set(tracks)]

In [16]:
tiv_df = pd.DataFrame(track_id_val, columns = ['Track', 'Id', 'Valence'])

In [22]:
tiv_df.head(10)

Unnamed: 0,Track,Id,Valence
0,Horizons Into Battlegrounds,1SSoLdnoptUGfs5kwgVQms,0.0993
1,The Other Side,2cKovq3l6OJjhVVDbVKOsr,0.31
2,Fever (avec Angèle),,
3,Tu es toujours là,5iuKYic7hUbMrOr27nzRuX,0.123
4,Les paradis perdus,6iZiygCxjFEhV8VWAP6GZy,0.191
5,Ça Pleure Aussi Un Homme,5a548EzcJXWye2MVBUbzNl,0.0751
6,Already,6tgEc2O1uFHcZDKPoo6PC8,0.548
7,Only The Young,2slqvGLwzZZYsT4K4Y1GBC,0.602
8,Sweet Night,39EXZNMxb4RBHlRjnRaOKp,0.149
9,Thirteen Thirtyfive,202DfCIz0fZV2SVhUNnIi4,0.36


In [18]:
mtw_df.head(3)

Unnamed: 0,Week,Mean Temperature
0,1,5.866167
1,2,7.097232
2,3,7.329673


In [19]:
top_fd.head(3)

Unnamed: 0,Week,Top,Artist,Track
0,1,1,The Weeknd,Blinding Lights
1,1,2,Tones and I,Dance Monkey
2,1,3,Angèle,Oui ou non
