In [None]:
import requests

# Query the Open Notify API for the people currently in space.
# The timeout keeps the cell from hanging forever if the service is unreachable,
# and raise_for_status() surfaces an HTTP error right away instead of failing
# later with a confusing JSON-decoding error or KeyError.
astros_reply = requests.get("http://api.open-notify.org/astros.json", timeout=10)
astros_reply.raise_for_status()
response = astros_reply.json()

# Print one line per astronaut together with the craft they are aboard.
for astronaut in response["people"]:
    print(astronaut["name"], "is in", astronaut["craft"])

# Data collection

The goal of this presentation is to automate data collection.
We need historical data to build our model and predict electricity consumption in Paris for the next day (D+1).   
Basically we need to collect as much interesting data as possible, starting from nothing.

### Expected output

Datasets saved on our computer in file formats that Python can read easily: CSV, JSON, XML, Excel.

### Quiz (5 minutes): What kind of data might be useful for making a prediction?

### Workshop (15 minutes) : Try to collect the data


### Easy : Historical electrical consumption

[Electricity consumption in Île-de-France between 2013 and 2017](https://rte-opendata.opendatasoft.com/explore/dataset/eco2mix_regional_cons_def/export/?disjunctive.libelle_region&disjunctive.nature&sort=-date_heure&refine.libelle_region=Ile-de-France)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the semicolon-separated consumption export, parse the "Date - Heure"
# column as datetimes, and use that column as the index.
consumption = pd.read_csv(
    "./data/eco2mix_regional_cons_def.csv",
    sep=";",
    parse_dates=["Date - Heure"],
).set_index("Date - Heure")

In [None]:
# Order the rows by their index, then preview the first three.
consumption = consumption.sort_index()
consumption.head(n=3)

In [None]:
# Count the non-null values of every column for each calendar day.
resample = consumption.resample("D").count()

# Show only the days with fewer than 48 values in "Code INSEE région"
# (presumably 48 = one reading every 30 minutes — TODO confirm the sampling rate).
resample.loc[resample["Code INSEE région"].lt(48)]

### Less easy : Historical weather

Not enough data : [Prévision Météo - Paris - AROME](https://public.opendatasoft.com/explore/dataset/arome-0025-sp1_sp2_paris/export/)  
Let's pay for some data! [OpenWeatherMap API](https://openweathermap.org/history-bulk) ($10 for 5 years of weather in Paris: a bargain!)


In [None]:
# Load the weather export, convert the "dt" column from epoch seconds to
# datetimes, and use it as the index.
weather = (
    pd.read_csv("./data/meteo-paris.csv")
    .assign(dt=lambda frame: pd.to_datetime(frame["dt"], unit="s"))
    .set_index("dt")
)

In [None]:
# Preview the first five rows of the weather data.
weather.head(n=5)

In [None]:
# Print the earliest and latest index values of the weather data, one per line.
print(weather.index.min(), weather.index.max(), sep="\n")

In [None]:
# Count the non-null values of every column for each calendar day.
resample = weather.resample('D').count()

# Spot-check ten random days. The fixed seed keeps the displayed rows identical
# across "Restart & Run All", so the rendered output is reproducible.
resample.sample(10, random_state=42)

# Days off in France

No dataset is readily available, so we are going to scrape the web:
https://www.calendrier-365.fr

In [None]:
import re
import requests
from datetime import datetime
from bs4 import BeautifulSoup

# Collect public-holiday dates as "YYYY-MM-DD" strings, one web page per year.
days_off = []
for year in range(2012, 2020):  # 2012 through 2019 inclusive
    url = 'https://www.calendrier-365.fr/jours-feries/{}.html'.format(year)
    # The timeout avoids hanging forever on an unresponsive server, and
    # raise_for_status() stops on an HTTP error instead of silently parsing an
    # error page that would contribute zero holidays for that year.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    # Every <td class="dtr tar"> cell exposes its date through the integer
    # "data-value" attribute, which is read here as a Unix timestamp.
    for x in soup.find_all("td", {"class": "dtr tar"}):
        # NOTE(review): fromtimestamp() converts using the machine's local
        # timezone, so the resulting calendar day may depend on where the
        # notebook runs — confirm which timezone the site's timestamps use.
        date = datetime.fromtimestamp(int(x.attrs["data-value"]))
        days_off.append(date.strftime("%Y-%m-%d"))

In [None]:
# Display the scraped holiday dates (strings formatted as "%Y-%m-%d").
days_off

In [None]:
def is_day_off(date, holidays=None):
    """
    Tell whether a given day is a day off (public holiday) in France.

    Parameters
    ----------
    date : object with a ``strftime`` method
        For example a ``datetime.date`` or ``datetime.datetime``.
    holidays : collection of "YYYY-MM-DD" strings, optional
        Dates to check against. When omitted, the module-level ``days_off``
        list built by the scraping cell is used.

    Returns
    -------
    bool
        True if the date's "YYYY-MM-DD" form is in the holiday collection.

    Notes
    -----
    With the default ``days_off`` list the answer is only meaningful for the
    years that were scraped, i.e. 2012 to 2019 (``range(2012, 2020)``).
    Dates outside that range always return False.
    """
    # Compare against None explicitly so that an empty collection passed by the
    # caller is honoured rather than replaced by the global list.
    if holidays is None:
        holidays = days_off
    return date.strftime("%Y-%m-%d") in holidays

In [None]:
import datetime

today = datetime.datetime.today()

# weekday() numbers Monday as 0, so Saturday is 5. The modulo gives the number
# of days until the coming Saturday; "or 7" jumps a full week ahead when today
# is already a Saturday. (A fixed "+2 days" is only right on a Thursday.)
days_until_saturday = (5 - today.weekday()) % 7 or 7
next_saturday = today + datetime.timedelta(days=days_until_saturday)
christmas = datetime.datetime(2018, 12, 25)
easter = datetime.datetime(2015, 4, 5)

# Check each sample date against the scraped holiday list.
print(is_day_off(next_saturday))
print(is_day_off(today))
print(is_day_off(christmas))
print(is_day_off(easter))

# Strikes in Paris
Copyright to William Revah

In [None]:
from bs4 import BeautifulSoup
import requests
import datetime

# Collect the "datetime" attribute of the <time> element found in each table
# row of the Wikipedia page.
strikes = []

url = "https://fr.wikipedia.org/wiki/Liste_des_manifestations_les_plus_importantes_en_France"
# The timeout avoids hanging on an unresponsive server, and raise_for_status()
# stops on an HTTP error instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "lxml")
for table in soup.find_all("table"):
    for x in table.find_all("tr"):
        # Search inside the row only. find_next() walks forward through the
        # rest of the document, so a row without its own <time> (e.g. a header
        # row) would borrow the date of a later row, and a row after the last
        # <time> would yield None and crash on ".attrs".
        date = x.find("time")
        if date is None or "datetime" not in date.attrs:
            continue
        strikes.append(date.attrs["datetime"])

print(strikes)
