In [1]:
import multiprocessing as mp
import os
import time
from concurrent.futures import ThreadPoolExecutor
from io import StringIO, BytesIO
from urllib.parse import quote
from zipfile import ZipFile

import pandas as pd
import requests

<br>

# Web scraping for electricity consumption

In [2]:
# Folder that receives the downloaded eco2mix files (one sub-folder per region).
# Set the TFM_ELECTRICITY_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used.
download_location = os.environ.get(
    "TFM_ELECTRICITY_DIR",
    "C:/Users/marta/OneDrive/Documents/Máster/apuntes/TFM/electricity",
)

# Region names as they are inserted into the download URLs and used as folder names.
regions = ["Auvergne-Rhône-Alpes", 
            "Bourgogne-Franche-Comté", 
            "Bretagne", 
            "Centre-Val-de-Loire", 
            "Grand-Est", 
            "Hauts-de-France", 
            "Ile-de-France", 
            "Normandie", 
            "Nouvelle-Aquitaine", 
            "Occitanie", 
            "PACA", 
            "Pays-de-la-Loire"]

# range() excludes its end value, so this covers 2013 through 2020.
years = range(2013, 2021)

# Columns kept from every yearly file, in the order they appear in the final table.
cols = ['Périmètre', 'Nature', 'Date', 'Heures', 'Consommation', 'Thermique', 'Nucléaire', 'Eolien', 'Solaire', 'Hydraulique', 'Pompage', 'Bioénergies', 'Ech. physiques']

<br>

## Download files

In [4]:
# Seconds to wait for the server on each request; without a timeout a stalled
# connection would block the whole loop indefinitely.
REQUEST_TIMEOUT = 60

# One cleaned DataFrame per successfully downloaded (region, year) file.
dfs = []

for region in regions:
    region_path = os.path.join(download_location, region)
    os.makedirs(region_path, exist_ok=True)

    for year in years:
        # quote() percent-encodes the accented characters of a region name
        # (e.g. "ô" -> "%C3%B4"), so no region needs a hand-written URL.
        link = f"https://eco2mix.rte-france.com/download/eco2mix/eCO2mix_RTE_{quote(region)}_Annuel-Definitif_{year}.zip"

        print(f'\nDownloading {link}')

        try:
            response = requests.get(link, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            # A network failure on one file should not abort the remaining downloads.
            print(f'Error downloading file: {error}')
            continue

        if response.status_code != 200:
            print(f'Error downloading file. Status code: {response.status_code}')
            continue
        print('Download successful.')

        # The '.xls' archive member is read below as tab-separated ISO-8859-1 text,
        # which is why it is stored under a .csv name.
        file_path = os.path.join(region_path, f'{region}_{year}.csv')

        # The context manager closes the in-memory archive once its members are read.
        with ZipFile(BytesIO(response.content)) as zip_file:
            for file_name in zip_file.namelist():
                if not file_name.endswith('.xls'):
                    continue

                print(f'Extracting and saving {file_name}')
                with open(file_path, 'wb') as csv_file:
                    csv_file.write(zip_file.read(file_name))
                print(f'{file_name} saved successfully.')

                print(f'Reading {file_path}')
                try:
                    df = pd.read_csv(file_path, encoding='ISO-8859-1', delimiter='\t', na_values=['-', 'ND'], usecols=range(len(cols)+1), header=0)
                    # Rows without a consumption value are dropped.
                    df = df.dropna(subset=['Consommation'])
                    df = df.loc[:, cols]
                except UnicodeDecodeError:
                    print(f'Decoding error reading {file_path}')
                    continue

                dfs.append(df)


Downloading https://eco2mix.rte-france.com/download/eco2mix/eCO2mix_RTE_Auvergne-Rh%C3%B4ne-Alpes_Annuel-Definitif_2013.zip
Download successful.
Extracting and saving eCO2mix_RTE_Auvergne-Rhône-Alpes_Annuel-Definitif_2013.xls
eCO2mix_RTE_Auvergne-Rhône-Alpes_Annuel-Definitif_2013.xls saved successfully.
Reading C:/Users/marta/OneDrive/Documents/Máster/apuntes/TFM/electricity\Auvergne-Rhône-Alpes\Auvergne-Rhône-Alpes_2013.csv

Downloading https://eco2mix.rte-france.com/download/eco2mix/eCO2mix_RTE_Auvergne-Rh%C3%B4ne-Alpes_Annuel-Definitif_2014.zip
Download successful.
Extracting and saving eCO2mix_RTE_Auvergne-Rhône-Alpes_Annuel-Definitif_2014.xls
eCO2mix_RTE_Auvergne-Rhône-Alpes_Annuel-Definitif_2014.xls saved successfully.
Reading C:/Users/marta/OneDrive/Documents/Máster/apuntes/TFM/electricity\Auvergne-Rhône-Alpes\Auvergne-Rhône-Alpes_2014.csv

Downloading https://eco2mix.rte-france.com/download/eco2mix/eCO2mix_RTE_Auvergne-Rh%C3%B4ne-Alpes_Annuel-Definitif_2015.zip
Download succes

<br>

In [5]:
# Stack every per-file frame into one table; ignore_index gives the result a fresh 0..n-1 index.
final_df = pd.concat(objs=dfs, axis=0, ignore_index=True)
final_df.head(n=5)

Unnamed: 0,Périmètre,Nature,Date,Heures,Consommation,Thermique,Nucléaire,Eolien,Solaire,Hydraulique,Pompage,Bioénergies,Ech. physiques
0,Auvergne-Rhône-Alpes,Données définitives,2013-01-01,00:30,8173.0,252.0,12069.0,257.0,0.0,3079.0,-4.0,68.0,-7551.0
1,Auvergne-Rhône-Alpes,Données définitives,2013-01-01,01:00,7944.0,252.0,11895.0,257.0,0.0,2828.0,-328.0,67.0,-7029.0
2,Auvergne-Rhône-Alpes,Données définitives,2013-01-01,01:30,7896.0,252.0,12228.0,247.0,0.0,2710.0,-327.0,65.0,-7282.0
3,Auvergne-Rhône-Alpes,Données définitives,2013-01-01,02:00,7882.0,251.0,12274.0,244.0,0.0,2516.0,-328.0,66.0,-7144.0
4,Auvergne-Rhône-Alpes,Données définitives,2013-01-01,02:30,7909.0,251.0,12228.0,245.0,0.0,2440.0,-406.0,67.0,-6920.0


<br>

In [6]:
# Join the date and hour text columns and parse the result into a single 'time' column.
timestamp_text = final_df['Date'] + ' ' + final_df['Heures']
final_df['time'] = pd.to_datetime(timestamp_text)
# The two source columns are redundant once 'time' exists.
final_df = final_df.drop(['Date', 'Heures'], axis=1)

final_df.head(n=5)

Unnamed: 0,Périmètre,Nature,Consommation,Thermique,Nucléaire,Eolien,Solaire,Hydraulique,Pompage,Bioénergies,Ech. physiques,time
0,Auvergne-Rhône-Alpes,Données définitives,8173.0,252.0,12069.0,257.0,0.0,3079.0,-4.0,68.0,-7551.0,2013-01-01 00:30:00
1,Auvergne-Rhône-Alpes,Données définitives,7944.0,252.0,11895.0,257.0,0.0,2828.0,-328.0,67.0,-7029.0,2013-01-01 01:00:00
2,Auvergne-Rhône-Alpes,Données définitives,7896.0,252.0,12228.0,247.0,0.0,2710.0,-327.0,65.0,-7282.0,2013-01-01 01:30:00
3,Auvergne-Rhône-Alpes,Données définitives,7882.0,251.0,12274.0,244.0,0.0,2516.0,-328.0,66.0,-7144.0,2013-01-01 02:00:00
4,Auvergne-Rhône-Alpes,Données définitives,7909.0,251.0,12228.0,245.0,0.0,2440.0,-406.0,67.0,-6920.0,2013-01-01 02:30:00


<br>

## Missing value: 01/01/2013  0:00

To fill this gap, we copy each region's record from 01/01/2013 0:30 and assign it the timestamp 01/01/2013 0:00.

In [7]:
# The table has no 00:00 record for 2013-01-01, so the 00:30 record of each
# region is copied into that slot.
first_available = pd.Timestamp('2013-01-01 00:30:00')
missing_slot = pd.Timestamp('2013-01-01 00:00:00')

data_00_30 = final_df[final_df['time'] == first_available]

# Only fill regions that do not have a 00:00 row yet; without this guard every
# re-run of the cell would append another copy of the same rows.
regions_already_filled = final_df.loc[final_df['time'] == missing_slot, 'Périmètre']
data_00_00 = data_00_30[~data_00_30['Périmètre'].isin(regions_already_filled)].copy()
data_00_00['time'] = missing_slot

final_df = pd.concat([final_df, data_00_00])
final_df = final_df.sort_values(by=['Périmètre', 'time'])

final_df.head(5)

Unnamed: 0,Périmètre,Nature,Consommation,Thermique,Nucléaire,Eolien,Solaire,Hydraulique,Pompage,Bioénergies,Ech. physiques,time
0,Auvergne-Rhône-Alpes,Données définitives,8173.0,252.0,12069.0,257.0,0.0,3079.0,-4.0,68.0,-7551.0,2013-01-01 00:00:00
0,Auvergne-Rhône-Alpes,Données définitives,8173.0,252.0,12069.0,257.0,0.0,3079.0,-4.0,68.0,-7551.0,2013-01-01 00:30:00
1,Auvergne-Rhône-Alpes,Données définitives,7944.0,252.0,11895.0,257.0,0.0,2828.0,-328.0,67.0,-7029.0,2013-01-01 01:00:00
2,Auvergne-Rhône-Alpes,Données définitives,7896.0,252.0,12228.0,247.0,0.0,2710.0,-327.0,65.0,-7282.0,2013-01-01 01:30:00
3,Auvergne-Rhône-Alpes,Données définitives,7882.0,251.0,12274.0,244.0,0.0,2516.0,-328.0,66.0,-7144.0,2013-01-01 02:00:00


<br>

In [8]:
# Persist the combined table; index=False leaves the DataFrame index out of the file.
output_file = os.path.join(download_location, "eco2mix_data.csv")
final_df.to_csv(path_or_buf=output_file, index=False, encoding='utf-8-sig')
print("Final DataFrame saved to eco2mix_data.csv")

Final DataFrame saved to eco2mix_data.csv


<br>

Now, we have:

In [9]:
# Reload the saved file and count the rows per region as a sanity check.
eco2mix_path = os.path.join(download_location, "eco2mix_data.csv")
df = pd.read_csv(eco2mix_path)
df['Périmètre'].value_counts()

Auvergne-Rhône-Alpes       140256
Bourgogne-Franche-Comté    140256
Bretagne                   140256
Centre-Val de Loire        140256
Grand-Est                  140256
Hauts-de-France            140256
Ile-de-France              140256
Normandie                  140256
Nouvelle-Aquitaine         140256
Occitanie                  140256
PACA                       140256
Pays-de-la-Loire           140256
Name: Périmètre, dtype: int64

<br>

## Check for clock changes (daylight saving time)

In [24]:
# Load one raw yearly file to inspect its timestamps.
# The path is built in its own variables: reassigning the global `download_location`
# here would make the save/read cells above target the region sub-folder when re-run.
electricity_dir = os.environ.get(
    "TFM_ELECTRICITY_DIR",
    "C:/Users/marta/OneDrive/Documents/Máster/apuntes/TFM/electricity",
)
sample_path = os.path.join(electricity_dir, "Auvergne-Rhône-Alpes", "Auvergne-Rhône-Alpes_2013.csv")

df = pd.read_csv(sample_path, encoding='ISO-8859-1', delimiter='\t', na_values=['-', 'ND'])
df = df.dropna(subset=['Consommation'])
df = df.loc[:, cols]

# Join the date and hour text columns and parse them into one timestamp column.
df['time'] = df['Date'] + ' ' + df['Heures']
df['time'] = pd.to_datetime(df['time'])

df.head(5)

Unnamed: 0,Périmètre,Nature,Date,Heures,Consommation,Thermique,Nucléaire,Eolien,Solaire,Hydraulique,Pompage,Bioénergies,Ech. physiques,time
2,Auvergne-Rhône-Alpes,Données définitives,2013-01-01,00:30,8173.0,252.0,12069.0,257.0,0.0,3079.0,-4.0,68.0,-7551.0,2013-01-01 00:30:00
4,Auvergne-Rhône-Alpes,Données définitives,2013-01-01,01:00,7944.0,252.0,11895.0,257.0,0.0,2828.0,-328.0,67.0,-7029.0,2013-01-01 01:00:00
6,Auvergne-Rhône-Alpes,Données définitives,2013-01-01,01:30,7896.0,252.0,12228.0,247.0,0.0,2710.0,-327.0,65.0,-7282.0,2013-01-01 01:30:00
8,Auvergne-Rhône-Alpes,Données définitives,2013-01-01,02:00,7882.0,251.0,12274.0,244.0,0.0,2516.0,-328.0,66.0,-7144.0,2013-01-01 02:00:00
10,Auvergne-Rhône-Alpes,Données définitives,2013-01-01,02:30,7909.0,251.0,12228.0,245.0,0.0,2440.0,-406.0,67.0,-6920.0,2013-01-01 02:30:00


<br>

In [25]:
# keep=False marks every row of a repeated timestamp, not only the later occurrences.
repeated_time = df.duplicated(subset=['time'], keep=False)
duplicates = df[repeated_time]
if len(duplicates) == 0:
    print("No duplicates were found.")
else:
    print("Duplicate times were found:")
    print(duplicates)

No duplicates were found.


<br>

In [26]:
# Difference between each timestamp and the one on the previous row (the first row yields NaT).
time_diff = df['time'].diff()

# True when at least one pair of adjacent rows is more than one hour apart.
gap_found = (time_diff > pd.Timedelta(hours=1)).any()

if gap_found:
    print("A difference of more than one hour between adjacent rows was found.")
else:
    print("No differences of more than one hour between adjacent rows were found.")

No differences of more than one hour between adjacent rows were found.


<br>

# Web scraping for weather data

In [21]:
# Folder that receives one weather CSV per city.
# Set the TFM_WEATHER_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used.
download_location = os.environ.get(
    "TFM_WEATHER_DIR",
    "C:/Users/marta/OneDrive/Documents/Máster/apuntes/TFM/weather",
)

# Region -> list of (city name, latitude, longitude) tuples, ten cities per region.
# The city name doubles as the name of the CSV file written for that city.
# NOTE(review): some entries look inconsistent, e.g. "Mont-de-Marsan" and "Rodez"
# share the coordinates (44.35, 0.35), and a few longitudes may have the wrong
# sign — verify the coordinates against a gazetteer before relying on them.
coords = {
    "Auvergne-Rhône-Alpes": [
        ("Annecy", 46.19, 6.15),
        ("Chambéry", 45.57, 6.92),
        ("Lyon", 45.75, 4.85),
        ("Grenoble", 45.17, 5.72),
        ("Clermont-Ferrand", 45.5, 3.88),
        ("Valence", 44.83, 4.75),
        ("Saint-Étienne", 45.22, 4.35),
        ("Roanne", 46.03, 4.07),
        ("Saint-Chamond", 45.03, 4.88),
        ("Thonon-les-Bains", 46.2, 6.08)
    ],
    "Bourgogne-Franche-Comté": [
        ("Dijon", 47.16, 4.84),
        ("Besançon", 46.78, 5.24),
        ("Nevers", 47.02, 3.34),
        ("Lons-le-Saunier", 46.55, 5.32),
        ("Mâcon", 46.21, 4.8),
        ("Auxerre", 47.3, 5.04),
        ("Pontarlier", 46.82, 6.13),
        ("Chalon-sur-Saône", 47.48, 4.08),
        ("Bourg-en-Bresse", 46.17, 4.83),
        ("Montbéliard", 47.38, 6.13)
    ],
    "Bretagne": [
        ("Rennes", 48.11, -1.68),
        ("Vannes", 47.62, -3.35),
        ("Brest", 48.41, -4.48),
        ("Lorient", 48.38, -2.38),
        ("Quimper", 48.07, -3.44),
        ("Saint-Brieuc", 48.58, -2.75),
        ("Morlaix", 47.93, -4.14),
        ("Redon", 47.56, -2.79),
        ("Dinan", 48.72, -1.55),
        ("Pontivy", 47.23, -3.48)
    ],
    "Centre-Val-de-Loire": [
        ("Orléans", 47.59, 1.68),
        ("Tours", 47.32, 1.92),
        ("Blois", 47.18, 2.12),
        ("Chartres", 48.08, 2.41),
        ("Bourges", 47.81, 3.72),
        ("Châteauroux", 46.84, 2.13),
        ("Vendôme", 47.02, 0.63),
        ("La Flèche", 47.44, 0.62),
        ("Amboise", 47.41, 0.98),
        ("Gien", 47.61, 3.96)
    ],
    "Grand-Est": [
        ("Estrasburgo", 48.71, 7.75),
        ("Metz", 49.61, 5.31),
        ("Nancy", 48.58, 6.18),
        ("Colmar", 48.23, 7.35),
        ("Thionville", 49.18, 6.13),
        ("Bar-le-Duc", 48.97, 5.95),
        ("Verdun", 48.07, 5.55),
        ("Saverne", 48.48, 7.57),
        ("Sarreguemines", 49.33, 7.4),
        ("Sélestat", 48.13, 7.23)
    ],
    "Hauts-de-France": [
        ("Lille", 50.63, 3.07),
        ("Valenciennes", 50.38, 3.92),
        ("Amiens", 50.46, 1.68),
        ("Calais", 50.72, 2.08),
        ("Reims", 49.44, 3.03),
        ("Laon", 49.83, 2.38),
        ("Douai", 50.17, 4.33),
        ("Charleville-Mézières", 49.55, 4.03),
        ("Boulogne-sur-Mer", 50.08, 1.25),
        ("Compiègne", 49.22, 2.82)
    ],
    "Ile-de-France": [
        ("París", 48.86, 2.35),
        ("Versailles", 48.71, 2.21),
        ("Saint-Denis", 48.98, 2.42),
        ("Créteil", 48.83, 2.08),
        ("Évry-Courcouronnes", 48.69, 2.43),
        ("Fontainebleau", 48.58, 2.38),
        ("Meaux", 48.96, 2.77),
        ("Melun", 49.03, 2.55),
        ("Pontoise", 48.79, 2.52),
        ("Argenteuil", 48.94, 2.37)
    ],
    "Normandie": [
        ("Rouen", 49.44, -1.07),
        ("Caen", 49.17, -0.35),
        ("Le Havre", 48.85, -1.75),
        ("Dieppe", 49.68, -1.08),
        ("Cherburgo", 49.33, -0.6),
        ("Alençon", 48.58, -0.42),
        ("Saint-Lô", 49.05, -1.25),
        ("Évreux", 48.7, -0.95),
        ("Lisieux", 49.25, -1.45),
        ("Flers", 48.93, -1.5)
    ],
    "Nouvelle-Aquitaine": [
        ("Burdeos", 45.77, -1.05),
        ("Bayona", 44.83, -0.58),
        ("Périgueux", 45.17, 0.68),
        ("Poitiers", 46.17, 1.15),
        ("Biarritz", 43.58, -1.45),
        ("Angulema", 45.5, 0.08),
        ("Mont-de-Marsan", 44.35, 0.35),
        ("Agen", 45.33, -0.33),
        ("Niort", 46.25, -0.33),
        ("Limoges", 45.92, 0.43)
    ],
    "Occitanie": [
        ("Toulouse", 43.6, 1.44),
        ("Montpellier", 43.58, 3.83),
        ("Nimes", 44.03, 4.82),
        ("Albi", 43.97, 1.08),
        ("Perpignan", 42.72, 2.55),
        ("Rodez", 44.35, 0.35),
        ("Montauban", 44.08, 0.75),
        ("Béziers", 43.75, 3.33),
        ("Carcassonne", 43.25, 1.83),
        ("Millau", 44.09, 3)
    ],
    "PACA": [
        ("Niza", 43.73, 7.27),
        ("Marsella", 43.3, 5.37),
        ("Avignon", 43.55, 4.82),
        ("Toulon", 43.12, 5.93),
        ("Gap", 44.56, 6.08),
        ("Arles", 43.68, 4.63),
        ("Draguignan", 43.54, 6.47),
        ("Aix-en-Provence", 43.47, 5.67),
        ("Grasse", 43.66, 6.92),
        ("Cannes", 43.67, 7.3)
    ],
    "Pays-de-la-Loire": [
        ("Nantes", 47.22, -1.55),
        ("Mayenne", 48.3, -0.7),
        ("Angers", 47.08, -2.38),
        ("Le Mans", 46.8, -1.7),
        ("La Roche-sur-Yon", 46.25, -1.15),
        ("Saint-Nazaire", 47.17, -2.17),
        ("Cholet", 47.33, -1.92),
        ("Laval", 46.95, -1.45),
        ("Saumur", 46.75, -2.05),
        ("Ancenis", 47.37, -1.18),
    ]
}

# Endpoint and the query parameters shared by every city request; latitude and
# longitude are added per city by the download cell.
url_base = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "start_date": "2013-01-01",
    "end_date": "2020-12-31",
    "hourly": "temperature_2m,relative_humidity_2m,apparent_temperature,weather_code",
    "format": "csv"
}

<br>

## Download files

In [22]:
# Download one CSV of hourly weather data per city.
REQUEST_TIMEOUT = 120        # seconds to wait for a single API response
PAUSE_AFTER_SUCCESS = 30     # seconds between two successful requests
PAUSE_AFTER_FAILURE = 10000  # seconds to wait before retrying a failed request
MAX_ATTEMPTS = 3             # tries per city before giving up on it

for region, cities in coords.items():
    for city, latitude, longitude in cities:
        # Build the query for this city in a fresh dict so the shared `params`
        # template is not modified by the loop.
        query = {**params, "latitude": latitude, "longitude": longitude}
        url = url_base + "?" + "&".join(f"{k}={v}" for k, v in query.items())
        print("URL:", url)

        file_path = os.path.join(download_location, f"{city}.csv")

        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                response = requests.get(url, timeout=REQUEST_TIMEOUT)
                succeeded = response.status_code == 200
            except requests.RequestException:
                # Treat a network error like a failed response so it is retried.
                succeeded = False

            if succeeded:
                with open(file_path, 'wb') as f:
                    f.write(response.content)

                print(f"Data successfully downloaded for {city}\n")
                time.sleep(PAUSE_AFTER_SUCCESS)
                break

            print(f"Could not download data for {city} \n")
            # Wait and then retry the SAME city; previously the loop moved on
            # after the pause and the city's file was never written.
            if attempt < MAX_ATTEMPTS:
                time.sleep(PAUSE_AFTER_FAILURE)
        else:
            print(f"Giving up on {city} after {MAX_ATTEMPTS} attempts\n")

URL: https://archive-api.open-meteo.com/v1/archive?start_date=2013-01-01&end_date=2020-12-31&hourly=temperature_2m,relative_humidity_2m,apparent_temperature,weather_code&format=csv&latitude=46.03&longitude=4.07
Data successfully downloaded for Roanne

URL: https://archive-api.open-meteo.com/v1/archive?start_date=2013-01-01&end_date=2020-12-31&hourly=temperature_2m,relative_humidity_2m,apparent_temperature,weather_code&format=csv&latitude=47.41&longitude=0.98
Data successfully downloaded for Amboise

URL: https://archive-api.open-meteo.com/v1/archive?start_date=2013-01-01&end_date=2020-12-31&hourly=temperature_2m,relative_humidity_2m,apparent_temperature,weather_code&format=csv&latitude=43.12&longitude=5.93
Data successfully downloaded for Toulon

URL: https://archive-api.open-meteo.com/v1/archive?start_date=2013-01-01&end_date=2020-12-31&hourly=temperature_2m,relative_humidity_2m,apparent_temperature,weather_code&format=csv&latitude=44.56&longitude=6.08
Data successfully downloaded for

<br>

## Add half hours

In [None]:
# Build, for every city, a half-hourly table: the hourly rows plus one
# interpolated row halfway between each pair of consecutive hours.
dfs = []

for region, cities in coords.items():
    for city, latitude, longitude in cities:
        file_path = os.path.join(download_location, f"{city}.csv")
        # The first three lines of each downloaded file are skipped.
        df = pd.read_csv(file_path, skiprows=3)
        df["region"] = region
        df["city"] = city
        df['time'] = pd.to_datetime(df['time'])

        # Row i of `earlier` and row i of `later` are consecutive hours, so all
        # midpoints are computed at once instead of appending one row per
        # iteration (which re-copies the whole frame every time).
        earlier = df.iloc[:-1].reset_index(drop=True)
        later = df.iloc[1:].reset_index(drop=True)

        new_df = pd.DataFrame({
            'time': earlier['time'] + pd.Timedelta(minutes=30),
            'temperature_2m (°C)': (earlier['temperature_2m (°C)'] + later['temperature_2m (°C)']) / 2,
            'relative_humidity_2m (%)': (earlier['relative_humidity_2m (%)'] + later['relative_humidity_2m (%)']) / 2,
            'apparent_temperature (°C)': (earlier['apparent_temperature (°C)'] + later['apparent_temperature (°C)']) / 2,
            # Floor division keeps the weather code an integer.
            'weather_code (wmo code)': (earlier['weather_code (wmo code)'] + later['weather_code (wmo code)']) // 2,
            # The interpolated rows must carry the labels too; they were left
            # empty (NaN) before.
            'region': region,
            'city': city,
        })

        df = pd.concat([df, new_df], ignore_index=True)
        df = df.sort_values(by='time')

        dfs.append(df)
        print(f"Half-hour rows added for {city}\n")

<br>

### Parallelization 
This code was executed in Spyder as a standalone script rather than in this notebook.

In [None]:
import os
import pandas as pd
import multiprocessing as mp

def process(arg):
    """Load one city's hourly weather file and add interpolated half-hour rows.

    Parameters
    ----------
    arg : str
        Text of the form "<region> & <city>". The city part selects the file
        "<city>.csv" in the weather folder; both parts are written into the
        'region' and 'city' columns of the result.

    Returns
    -------
    pandas.DataFrame
        The hourly rows plus one row halfway between every pair of consecutive
        hourly rows, sorted by 'time'. Numeric columns of the new rows hold the
        mean of the two neighbours (floor-divided for the weather code).
    """
    # TFM_WEATHER_DIR lets the script run on another machine; the original
    # folder is used when the variable is unset.
    download_location = os.environ.get(
        "TFM_WEATHER_DIR",
        "C:/Users/marta/OneDrive/Documents/Máster/apuntes/TFM/weather",
    )

    region, city = arg.split(" & ")

    file_name = f"{city}.csv"
    file_path = os.path.join(download_location, file_name)
    # The first three lines of the file are skipped.
    df = pd.read_csv(file_path, skiprows=3)

    df["region"] = region
    df['city'] = city
    df['time'] = pd.to_datetime(df['time'])

    # Row i of `earlier` and row i of `later` are consecutive hours. Computing
    # all midpoints at once covers EVERY hour (a leftover `i < 20` guard used to
    # stop after the first 20) and avoids re-copying the frame once per row.
    earlier = df.iloc[:-1].reset_index(drop=True)
    later = df.iloc[1:].reset_index(drop=True)

    new_df = pd.DataFrame({
        'time': earlier['time'] + pd.Timedelta(minutes=30),
        'temperature_2m (°C)': (earlier['temperature_2m (°C)'] + later['temperature_2m (°C)']) / 2,
        'relative_humidity_2m (%)': (earlier['relative_humidity_2m (%)'] + later['relative_humidity_2m (%)']) / 2,
        'apparent_temperature (°C)': (earlier['apparent_temperature (°C)'] + later['apparent_temperature (°C)']) / 2,
        # Floor division keeps the weather code an integer.
        'weather_code (wmo code)': (earlier['weather_code (wmo code)'] + later['weather_code (wmo code)']) // 2,
        'region': region,
        'city': city,
    })

    df = pd.concat([df, new_df], ignore_index=True)
    df = df.sort_values(by='time')

    return df

# Script entry point: process every city in parallel and write the combined
# half-hourly weather table to weather_data.csv.
if __name__ == "__main__":

    # TFM_WEATHER_DIR lets the script run on another machine; the original
    # folder is used when the variable is unset.
    download_location = os.environ.get(
        "TFM_WEATHER_DIR",
        "C:/Users/marta/OneDrive/Documents/Máster/apuntes/TFM/weather",
    )

    # Region -> list of (city name, latitude, longitude) tuples. Only the region
    # and city names are used below; the city name selects the input file.
    coords = {
        "Auvergne-Rhône-Alpes": [
            ("Annecy", 46.19, 6.15),
            ("Chambéry", 45.57, 6.92),
            ("Lyon", 45.75, 4.85),
            ("Grenoble", 45.17, 5.72),
            ("Clermont-Ferrand", 45.5, 3.88),
            ("Valence", 44.83, 4.75),
            ("Saint-Étienne", 45.22, 4.35),
            ("Roanne", 46.03, 4.07),
            ("Saint-Chamond", 45.03, 4.88),
            ("Thonon-les-Bains", 46.2, 6.08)
        ],
        "Bourgogne-Franche-Comté": [
            ("Dijon", 47.16, 4.84),
            ("Besançon", 46.78, 5.24),
            ("Nevers", 47.02, 3.34),
            ("Lons-le-Saunier", 46.55, 5.32),
            ("Mâcon", 46.21, 4.8),
            ("Auxerre", 47.3, 5.04),
            ("Pontarlier", 46.82, 6.13),
            ("Chalon-sur-Saône", 47.48, 4.08),
            ("Bourg-en-Bresse", 46.17, 4.83),
            ("Montbéliard", 47.38, 6.13)
        ],
        "Bretagne": [
            ("Rennes", 48.11, -1.68),
            ("Vannes", 47.62, -3.35),
            ("Brest", 48.41, -4.48),
            ("Lorient", 48.38, -2.38),
            ("Quimper", 48.07, -3.44),
            ("Saint-Brieuc", 48.58, -2.75),
            ("Morlaix", 47.93, -4.14),
            ("Redon", 47.56, -2.79),
            ("Dinan", 48.72, -1.55),
            ("Pontivy", 47.23, -3.48)
        ],
        "Centre-Val-de-Loire": [
            ("Orléans", 47.59, 1.68),
            ("Tours", 47.32, 1.92),
            ("Blois", 47.18, 2.12),
            ("Chartres", 48.08, 2.41),
            ("Bourges", 47.81, 3.72),
            ("Châteauroux", 46.84, 2.13),
            ("Vendôme", 47.02, 0.63),
            ("La Flèche", 47.44, 0.62),
            ("Amboise", 47.41, 0.98),
            ("Gien", 47.61, 3.96)
        ],
        "Grand-Est": [
            ("Estrasburgo", 48.71, 7.75),
            ("Metz", 49.61, 5.31),
            ("Nancy", 48.58, 6.18),
            ("Colmar", 48.23, 7.35),
            ("Thionville", 49.18, 6.13),
            ("Bar-le-Duc", 48.97, 5.95),
            ("Verdun", 48.07, 5.55),
            ("Saverne", 48.48, 7.57),
            ("Sarreguemines", 49.33, 7.4),
            ("Sélestat", 48.13, 7.23)
        ],
        "Hauts-de-France": [
            ("Lille", 50.63, 3.07),
            ("Valenciennes", 50.38, 3.92),
            ("Amiens", 50.46, 1.68),
            ("Calais", 50.72, 2.08),
            ("Reims", 49.44, 3.03),
            ("Laon", 49.83, 2.38),
            ("Douai", 50.17, 4.33),
            ("Charleville-Mézières", 49.55, 4.03),
            ("Boulogne-sur-Mer", 50.08, 1.25),
            ("Compiègne", 49.22, 2.82)
        ],
        "Ile-de-France": [
            ("París", 48.86, 2.35),
            ("Versailles", 48.71, 2.21),
            ("Saint-Denis", 48.98, 2.42),
            ("Créteil", 48.83, 2.08),
            ("Évry-Courcouronnes", 48.69, 2.43),
            ("Fontainebleau", 48.58, 2.38),
            ("Meaux", 48.96, 2.77),
            ("Melun", 49.03, 2.55),
            ("Pontoise", 48.79, 2.52),
            ("Argenteuil", 48.94, 2.37)
        ],
        "Normandie": [
            ("Rouen", 49.44, -1.07),
            ("Caen", 49.17, -0.35),
            ("Le Havre", 48.85, -1.75),
            ("Dieppe", 49.68, -1.08),
            ("Cherburgo", 49.33, -0.6),
            ("Alençon", 48.58, -0.42),
            ("Saint-Lô", 49.05, -1.25),
            ("Évreux", 48.7, -0.95),
            ("Lisieux", 49.25, -1.45),
            ("Flers", 48.93, -1.5)
        ],
        "Nouvelle-Aquitaine": [
            ("Burdeos", 45.77, -1.05),
            ("Bayona", 44.83, -0.58),
            ("Périgueux", 45.17, 0.68),
            ("Poitiers", 46.17, 1.15),
            ("Biarritz", 43.58, -1.45),
            ("Angulema", 45.5, 0.08),
            ("Mont-de-Marsan", 44.35, 0.35),
            ("Agen", 45.33, -0.33),
            ("Niort", 46.25, -0.33),
            ("Limoges", 45.92, 0.43)
        ],
        "Occitanie": [
            ("Toulouse", 43.6, 1.44),
            ("Montpellier", 43.58, 3.83),
            ("Nimes", 44.03, 4.82),
            ("Albi", 43.97, 1.08),
            ("Perpignan", 42.72, 2.55),
            ("Rodez", 44.35, 0.35),
            ("Montauban", 44.08, 0.75),
            ("Béziers", 43.75, 3.33),
            ("Carcassonne", 43.25, 1.83),
            ("Millau", 44.09, 3)
        ],
        "PACA": [
            ("Niza", 43.73, 7.27),
            ("Marsella", 43.3, 5.37),
            ("Avignon", 43.55, 4.82),
            ("Toulon", 43.12, 5.93),
            ("Gap", 44.56, 6.08),
            ("Arles", 43.68, 4.63),
            ("Draguignan", 43.54, 6.47),
            ("Aix-en-Provence", 43.47, 5.67),
            ("Grasse", 43.66, 6.92),
            ("Cannes", 43.67, 7.3)
        ],
        "Pays-de-la-Loire": [
            ("Nantes", 47.22, -1.55),
            ("Mayenne", 48.3, -0.7),
            ("Angers", 47.08, -2.38),
            ("Le Mans", 46.8, -1.7),
            ("La Roche-sur-Yon", 46.25, -1.15),
            ("Saint-Nazaire", 47.17, -2.17),
            ("Cholet", 47.33, -1.92),
            ("Laval", 46.95, -1.45),
            ("Saumur", 46.75, -2.05),
            ("Ancenis", 47.37, -1.18),
        ]
    }

    # One "<region> & <city>" string per city: the format process() splits apart.
    args_list = [
        f"{region} & {city}"
        for region, cities in coords.items()
        for city, _lat, _lon in cities
    ]

    num_processes = mp.cpu_count()

    # The context manager releases the worker processes even if a task raises;
    # map() only returns once every city has been processed.
    with mp.Pool(num_processes) as pool:
        results = pool.map(process, args_list)

    final_df = pd.concat(results, ignore_index=True)

    print(final_df)

    output_file = os.path.join(download_location, "weather_data.csv")
    final_df.to_csv(output_file, encoding='utf-8-sig', index=False)

<br>

### Example of introducing half hour data

In [25]:
# Illustration on one city: create the half-hour rows that follow the first
# `n_examples` hourly rows.
n_examples = 10

df = pd.read_csv(os.path.join(download_location, "Annecy.csv"), skiprows=3)

df['time'] = pd.to_datetime(df['time'])

# Row i of `earlier` and row i of `later` are consecutive hours. Building all new
# rows in one step avoids appending to an empty frame row by row, which turns
# the columns into generic object dtype.
earlier = df.iloc[:n_examples].reset_index(drop=True)
later = df.iloc[1:n_examples + 1].reset_index(drop=True)

new_df = pd.DataFrame({
    'time': earlier['time'] + pd.Timedelta(minutes=30),
    'temperature_2m (°C)': (earlier['temperature_2m (°C)'] + later['temperature_2m (°C)']) / 2,
    'relative_humidity_2m (%)': (earlier['relative_humidity_2m (%)'] + later['relative_humidity_2m (%)']) / 2,
    'apparent_temperature (°C)': (earlier['apparent_temperature (°C)'] + later['apparent_temperature (°C)']) / 2,
    # Floor division keeps the weather code an integer.
    'weather_code (wmo code)': (earlier['weather_code (wmo code)'] + later['weather_code (wmo code)']) // 2,
})

df = pd.concat([df, new_df], ignore_index=True)
df = df.sort_values(by='time')

df.head(5)

Unnamed: 0,time,temperature_2m (°C),relative_humidity_2m (%),apparent_temperature (°C),weather_code (wmo code)
0,2013-01-01 00:00:00,1.5,83.0,-1.5,2
70128,2013-01-01 00:30:00,1.4,83.5,-1.65,1
1,2013-01-01 01:00:00,1.3,84.0,-1.8,1
70129,2013-01-01 01:30:00,1.25,84.0,-1.85,1
2,2013-01-01 02:00:00,1.2,84.0,-1.9,1


<br>

## Result

In [34]:
# Load the combined table written by the parallel script.
weather_path = os.path.join(download_location, "weather_data.csv")
df = pd.read_csv(weather_path)

In [35]:
# Show the first five rows of the combined table.
df.head(n=5)

Unnamed: 0,time,temperature_2m (°C),relative_humidity_2m (%),apparent_temperature (°C),weather_code (wmo code),region,city
0,2013-01-01 00:00:00,1.5,83.0,-1.5,2,Auvergne-Rhône-Alpes,Annecy
1,2013-01-01 00:30:00,1.4,83.5,-1.65,1,Auvergne-Rhône-Alpes,Annecy
2,2013-01-01 01:00:00,1.3,84.0,-1.8,1,Auvergne-Rhône-Alpes,Annecy
3,2013-01-01 01:30:00,1.25,84.0,-1.85,1,Auvergne-Rhône-Alpes,Annecy
4,2013-01-01 02:00:00,1.2,84.0,-1.9,1,Auvergne-Rhône-Alpes,Annecy


<br>

## Missing value: 31/12/2020 23:30

In [37]:
# Show the last five rows: the table ends at 23:00, so 23:30 is absent.
df.tail(n=5)

Unnamed: 0,time,temperature_2m (°C),relative_humidity_2m (%),apparent_temperature (°C),weather_code (wmo code),region,city
16830595,2020-12-31 21:00:00,2.0,96.0,-0.9,0,Pays-de-la-Loire,Ancenis
16830596,2020-12-31 21:30:00,1.8,97.0,-0.95,0,Pays-de-la-Loire,Ancenis
16830597,2020-12-31 22:00:00,1.6,98.0,-1.0,0,Pays-de-la-Loire,Ancenis
16830598,2020-12-31 22:30:00,1.4,99.0,-1.1,1,Pays-de-la-Loire,Ancenis
16830599,2020-12-31 23:00:00,1.2,100.0,-1.2,3,Pays-de-la-Loire,Ancenis


<br>

To fill this gap, we copy each city's record from 31/12/2020 23:00 and assign it the timestamp 31/12/2020 23:30.

In [57]:
# The table ends at 23:00 on 2020-12-31; the 23:00 record of each city is
# copied into the missing 23:30 slot.
last_hour = pd.Timestamp('2020-12-31 23:00:00')
last_half_hour = pd.Timestamp('2020-12-31 23:30:00')

df = pd.read_csv(os.path.join(download_location, "weather_data.csv"), parse_dates=['time'])

data_23_00 = df[df['time'] == last_hour]

# weather_data.csv is overwritten with the result of this cell, so on a re-run
# the file already contains the 23:30 rows. Fill only the (region, city) pairs
# that still lack one, otherwise each run would add another copy.
already_filled = df.loc[df['time'] == last_half_hour, ['region', 'city']]
filled_keys = set(zip(already_filled['region'], already_filled['city']))
needs_fill = pd.Series(
    [key not in filled_keys for key in zip(data_23_00['region'], data_23_00['city'])],
    index=data_23_00.index,
    dtype=bool,
)

data_23_30 = data_23_00[needs_fill].copy()
data_23_30['time'] = last_half_hour

df = pd.concat([df, data_23_30])
# One sort on all three keys states the intended order directly instead of
# relying on a second sort preserving the order produced by the first.
df = df.sort_values(by=['region', 'city', 'time'])

df.tail(5)

Unnamed: 0,time,temperature_2m (°C),relative_humidity_2m (%),apparent_temperature (°C),weather_code (wmo code),region,city
16690341,2020-12-31 21:30:00,-1.45,93.0,-5.15,0,Pays-de-la-Loire,Saumur
16690342,2020-12-31 22:00:00,-1.5,93.0,-5.1,0,Pays-de-la-Loire,Saumur
16690343,2020-12-31 22:30:00,-1.8,93.5,-5.5,0,Pays-de-la-Loire,Saumur
16690344,2020-12-31 23:00:00,-2.1,94.0,-5.9,0,Pays-de-la-Loire,Saumur
16690344,2020-12-31 23:30:00,-2.1,94.0,-5.9,0,Pays-de-la-Loire,Saumur


<br>

In [65]:
# Overwrite weather_data.csv with the completed table; index=False leaves the index out of the file.
output_file = os.path.join(download_location, "weather_data.csv")
df.to_csv(path_or_buf=output_file, index=False, encoding='utf-8-sig')

<br>

Now, we have:

In [70]:
# Reload the saved file and count the rows per city as a sanity check.
weather_path = os.path.join(download_location, "weather_data.csv")
df = pd.read_csv(weather_path)
df['city'].value_counts()

Annecy            140256
Chambéry          140256
Poitiers          140256
Niort             140256
Mont-de-Marsan    140256
                   ...  
Gien              140256
Châteauroux       140256
Chartres          140256
Bourges           140256
Saumur            140256
Name: city, Length: 120, dtype: int64

<br>

## Check for clock changes (daylight saving time)

In [71]:
# Inspect one raw hourly file (its first three lines are skipped).
annecy_path = os.path.join(download_location, "Annecy.csv")
df = pd.read_csv(annecy_path, skiprows=3)

df['time'] = pd.to_datetime(df['time'])

# keep=False marks every row of a repeated timestamp, not only the later occurrences.
repeated_time = df.duplicated(subset=['time'], keep=False)
duplicates = df[repeated_time]
if len(duplicates) == 0:
    print("No duplicates were found.")
else:
    print("Duplicate times were found:")
    print(duplicates)

No duplicates were found.


<br>

In [72]:
# Difference between each timestamp and the one on the previous row (the first row yields NaT).
time_diff = df['time'].diff()

# True when at least one pair of adjacent rows is more than one hour apart.
gap_found = (time_diff > pd.Timedelta(hours=1)).any()

if gap_found:
    print("A difference of more than one hour between adjacent rows was found.")
else:
    print("No differences of more than one hour between adjacent rows were found.")

No differences of more than one hour between adjacent rows were found.
