<a href="https://colab.research.google.com/github/Juanezm/uoc-data-science-tfm/blob/main/data_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download and concatenate the CSV files:

In [None]:
import pandas as pd
import requests
import os

# NOTE(review): google.colab is normally only importable inside a Colab
# runtime, so this cell is expected to fail elsewhere — confirm if the
# notebook must also run locally.
from google.colab import data_table
# Turn on Colab's DataFrame formatter for the rest of the notebook session.
data_table.enable_dataframe_formatter()


# Root URL of the Zenodo record that hosts the CSV files. File names are
# appended to it directly, so the trailing slash is required.
base_url = "https://zenodo.org/record/3727310/files/"

# Folder (relative to the current working directory) where the downloaded
# CSVs are cached and where the per-sensor "<id>_raw_data.csv" outputs go.
download_dir = "../src/data"

# Identifiers of the sensor nodes to process. Each id is the prefix of the
# remote file names ("<sensor_id>_<variable>.csv") and of the output file.
sensor_ids = [
    '200034001951343334363036',
    '270043001951343334363036',
    '380033001951343334363036',
    '46004e000251353337353037',
    '46005a000351353337353037',
    '4e0022000251353337353037',
    '4e0031000251353337353037',
]

# Coordinates of each sensor node as a (latitude, longitude) tuple; index 0
# is written to the 'geo_lat' column and index 1 to 'geo_lon'.
# NOTE(review): three ids (2000..., 46004e..., 4e0022...) share exactly the
# same coordinates — confirm this is intentional and not a placeholder.
sensor_location = {
    '200034001951343334363036': (40.1138985, -0.0519082),
    '270043001951343334363036': (40.133098, -0.061),
    '380033001951343334363036': (40.20687, 0.015536),
    '46004e000251353337353037': (40.1138985, -0.0519082),
    '46005a000351353337353037': (40.167529, -0.097165),
    '4e0022000251353337353037': (40.1138985, -0.0519082),
    '4e0031000251353337353037': (40.141384, -0.026397)
}

# Metadata for every measured variable. For each entry:
#   - "variable" is the file-name suffix used to locate the CSV
#     ("<sensor_id>_<variable>.csv");
#   - "name" is stored in the output column 'variable';
#   - "manufacturer", "model", "data_interface", "units" and "accuracy" are
#     copied verbatim into columns of the same name;
#   - "range" is converted with str() before being stored, so it may be
#     either a [min, max] list or a plain string such as "N/A".
sensors_info = [
    {
        "name": "Temperature",
        "manufacturer": "SparkFun",
        "model": "Si7021",
        "data_interface": "Analog",
        "units": "Centigrade",
        "range": [-10, 85],
        "accuracy": "+/- 0.4 degrees (C)",
        "variable": "air_temperature_raw"
    },
    {
        "name": "Humidity",
        "manufacturer": "SparkFun",
        "model": "Si7021",
        "data_interface": "Analog",
        "units": "Percentage",
        "range": [0, 80],
        "accuracy": "+/- 3 RH",
        "variable": "humidity_raw"
    },
    {
        "name": "Barometric pressure",
        "manufacturer": "SparkFun",
        "model": "MPL3115A2",
        "data_interface": "I2C",
        "units": "Hectopascal",
        "range": [500, 1100],
        "accuracy": "+/- 0.04 hPa",
        "variable": "atmospheric_pressure_raw"
    },
    # NOTE(review): "DS18B20" is usually the part number of a temperature
    # probe, not a moisture sensor — confirm the model against the dataset
    # documentation.
    {
        "name": "Soil moisture",
        "manufacturer": "SparkFun",
        "model": "DS18B20",
        "data_interface": "Analog",
        "units": "Percentage",
        "range": [0, 85],
        "accuracy": "+/- 0.5 RH",
        "variable": "soil_humidity_raw"
    },
    {
        "name": "Wind speed",
        "manufacturer": "SparkFun",
        "model": "SEN08942",
        "data_interface": "Analog (RJ11)",
        "units": "km/h",
        "range": "N/A",
        "accuracy": "N/A",
        "variable": "wind_speed_raw"
    },
    # NOTE(review): units say degrees but the range is [-1, 7]; presumably
    # the raw values are direction indices rather than angles — confirm.
    {
        "name": "Wind direction",
        "manufacturer": "SparkFun",
        "model": "SEN08942",
        "data_interface": "Analog (RJ11)",
        "units": "Direction (degrees)",
        "range": [-1, 7],
        "accuracy": "N/A",
        "variable": "wind_direction_raw"
    },
    # NOTE(review): the range [-1, 7] is identical to the wind-direction
    # entry and may be a copy/paste leftover; "mm" normally abbreviates
    # millimetres, not millilitres — confirm both before relying on them.
    {
        "name": "Rain meter",
        "manufacturer": "SparkFun",
        "model": "SEN08942",
        "data_interface": "Analog (RJ11)",
        "units": "millilitres (mm)",
        "range": [-1, 7],
        "accuracy": "N/A",
        "variable": "precipitation_raw"
    },
    {
        "name": "Battery",
        "manufacturer": "N/A",
        "model": "N/A",
        "data_interface": "N/A",
        "units": "Percentage",
        "range": [0, 100],
        "accuracy": "N/A",
        "variable": "battery_raw"
    }
]


def download_csv(sensor_id, variable):
    """Download one sensor/variable CSV into ``download_dir``.

    The remote file is ``<base_url><sensor_id>_<variable>.csv`` and it is
    stored locally under the same file name.

    Args:
        sensor_id: Identifier of the sensor node (file-name prefix).
        variable: Name of the measured variable (file-name suffix).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
        OSError: If the target file cannot be written.
    """
    file_name = f"{sensor_id}_{variable}.csv"
    url = f"{base_url}{file_name}"
    # Without a timeout requests may block forever on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail loudly instead of saving an error page as ".csv": the main loop
    # skips the download whenever the file already exists, so a bad file
    # would otherwise be cached and parsed as data on every later run.
    response.raise_for_status()

    # Create the cache folder on first use so a fresh checkout works.
    os.makedirs(download_dir, exist_ok=True)
    with open(f"{download_dir}/{file_name}", "wb") as f:
        f.write(response.content)

def read_csv(sensor_id, variable):
    """Load a cached sensor CSV from ``download_dir`` as a DataFrame.

    Args:
        sensor_id: Identifier of the sensor node (file-name prefix).
        variable: Name of the measured variable (file-name suffix).

    Returns:
        A DataFrame with the two columns ``date`` and ``value``.
    """
    csv_path = f"{download_dir}/{sensor_id}_{variable}.csv"
    # header=None makes pandas treat the first line as data; the column
    # names are supplied explicitly instead.
    column_names = ['date', 'value']
    return pd.read_csv(csv_path, names=column_names, header=None)

# Build one long-format CSV per sensor node: each variable file is fetched
# on demand, tagged with its sensor metadata and location, then stacked.
for sensor_id in sensor_ids:
    dataframes = []
    # Fall back to (None, None) for a node without known coordinates; an
    # empty-string default would raise IndexError when indexed below.
    sensor_location_value = sensor_location.get(sensor_id, (None, None))

    for info in sensors_info:
        # Downloaded files are cached on disk, so only fetch what is missing.
        csv_path = f"{download_dir}/{sensor_id}_{info['variable']}.csv"
        if not os.path.isfile(csv_path):
            download_csv(sensor_id, info['variable'])
        df = read_csv(sensor_id, info['variable'])
        df['variable'] = info['name']
        df['manufacturer'] = info['manufacturer']
        df['model'] = info['model']
        df['data_interface'] = info['data_interface']
        df['units'] = info['units']
        # 'range' may be a list or the string "N/A"; store it as text so the
        # column has a single, CSV-friendly representation.
        df['range'] = str(info['range'])
        df['accuracy'] = info['accuracy']
        df['geo_lat'] = sensor_location_value[0]
        df['geo_lon'] = sensor_location_value[1]
        dataframes.append(df)

    # Concatenate all per-variable dataframes into a single one
    df = pd.concat(dataframes, ignore_index=True)

    # Save each sensor_id's data to a separate CSV
    df.to_csv(f'{download_dir}/{sensor_id}_raw_data.csv', index=False)