## Getting data from the S3 bucket as CSV:

In [54]:
from dotenv import load_dotenv
import boto3
import os

# Load variables from a local .env file into the process environment, then
# read the AWS key pair from it.
load_dotenv()
access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
access_key_secret = os.environ.get("AWS_SECRET_ACCESS_KEY")

# A single session is the one source of credentials; both the low-level
# client and the resource API are derived from it instead of passing the
# key pair twice.
session = boto3.Session(
    aws_access_key_id=access_key_id,
    aws_secret_access_key=access_key_secret,
)
s3_client = session.client('s3')

bucket = session.resource('s3').Bucket(name='emu-air-quality-bucket')

# S3 returns listings in UTF-8 binary (lexicographic) key order, so the last
# listed key is not necessarily the newest upload. Select by modification
# time instead; `default=None` lets an empty bucket be reported clearly.
latest_object = max(
    bucket.objects.all(),
    key=lambda obj: obj.last_modified,
    default=None,
)
if latest_object is None:
    raise LookupError(f"Bucket {bucket.name!r} contains no objects to download")

# Stream the selected object to a local file for the parsing cell below.
with open("./download.csv", 'wb') as file:
    s3_client.download_fileobj(bucket.name, latest_object.key, file)





## Loading data from the CSV into a DataFrame:

In [203]:
import pandas as pd
import re
import numpy as np

# Read every column as text; the nested fields are parsed out of the raw
# strings with regular expressions further down.
raw_data = pd.read_csv(
    filepath_or_buffer="./download.csv",
    dtype=str,
)

# Substring that marks a message carrying no decoded payload.
_NULL_PAYLOAD_MARKER = "decoded_payload=null"

# Precompiled `key=value` patterns, one per extracted field. The value is the
# shortest run of characters up to the next `}` or `,`.
# The pattern text is unchanged on purpose: the leading greedy `.*` makes
# `search` capture the LAST `key=` occurrence on a line (that is followed by
# `}` or `,`), and a key also matches as the tail of a longer name
# (e.g. `eui=` inside `dev_eui=`).
# NOTE(review): confirm the last occurrence is the intended one for `eui`
# and `received_at` if the message can contain several of them.
_FIELD_PATTERNS = {
    "application_id": re.compile(r'.*\{application_id=(.*?)[\}\,]'),
    "eui": re.compile(r'.*eui=(.*?)[\}\,]'),
    "co2": re.compile(r'.*co2=(.*?)[\}\,]'),
    "hcho": re.compile(r'.*hcho=(.*?)[\}\,]'),
    "humidity": re.compile(r'.*humidity=(.*?)[\}\,]'),
    "light_level": re.compile(r'.*light_level=(.*?)[\}\,]'),
    "pir": re.compile(r'.*pir=(.*?)[\}\,]'),
    "pm10": re.compile(r'.*pm10=(.*?)[\}\,]'),
    "pm2_5": re.compile(r'.*pm2_5=(.*?)[\}\,]'),
    "pressure": re.compile(r'.*pressure=(.*?)[\}\,]'),
    "temperature": re.compile(r'.*temperature=(.*?)[\}\,]'),
    "tvoc": re.compile(r'.*tvoc=(.*?)[\}\,]'),
    "received_at": re.compile(r'.*received_at=(.*?)[\}\,]'),
}


def _extract_field(val, field, convert=str, skip_null_payload=True):
    """Pull one `key=value` field out of a raw cell and convert it.

    Args:
        val: Raw cell content. Anything that is not a string (for example the
            float NaN pandas uses for an empty CSV cell) is treated as missing.
        field: Key into `_FIELD_PATTERNS` selecting the pattern to apply.
        convert: Callable applied to the captured text (`str`, `int`, `float`).
        skip_null_payload: When true, a cell containing
            `decoded_payload=null` is treated as missing.

    Returns:
        The converted value, or `np.nan` when the cell is missing, has a null
        payload, or does not contain the field.

    Raises:
        ValueError: If the captured text cannot be converted by `convert`.
    """
    if not isinstance(val, str):
        return np.nan
    if skip_null_payload and _NULL_PAYLOAD_MARKER in val:
        return np.nan
    match = _FIELD_PATTERNS[field].search(val)
    if match is None:
        # Previously this case crashed with AttributeError on `.group(1)`.
        return np.nan
    return convert(match.group(1))


def get_application_id(val: str) -> str:
    """Return the `application_id` value found in `val`, or NaN if absent.

    Unlike the payload getters, this one does not check for a null payload.
    """
    return _extract_field(val, "application_id", skip_null_payload=False)


def get_eui(val: str) -> str:
    """Return the `eui` value as text, or NaN if the payload is null or the field is absent."""
    return _extract_field(val, "eui")


def get_co2(val: str) -> int:
    """Return the `co2` value as an int, or NaN if the payload is null or the field is absent."""
    return _extract_field(val, "co2", int)


def get_hcho(val: str) -> float:
    """Return the `hcho` value as a float, or NaN if the payload is null or the field is absent."""
    return _extract_field(val, "hcho", float)


def get_humidity(val: str) -> float:
    """Return the `humidity` value as a float, or NaN if the payload is null or the field is absent."""
    return _extract_field(val, "humidity", float)


def get_light_level(val: str) -> int:
    """Return the `light_level` value as an int, or NaN if the payload is null or the field is absent."""
    return _extract_field(val, "light_level", int)


def get_pir(val: str) -> str:
    """Return the `pir` value as text, or NaN if the payload is null or the field is absent."""
    return _extract_field(val, "pir")


def get_pm10(val: str) -> int:
    """Return the `pm10` value as an int, or NaN if the payload is null or the field is absent."""
    return _extract_field(val, "pm10", int)


def get_pm2_5(val: str) -> int:
    """Return the `pm2_5` value as an int, or NaN if the payload is null or the field is absent."""
    return _extract_field(val, "pm2_5", int)


def get_pressure(val: str) -> float:
    """Return the `pressure` value as a float, or NaN if the payload is null or the field is absent."""
    return _extract_field(val, "pressure", float)


def get_temperature(val: str) -> float:
    """Return the `temperature` value as a float, or NaN if the payload is null or the field is absent."""
    return _extract_field(val, "temperature", float)


def get_tvoc(val: str) -> int:
    """Return the `tvoc` value as an int, or NaN if the payload is null or the field is absent."""
    return _extract_field(val, "tvoc", int)


def get_measured_at(val: str) -> str:
    """Return the `received_at` value found in `val` as text.

    Returns NaN if the payload is null or the field is absent.
    """
    return _extract_field(val, "received_at")



# Maps each output column name to the raw CSV column it is read from
# ("column") and the callable applied to every cell of that column
# ("function"). Insertion order defines the output column order.
conversion_dict = {
    # Copied through unchanged.
    "received_at": {"column": "received_at", "function": lambda val: val},
    "application_id": {"column": "end_device_ids", "function": get_application_id},
    # Every remaining field is parsed out of the "uplink_message" column.
    **{
        output_name: {"column": "uplink_message", "function": extractor}
        for output_name, extractor in (
            ("eui", get_eui),
            ("co2", get_co2),
            ("hcho", get_hcho),
            ("humidity", get_humidity),
            ("light_level", get_light_level),
            ("pir", get_pir),
            ("pm10", get_pm10),
            ("pm2_5", get_pm2_5),
            ("pressure", get_pressure),
            ("temperature", get_temperature),
            ("tvoc", get_tvoc),
            ("measured_at", get_measured_at),
        )
    },
}

# Build the tidy frame: one output column per conversion_dict entry, produced
# by applying that entry's function to its source column of raw_data.
data = pd.DataFrame(
    {
        output_name: raw_data[spec["column"]].apply(spec["function"])
        for output_name, spec in conversion_dict.items()
    }
)


In [204]:
# Preview the first five parsed rows.
data.head(n=5)

Unnamed: 0,received_at,application_id,eui,co2,hcho,humidity,light_level,pir,pm10,pm2_5,pressure,temperature,tvoc,measured_at
0,2023-02-27T20:21:21.715916790Z,office-air-quality,AC1F09FFFE053AD4,509.0,0.03,64.5,0.0,idle,1.0,1.0,1012.4,25.0,46.0,2023-02-27T20:21:21.506572182Z
1,2023-02-27T19:11:21.655983205Z,office-air-quality,AC1F09FFFE053AD4,514.0,0.03,64.0,0.0,idle,2.0,2.0,1011.7,25.3,50.0,2023-02-27T19:11:21.446701753Z
2,2023-02-27T07:41:21.325373825Z,office-air-quality,AC1F09FFFE053AD4,555.0,0.02,55.5,0.0,idle,3.0,3.0,1011.4,29.7,98.0,2023-02-27T07:41:21.120110061Z
3,2023-02-28T00:01:21.816636853Z,office-air-quality,AC1F09FFFE053AD4,410.0,0.01,60.5,1.0,idle,7.0,7.0,1013.1,27.3,29.0,2023-02-28T00:01:21.608323985Z
4,2023-02-27T08:46:21.354913416Z,office-air-quality,AC1F09FFFE053AD4,560.0,0.02,58.5,0.0,idle,3.0,3.0,1012.2,28.9,99.0,2023-02-27T08:46:21.149774395Z
