# Analysis of Safecast radiation data

In [9]:
import pandas as pd
import plotly.express as px
#from scripts.preprocessing.process_measurements import *

## Data loading

### Load the dataset

In [10]:
# pd.read_csv already returns a DataFrame, so re-wrapping it in pd.DataFrame()
# adds nothing. Work on an explicit copy instead, so the frame as loaded stays
# available in `data` while `df` is filtered and converted further down.
data = pd.read_csv('data/measurements.csv')

df = data.copy()
df

Unnamed: 0,Device ID,Measurement Day,Latitude,Longitude,Height,Average Value,Prev Latitude,Prev Longitude,Prev Height,Movement,Height Difference,Status
0,0.0,2013-10-10,35.638275,139.709950,-inf,128.000000,,,,0.000000,,Stationary
1,0.0,2013-10-11,35.638275,139.709950,-inf,14.370000,35.638275,139.709950,-inf,0.000000,,Stationary
2,0.0,2013-10-12,35.638275,139.709950,-inf,14.860000,35.638275,139.709950,-inf,0.000000,,Stationary
3,0.0,2013-10-13,35.638275,139.709950,-inf,15.000000,35.638275,139.709950,-inf,0.000000,,Stationary
4,0.0,2013-10-14,35.638275,139.709950,-inf,15.060000,35.638275,139.709950,-inf,0.000000,,Stationary
...,...,...,...,...,...,...,...,...,...,...,...,...
382938,666666.0,2016-07-16,34.482707,136.165813,-inf,27.150000,34.482565,136.163749,-inf,190.320321,,Moving
382939,666666.0,2016-07-17,35.268946,136.249729,-inf,25.260000,34.482707,136.165813,-inf,87560.783327,,Moving
382940,666666.0,2016-07-18,35.371239,136.448062,-inf,24.906667,35.268946,136.249729,-inf,21308.385671,,Moving
382941,666666.0,2016-07-19,35.268985,136.249700,-inf,31.025000,35.371239,136.448062,-inf,21308.342527,,Moving


## Data preprocessing

### Drop incorrect measurements

In [11]:
# Count how many rows use each measurement unit.
# NOTE(review): the output recorded for this cell is `KeyError: 'Unit'`, and the
# preview of data/measurements.csv above lists no 'Unit' column. The unit
# clean-up cells below presumably expect a different (raw) input file; confirm.
df['Unit'].value_counts()

KeyError: 'Unit'

#### Incorrect unit names

We see that most measurements have the unit "cpm". The unit is not always written the same way, so we normalize the variants that are stored in a different format (` cpm`, `CPM` and `Cpm` all mean `cpm`).

Also replace `microsievert` with `usv`, as these are the same unit.

In [7]:
# Unit spellings that mean the same thing, as (found, canonical) pairs:
# ` cpm`, `CPM` and `Cpm` all become `cpm`; `microsievert` becomes `usv`.
unit_aliases = [
    (' cpm', 'cpm'),
    ('CPM', 'cpm'),
    ('Cpm', 'cpm'),
    ('microsievert', 'usv'),
]

# Apply the substitutions one after another, in the order listed.
unit_column = df['Unit']
for alias, canonical in unit_aliases:
    unit_column = unit_column.str.replace(alias, canonical)
df['Unit'] = unit_column

df['Unit'].value_counts()

Unit
cpm            424470
celcius         86095
status          51182
usv              5524
PM10 ug/m3        735
DeviceType2       618
DeviceType1        34
HUMD%              20
1                  15
NOXppm              8
0                   5
pm2.5               3
PM1                 1
Name: count, dtype: int64

In [8]:
# Keep only the rows whose unit occurs more than 100 times in the dataset
unit_counts = df['Unit'].value_counts()
rows_per_unit = df['Unit'].map(unit_counts)
df = df.loc[rows_per_unit.gt(100)]

df['Unit'].value_counts()

Unit
cpm            424470
celcius         86095
status          51182
usv              5524
PM10 ug/m3        735
DeviceType2       618
Name: count, dtype: int64

#### Measurements with `DeviceType2` unit

Because measurements with the `DeviceType2` unit always have the same value and we don't know what the unit actually is, we can drop these measurements as well.

In [None]:
# Summary statistics of the values recorded with the `DeviceType2` unit
df.loc[df['Unit'] == 'DeviceType2', 'Average Value'].describe()

: 

In [None]:
# Keep every row whose unit is not `DeviceType2`
df = df.loc[df['Unit'].ne('DeviceType2')]

: 

#### Measurements with `celcius` unit

Drop all measurements with the `celcius` unit as this unit is used to measure temperature, not radiation. (The unit should be named `celsius`, though)

In [None]:
# Summary statistics of the values recorded with the `celcius` unit
df.loc[df['Unit'] == 'celcius', 'Average Value'].describe()

: 

In [None]:
# Keep every row whose unit is not `celcius`
df = df.loc[df['Unit'].ne('celcius')]

: 

#### Measurements with `status` unit

Let's see what we get for the `status` unit

In [None]:
# Summary statistics of the values recorded with the `status` unit
df.loc[df['Unit'] == 'status', 'Average Value'].describe()

: 

And see how it compares to `usv` and `cpm` units. If the range is not even close to either of them, we will drop the `status` unit as it cannot be easily converted to the correct unit used for radiation measurement.

In [None]:
# Summary statistics of the values recorded with the `usv` unit
df.loc[df['Unit'] == 'usv', 'Average Value'].describe()

: 

In [None]:
# Summary statistics of the values recorded with the `cpm` unit
df.loc[df['Unit'] == 'cpm', 'Average Value'].describe()

: 

Since the order of magnitude of the average values with the `status` unit differs greatly from both `usv` and `cpm`, we will drop all of these measurements as well.

In [None]:
# Keep every row whose unit is not `status`
df = df.loc[df['Unit'].ne('status')]

: 

#### Measurements with `PM10 ug/m3` unit

This unit is used to measure air pollution, not radiation, so we can safely drop all rows with this unit as well.

In [None]:
# Keep every row whose unit is not `PM10 ug/m3`
df = df.loc[df['Unit'].ne('PM10 ug/m3')]

: 

### Standardize the measurement unit

Show all units after dropping incorrect measurements

In [None]:
# Row count per unit that is still present after the filtering cells above
df['Unit'].value_counts()

: 

We have 2 units left. We will convert the `usv` values to `cpm`.

In [None]:
def convert_to_cpm(row, cpm_per_usv=200):
    """Return a row's 'Average Value' expressed in counts per minute (CPM).

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the
            'Unit' and 'Average Value' entries.
        cpm_per_usv: Number of CPM taken to equal 1 µSv/h. Defaults to 200,
            the assumption that used to be hard-coded here.
            NOTE(review): the right factor presumably depends on the
            detector used; confirm before relying on absolute values.

    Returns:
        The value in CPM for 'usv' and 'cpm' rows, or None for any other unit.
    """
    unit = row['Unit']
    if unit == 'usv':
        return row['Average Value'] * cpm_per_usv
    if unit == 'cpm':
        # already in the target unit
        return row['Average Value']
    # Any other unit is left unconverted and reported as None.
    return None

# Build a new frame instead of writing into `df` in place: at this point `df`
# is the result of several boolean filters, and in-place writes on such a
# slice can trigger pandas' SettingWithCopyWarning.
df = (
    df
    .assign(**{'Average Value': df.apply(convert_to_cpm, axis=1)})
    # all values are now in CPM, so the 'Unit' column carries no information
    .drop(columns='Unit')
)

df

: 

In [None]:
# Summary statistics of 'Average Value' after the unit conversion cell above
df['Average Value'].describe()

: 

### Drop outliers

: 

We can convert all `float64` types to `float32` for faster calculations

In [None]:
# Column dtypes before any conversion, to see which columns are float64
df.dtypes

: 

In [None]:
to_float32 = ['Latitude', 'Longitude', 'Height', 'Average Value']

# Device IDs in this dataset go up to at least 666666, which does not fit into
# uint8 (0..255): casting to uint8 would make distinct devices collide.
# uint32 holds every ID seen here and is still smaller than float64.
# astype() returns a new frame, so the later column assignment does not write
# into a filtered slice.
df = df.astype({'Device ID': 'uint32', **{column: 'float32' for column in to_float32}})

# NOTE(review): float32 keeps roughly 7 significant digits, so coordinates
# lose a little precision; confirm that is acceptable for grouping by location.
df['Measurement Day'] = pd.to_datetime(df['Measurement Day'])

df.dtypes

: 

## Data visualization

In [None]:
# Select the measurements of the most recent period: every row whose
# 'Measurement Day' lies between 2024-01-01 and 2024-04-01 (both inclusive).
in_period = df['Measurement Day'].between('2024-01-01', '2024-04-01')
df_2024 = df[in_period]

# Average per location. numeric_only=True restricts the mean to numeric
# columns; without it, pandas >= 2.0 raises a TypeError as soon as the frame
# contains a non-numeric column such as a text label.
df_2024 = (
    df_2024
    .groupby(['Latitude', 'Longitude'])
    .mean(numeric_only=True)
    .reset_index()
)

df_2024

: 

In [None]:
# Geographic scatter plot of the per-location averages, coloured by value.
scatter_options = {
    'lat': 'Latitude',
    'lon': 'Longitude',
    'color': 'Average Value',
    'title': 'Radiation levels',
    'color_continuous_scale': ['green', 'yellow', 'red', 'purple'],
}
fig = px.scatter_geo(df_2024, **scatter_options)
fig.update_layout(height=800)
fig.show()

: 

In [None]:
import requests
import numpy as np
from urllib.request import urlopen
from json.decoder import JSONDecodeError

def get_elevation(lat, long):
    """Look up the elevation of one coordinate via the Open-Elevation API.

    Args:
        lat: Latitude of the point.
        long: Longitude of the point.

    Returns:
        The 'elevation' value reported by the API for the point, or None when
        the request fails, the body is not valid JSON, or the response does
        not have the expected structure.
    """
    query = f'https://api.open-elevation.com/api/v1/lookup?locations={lat},{long}'

    try:
        # Bound the wait so one unresponsive request cannot hang the notebook,
        # and treat HTTP error statuses as failures.
        reply = requests.get(query, timeout=10)
        reply.raise_for_status()
    except requests.RequestException as error:
        print(f"Error: elevation request failed: {error}")
        return None

    try:
        payload = reply.json()
    except ValueError:
        # JSON decoding errors are ValueError subclasses.
        print("Error: Unable to decode JSON response. The API might be down or returning unexpected data.")
        return None

    try:
        # The API wraps its answers in a "results" list with one entry per
        # requested location; indexing the payload with [0] directly never
        # matched that structure, so every lookup came back as None.
        return payload['results'][0]['elevation']
    except (KeyError, IndexError, TypeError):
        return None

def fill_missing_height(df):
    """Fill unusable 'Height' values with elevations looked up by coordinate.

    A height counts as missing when it is NaN or infinite (the dataset stores
    unknown heights as -inf). Each affected row gets the value returned by
    get_elevation for its 'Latitude'/'Longitude'; rows whose lookup returns
    None are left untouched.

    Args:
        df: DataFrame with 'Latitude', 'Longitude' and 'Height' columns.
            It is modified in place.

    Returns:
        The same DataFrame object, so the result can be assigned or chained.
    """
    heights = df['Height']
    is_missing = heights.isna() | heights.isin([float('inf'), float('-inf')])

    # Many rows share a location; remember each answer so every distinct
    # coordinate is requested only once.
    elevation_by_location = {}
    for index, row in df[is_missing].iterrows():
        location = (row['Latitude'], row['Longitude'])
        if location not in elevation_by_location:
            elevation_by_location[location] = get_elevation(lat=location[0], long=location[1])
        height = elevation_by_location[location]
        if height is not None:
            df.at[index, 'Height'] = height

    return df


: 

In [None]:
# fill_missing_height writes the looked-up heights into `df` itself, so run it
# for that effect and bind `final_df` to the updated frame explicitly instead
# of relying on the function's return value.
fill_missing_height(df)
final_df = df

: 

In [None]:
# Plain-text dump of the frame after the height fill
print(final_df)

: 

In [None]:
# Group the rows by day and location.
# NOTE(review): no aggregation is applied here, so `final_df` is rebound to a
# GroupBy object rather than a DataFrame; the intended aggregation (mean?)
# is not visible in this notebook, confirm and add it.
final_df = final_df.groupby(['Measurement Day', 'Latitude', 'Longitude'])

: 