# Analysis of Safecast radiation data

In [117]:
import pandas as pd
import plotly.express as px
#from scripts.preprocessing.process_measurements import *

## Data loading

### Load the dataset

In [118]:
data = pd.read_csv('data/measurements.csv')

df = pd.DataFrame(data)
df

Unnamed: 0,Device ID,Measurement Day,Unit,Latitude,Longitude,Height,Average Value,Status
0,0.0,2013-10-10,cpm,35.638275,139.709950,15.070784,128.000000,Stationary
1,0.0,2013-10-11,cpm,35.638275,139.709950,15.070784,14.370000,Stationary
2,0.0,2013-10-12,cpm,35.638275,139.709950,15.070784,14.860000,Stationary
3,0.0,2013-10-13,cpm,35.638275,139.709950,15.070784,15.000000,Stationary
4,0.0,2013-10-14,cpm,35.638275,139.709950,15.070784,15.060000,Stationary
...,...,...,...,...,...,...,...,...
382938,666666.0,2016-07-16,cpm,34.482707,136.165813,519.890015,27.150000,Moving
382939,666666.0,2016-07-17,cpm,35.268946,136.249729,90.824211,25.260000,Moving
382940,666666.0,2016-07-18,cpm,35.371239,136.448062,181.197266,24.906667,Moving
382941,666666.0,2016-07-19,cpm,35.268985,136.249700,91.153488,31.025000,Moving


## Data preprocessing

### Drop incorrect measurements

In [119]:
df['Unit'].value_counts()

cpm             291618
status           51164
celcius          37653
usv               1553
PM10 ug/m3         735
microsievert       170
CPM                  8
HUMD%                8
1                    8
NOXppm               8
 cpm                 8
0                    5
pm2.5                3
Cpm                  1
RSSI                 1
Name: Unit, dtype: int64

#### Incorrect unit names

We see that most measurements have the unit "cpm". The notation does not have the same case, so we can substitute corresponding units that are stored in different formats (` cpm`, `CPM` and `Cpm` is the same as `cpm`).

Also replace `microsivert` with `usv` as these are the same units.

In [120]:
# substitute ` cpm`, `CPM` and `Cpm` units with `cpm`
df['Unit'] = df['Unit'].str.replace(' cpm', 'cpm')
df['Unit'] = df['Unit'].str.replace('CPM', 'cpm')
df['Unit'] = df['Unit'].str.replace('Cpm', 'cpm')

# substitute micorsivert with usv
df['Unit'] = df['Unit'].str.replace('microsievert', 'usv')

df['Unit'].value_counts()

cpm           291635
status         51164
celcius        37653
usv             1723
PM10 ug/m3       735
HUMD%              8
1                  8
NOXppm             8
0                  5
pm2.5              3
RSSI               1
Name: Unit, dtype: int64

In [121]:
# Drop all row that use units with counts less than 100
df = df[df['Unit'].map(df['Unit'].value_counts()) > 100]

df['Unit'].value_counts()

cpm           291635
status         51164
celcius        37653
usv             1723
PM10 ug/m3       735
Name: Unit, dtype: int64

#### Measurements with `DeviceType2` unit

Because measurements for the `DeviceType2` unit always have the same value and we don't know what the actual unit actually is, we can drop these measurements as well.

In [122]:
df[df['Unit'] == 'DeviceType2']['Average Value'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: Average Value, dtype: float64

In [123]:
df = df[df['Unit'] != 'DeviceType2']

#### Measurements with `celcius` unit

Drop all measurements with the `celcius` unit as this unit is used to measure temperature, not radiation. (The unit should be named `celsius`, though)

In [124]:
df[df['Unit'] == 'celcius']['Average Value'].describe()

count    37653.000000
mean        20.996135
std         28.537822
min       -503.320000
25%         15.850000
50%         23.055000
75%         29.180000
max       1103.470000
Name: Average Value, dtype: float64

In [125]:
df = df[df['Unit'] != 'celcius']

#### Measurements with `status` unit

Let's see what we get for the `status` unit

In [126]:
df[df['Unit'] == 'status']['Average Value'].describe()

count    51164.000000
mean        11.716317
std        113.406377
min      -1000.000000
25%         20.520000
50%         25.570000
75%         29.110000
max       3149.770000
Name: Average Value, dtype: float64

And see how it compares to `usv` and `cpm` units. If the range is not even close to either of them, we will drop the `status` unit as it cannot be easily converted to the correct unit used for radiation measurement.

In [127]:
df[df['Unit'] == 'usv']['Average Value'].describe()

count    1723.000000
mean        1.613322
std         7.891189
min         0.000000
25%         0.090000
50%         0.100000
75%         0.136667
max       115.000000
Name: Average Value, dtype: float64

In [128]:
df[df['Unit'] == 'cpm']['Average Value'].describe()

count    2.916350e+05
mean     3.106880e+11
std      1.678022e+14
min     -8.777799e+12
25%      1.540000e+01
50%      3.000000e+01
75%      4.117500e+01
max      9.061864e+16
Name: Average Value, dtype: float64

Since the order of magnitude of average values with the `status` differs much from both `usv` and `cpm`, we will drop all of these measurements as well.

In [129]:
df = df[df['Unit'] != 'status']

#### Measurements with `PM10 ug/m3` unit

This unit is used to measure air pollution, not radiation, so we can safely drop all rows with this unit as well.

In [130]:
df = df[df['Unit'] != 'PM10 ug/m3']

### Standarize the measurement unit

Show all units after dropping incorrect measurements

In [131]:
df['Unit'].value_counts()

cpm    291635
usv      1723
Name: Unit, dtype: int64

We have 2 units left. We will convert the `cpm` unit to `usv`

In [132]:
def convert_to_cpm(row):
    if row['Unit'] == 'usv':
        # Assuming 1 µSv/h = 200 CPM
        return row['Average Value'] * 200
    elif row['Unit'] == 'cpm':
        return row['Average Value']

df['Average Value'] = df.apply(convert_to_cpm, axis=1)

# Since all data is now in CPM, we can drop the 'Unit' column
df.drop('Unit', axis=1, inplace=True)

df

Unnamed: 0,Device ID,Measurement Day,Latitude,Longitude,Height,Average Value,Status
0,0.0,2013-10-10,35.638275,139.709950,15.070784,128.000000,Stationary
1,0.0,2013-10-11,35.638275,139.709950,15.070784,14.370000,Stationary
2,0.0,2013-10-12,35.638275,139.709950,15.070784,14.860000,Stationary
3,0.0,2013-10-13,35.638275,139.709950,15.070784,15.000000,Stationary
4,0.0,2013-10-14,35.638275,139.709950,15.070784,15.060000,Stationary
...,...,...,...,...,...,...,...
382938,666666.0,2016-07-16,34.482707,136.165813,519.890015,27.150000,Moving
382939,666666.0,2016-07-17,35.268946,136.249729,90.824211,25.260000,Moving
382940,666666.0,2016-07-18,35.371239,136.448062,181.197266,24.906667,Moving
382941,666666.0,2016-07-19,35.268985,136.249700,91.153488,31.025000,Moving


In [133]:
df['Average Value'].describe()

count    2.933580e+05
mean     3.088632e+11
std      1.673087e+14
min     -8.777799e+12
25%      1.543000e+01
50%      2.993000e+01
75%      4.116000e+01
max      9.061864e+16
Name: Average Value, dtype: float64

### Drop outliers

We can convert all `float64` types to `float32` for faster calculations

In [134]:
df.dtypes

Device ID          float64
Measurement Day     object
Latitude           float64
Longitude          float64
Height             float64
Average Value      float64
Status              object
dtype: object

In [135]:
df['Device ID'] = df['Device ID'].astype('uint8')

to_float32 = ['Latitude', 'Longitude', 'Height', 'Average Value']
df[to_float32] = df[to_float32].astype('float32')

df['Measurement Day'] = pd.to_datetime(df['Measurement Day'])

df.dtypes

Device ID                   uint8
Measurement Day    datetime64[ns]
Latitude                  float32
Longitude                 float32
Height                    float32
Average Value             float32
Status                     object
dtype: object

## Data visualization

In [136]:
# Get average sensor measurements in the last year (Measurement day column is the day of the measurement, use all dates between 2024-01-01 and 2024-04-01)
df_2024 = df[(df['Measurement Day'] >= '2024-01-01') & (df['Measurement Day'] <= '2024-04-01')]
# Now average for the same location
df_2024 = df_2024.groupby(['Latitude', 'Longitude']).mean().reset_index()

df_2024

Unnamed: 0,Latitude,Longitude,Device ID,Height,Average Value
0,-34.955399,138.625793,249.0,75.336220,29.000000
1,0.000000,0.000000,8.0,0.000000,4.100000
2,22.284821,114.139999,157.0,41.319691,73.803856
3,27.826099,-82.628899,224.0,1.738461,12.629286
4,31.833193,130.301926,125.5,13.000000,20.720636
...,...,...,...,...,...
153,51.961971,5.858359,237.0,10.751446,31.920000
154,51.961971,5.858360,237.0,10.751446,31.792000
155,51.980701,9.234500,108.0,108.135483,15.754745
156,52.427601,4.971100,205.0,-4.432384,21.521219


In [137]:
fig = px.scatter_geo(
    df_2024, 
    lat='Latitude', 
    lon='Longitude', 
    color='Average Value',
    title='Radiation levels',
    color_continuous_scale=['green', 'yellow', 'red', 'purple'],
)
fig.update_layout(height=800)
fig.show()

In [146]:
import pandas as pd
import plotly.graph_objects as go
import random

df['Measurement Day'] = pd.to_datetime(df['Measurement Day'], unit='s')

grouped = df.groupby(['Device ID', 'Measurement Day'])['Average Value'].mean().reset_index()
random_device_id = random.sample(list(grouped['Device ID'].unique()), 5)

fig = go.Figure()

for device_id in random_device_id:
    device_data = grouped[grouped['Device ID'] == device_id]
    device_id_str = str(device_id)
    fig.add_trace(go.Scatter(x=device_data['Measurement Day'], y=device_data['Average Value'], mode='lines', name=device_id_str))

fig.update_layout(
    title='Average Value on Measurement Day per Device ID',
    xaxis=dict(title='Measurement Day'),
    yaxis=dict(title='Average Value', type='log', range=[0, None]),
    hovermode='x',
    showlegend=True,
    margin=dict(l=0, r=0, t=30, b=0),
    height=600 
)

fig.show()
