# Analysis of Safecast radiation data

In [5]:
import pandas as pd
import plotly.express as px

## Data loading

### Load the dataset

In [6]:
data = pd.read_csv('data/measurements_daily.csv')

df = pd.DataFrame(data)
df

Unnamed: 0,Device ID,Unit,Latitude,Longitude,Height,Measurement Day,Average Value
0,26.0,usv,-48.980217,69.433594,,1969-12-31,1.73
1,26.0,usv,18.975000,72.825800,,1969-12-31,0.59
2,26.0,usv,19.993998,73.861084,,1969-12-31,0.00
3,26.0,usv,19.616718,74.234619,,1969-12-31,0.04
4,26.0,usv,20.004322,73.921509,,1969-12-31,0.16
...,...,...,...,...,...,...,...
568706,100309.0,status,37.217980,141.000970,9.0,2024-04-13,29.91
568707,100321.0,cpm,37.752099,140.470826,60.0,2024-04-13,39.21
568708,100322.0,cpm,37.752099,140.470826,60.0,2024-04-13,0.05
568709,100139.0,status,37.495123,140.995519,94.0,2024-04-13,23.45


## Data preprocessing

### Drop incorrect measurements

In [7]:
df['Unit'].value_counts()

Unit
cpm             424447
celcius          86095
status           51182
usv               5337
PM10 ug/m3         735
DeviceType2        618
microsievert       187
DeviceType1         34
HUMD%               20
1                   15
 cpm                10
CPM                  8
NOXppm               8
0                    5
Cpm                  5
pm2.5                3
RSSI                 1
PM1                  1
Name: count, dtype: int64

We see that most measurements have the unit "cpm". The notation does not have the same case, so we can substitute corresponding units that are stored in different formats (` cpm`, `CPM` and `Cpm` is the same as `cpm`).

Also replace `microsivert` with `usv` as these are the same units.

In [12]:
# substitute ` cpm`, `CPM` and `Cpm` units with `cpm`
df['Unit'] = df['Unit'].str.replace(' cpm', 'cpm')
df['Unit'] = df['Unit'].str.replace('CPM', 'cpm')
df['Unit'] = df['Unit'].str.replace('Cpm', 'cpm')

# substitute micorsivert with usv
df['Unit'] = df['Unit'].str.replace('microsievert', 'usv')

df['Unit'].value_counts()

Unit
cpm            424470
celcius         86095
status          51182
usv              5524
PM10 ug/m3        735
DeviceType2       618
DeviceType1        34
HUMD%              20
1                  15
NOXppm              8
0                   5
pm2.5               3
RSSI                1
PM1                 1
Name: count, dtype: int64

Now we will drop units which cannot be converted to radiation units or there are too few measurements for converting to make sense.

In [13]:
# Drop all row that use units with counts less than 100
df = df[df['Unit'].map(df['Unit'].value_counts()) > 100]

df['Unit'].value_counts()

Unit
cpm            424470
celcius         86095
status          51182
usv              5524
PM10 ug/m3        735
DeviceType2       618
Name: count, dtype: int64

Because measurements for the `DeviceType2` unit always have the same value and we don't know what the actual unit actually is, we can drop these measurements as well.

In [19]:
df[df['Unit'] == 'DeviceType2']['Average Value'].describe()

count    618.0
mean     130.0
std        0.0
min      130.0
25%      130.0
50%      130.0
75%      130.0
max      130.0
Name: Average Value, dtype: float64

In [21]:
df = df[df['Unit'] != 'DeviceType2']

Drop all measurements with the `Celsius` unit as well as all of them are incorrect

In [26]:
df[df['Unit'] == 'celsius']['Average Value'].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: Average Value, dtype: float64

In [27]:
df = df[df['Unit'] != 'celsius']

Let's see what we get for the `status` unit

In [28]:
df[df['Unit'] == 'status']['Average Value'].describe()

count    51182.000000
mean        11.663770
std        113.631285
min      -1000.000000
25%         20.520000
50%         25.570000
75%         29.110000
max       3149.770000
Name: Average Value, dtype: float64

### Standarize the measurement unit

We can see that location name is missing for some measurements. We will replace the `NaN` value with the `'Unknown location'` string.

In [9]:
df.loc[df['location_name'].isnull(), 'location_name'] = 'Unknown location'

KeyError: 'location_name'

We can convert all `float64` types to `float32` for faster calculations

In [None]:
df.dtypes

value            float64
unit              object
location_name     object
captured_at       object
device_id        float64
height           float64
devicetype_id     object
station_id       float64
latitude         float64
longitude        float64
dtype: object

In [7]:
to_convert = ['value', 'device_id', 'height', 'station_id', 'latitude', 'longitude']
for col in to_convert:
    df.loc[:, col] = pd.to_numeric(df[col], errors='coerce')
    
df.dtypes

value            float64
unit              object
location_name     object
captured_at       object
device_id        float64
height           float64
devicetype_id     object
station_id       float64
latitude         float64
longitude        float64
dtype: object

## Data visualization

In [8]:
fig = px.scatter_geo(
    df, 
    lat='latitude', 
    lon='longitude', 
    color='value',
    hover_name='location_name',
    title='Radiation levels',
    color_continuous_scale=['green', 'yellow', 'red', 'purple'],
)
fig.update_layout(height=800)
fig.show()