# Analysis of Safecast radiation data

In [1]:
import pandas as pd
import plotly.express as px

## Data loading

### Load the dataset

In [2]:
data = pd.read_csv('data/safecast.csv')

df = pd.DataFrame(data)
df

Unnamed: 0,value,unit,location_name,captured_at,device_id,height,devicetype_id,station_id,latitude,longitude
0,72.00,cpm,,2024-01-01T00:00:17.000Z,157.0,,,,22.284820,114.140000
1,11.15,cpm,"St Petersburg, FL, USA",2024-01-01T00:00:02.000Z,224.0,,,,27.826100,-82.628900
2,46.00,cpm,,2024-01-01T15:00:05.000Z,65001.0,,,,34.482140,136.162540
3,43.00,cpm,,2024-01-01T15:00:10.000Z,65001.0,,,,34.482150,136.162500
4,40.00,cpm,,2024-01-01T15:00:15.000Z,65001.0,,,,34.482180,136.162460
...,...,...,...,...,...,...,...,...,...,...
16315,61.00,cpm,,2024-01-01T15:00:21.000Z,,,,,59.437330,24.744827
16316,59.00,cpm,,2024-01-01T15:00:16.000Z,,,,,59.437375,24.744797
16317,56.00,cpm,,2024-01-01T15:00:11.000Z,,,,,59.437398,24.744798
16318,62.00,cpm,,2024-01-01T15:00:06.000Z,,,,,59.437410,24.744803


## Data preprocessing

Now we can drop all rows with data that has incorrect unit. Most of measurements we get use `cpm` unit, so we will remove measurements with different units.

In [3]:
df = df[df['unit'] == 'cpm']

In [4]:
df

Unnamed: 0,value,unit,location_name,captured_at,device_id,height,devicetype_id,station_id,latitude,longitude
0,72.00,cpm,,2024-01-01T00:00:17.000Z,157.0,,,,22.284820,114.140000
1,11.15,cpm,"St Petersburg, FL, USA",2024-01-01T00:00:02.000Z,224.0,,,,27.826100,-82.628900
2,46.00,cpm,,2024-01-01T15:00:05.000Z,65001.0,,,,34.482140,136.162540
3,43.00,cpm,,2024-01-01T15:00:10.000Z,65001.0,,,,34.482150,136.162500
4,40.00,cpm,,2024-01-01T15:00:15.000Z,65001.0,,,,34.482180,136.162460
...,...,...,...,...,...,...,...,...,...,...
16315,61.00,cpm,,2024-01-01T15:00:21.000Z,,,,,59.437330,24.744827
16316,59.00,cpm,,2024-01-01T15:00:16.000Z,,,,,59.437375,24.744797
16317,56.00,cpm,,2024-01-01T15:00:11.000Z,,,,,59.437398,24.744798
16318,62.00,cpm,,2024-01-01T15:00:06.000Z,,,,,59.437410,24.744803


We can see that location name is missing for some measurements. We will replace the `NaN` value with the `'Unknown location'` string.

In [5]:
df.loc[df['location_name'].isnull(), 'location_name'] = 'Unknown location'

We can convert all `float64` types to `float32` for faster calculations

In [6]:
df.dtypes

value            float64
unit              object
location_name     object
captured_at       object
device_id        float64
height           float64
devicetype_id     object
station_id       float64
latitude         float64
longitude        float64
dtype: object

In [7]:
to_convert = ['value', 'device_id', 'height', 'station_id', 'latitude', 'longitude']
for col in to_convert:
    df.loc[:, col] = pd.to_numeric(df[col], errors='coerce')
    
df.dtypes

value            float64
unit              object
location_name     object
captured_at       object
device_id        float64
height           float64
devicetype_id     object
station_id       float64
latitude         float64
longitude        float64
dtype: object

## Data visualization

In [8]:
fig = px.scatter_geo(
    df, 
    lat='latitude', 
    lon='longitude', 
    color='value',
    hover_name='location_name',
    title='Radiation levels',
    color_continuous_scale=['green', 'yellow', 'red', 'purple'],
)
fig.update_layout(height=800)
fig.show()