In [5]:
import pandas as pd
import numpy as np
import json
import os
import time

## Read in Data

In [2]:
data_dir = "C:/Users/mhoyl/STAT480/FinalProject/sofia_sensors"

In [3]:
analysis_files = [file for file in os.listdir(data_dir) if file.endswith("_sds011sof.csv")][12:]
analysis_files

['2018-07_sds011sof.csv',
 '2018-08_sds011sof.csv',
 '2018-09_sds011sof.csv',
 '2018-10_sds011sof.csv',
 '2018-11_sds011sof.csv',
 '2018-12_sds011sof.csv',
 '2019-01_sds011sof.csv',
 '2019-02_sds011sof.csv',
 '2019-03_sds011sof.csv',
 '2019-04_sds011sof.csv',
 '2019-05_sds011sof.csv',
 '2019-06_sds011sof.csv']

In [4]:
start_time = time.time()

sofia_dfs = []
for file in analysis_files:
    sofia_dfs.append(pd.read_csv(f"{data_dir}/{file}").drop("Unnamed: 0", axis = 1))
    
print(round((time.time() - start_time)/60, 3), " min")

0.788  min


In [5]:
sofia_dataset = pd.concat(sofia_dfs, axis = 0)

In [6]:
del sofia_dfs

In [7]:
sofia_dataset.head(10)

Unnamed: 0,sensor_id,location,lat,lon,timestamp,P1,P2
0,1020,499,42.647,23.27,2018-07-01T00:00:00,13.47,12.47
1,11560,5836,42.677,23.264,2018-07-01T00:00:00,7.4,6.7
2,11098,5603,42.65,23.364,2018-07-01T00:00:00,7.87,7.17
3,1953,977,42.704,23.354,2018-07-01T00:00:01,7.73,7.1
4,3511,1770,42.683,23.335,2018-07-01T00:00:02,10.53,9.63
5,6344,3205,42.667,23.264,2018-07-01T00:00:02,11.97,8.27
6,9675,4876,42.656,23.292,2018-07-01T00:00:02,7.9,7.2
7,6043,3050,42.661,23.277,2018-07-01T00:00:02,8.97,8.27
8,11675,5894,42.769,23.405,2018-07-01T00:00:03,11.47,10.27
9,5295,2670,42.721,23.336,2018-07-01T00:00:03,11.7,9.13


In [8]:
sofia_dataset

Unnamed: 0,sensor_id,location,lat,lon,timestamp,P1,P2
0,1020,499,42.647,23.270,2018-07-01T00:00:00,13.47,12.47
1,11560,5836,42.677,23.264,2018-07-01T00:00:00,7.40,6.70
2,11098,5603,42.650,23.364,2018-07-01T00:00:00,7.87,7.17
3,1953,977,42.704,23.354,2018-07-01T00:00:01,7.73,7.10
4,3511,1770,42.683,23.335,2018-07-01T00:00:02,10.53,9.63
...,...,...,...,...,...,...,...
5904994,22646,11490,42.726,23.268,2019-06-30T23:59:58,7.92,4.65
5904995,6731,3403,42.674,23.256,2019-06-30T23:59:58,8.63,4.67
5904996,23163,11750,42.692,23.362,2019-06-30T23:59:59,11.40,5.13
5904997,2039,1025,42.686,23.350,2019-06-30T23:59:59,5.37,4.87


## Sensor measurement about every 2.5 minutes

In [9]:
sub1020 = sofia_dataset[sofia_dataset['sensor_id'] == 11098]

In [10]:
(pd.DatetimeIndex(sub1020['timestamp'][331:332]) - pd.DatetimeIndex(sub1020['timestamp'][330:331]))

TimedeltaIndex(['0 days 00:02:28'], dtype='timedelta64[ns]', name='timestamp', freq=None)

## We're going to remove all missing values and values above 97.5th percentile in P2 (PM2.5):
### (Removing extreme high values, not low values because 0 is technically possible)

In [11]:
sofdata = sofia_dataset.copy()

In [12]:
# percent missing is quite small
np.mean(np.isnan(sofdata['P2']))

0.00023400362540282836

In [13]:
sofdata = sofdata[np.isnan(sofdata['P2']) == False]

In [14]:
p1_975 = np.quantile(sofdata['P2'], 0.975)
p1_975

80.67

In [15]:
sofdata = sofdata[sofdata['P2'] <= p1_975].reset_index(drop = True)
sofdata

Unnamed: 0,sensor_id,location,lat,lon,timestamp,P1,P2
0,1020,499,42.647,23.270,2018-07-01T00:00:00,13.47,12.47
1,11560,5836,42.677,23.264,2018-07-01T00:00:00,7.40,6.70
2,11098,5603,42.650,23.364,2018-07-01T00:00:00,7.87,7.17
3,1953,977,42.704,23.354,2018-07-01T00:00:01,7.73,7.10
4,3511,1770,42.683,23.335,2018-07-01T00:00:02,10.53,9.63
...,...,...,...,...,...,...,...
60430780,22646,11490,42.726,23.268,2019-06-30T23:59:58,7.92,4.65
60430781,6731,3403,42.674,23.256,2019-06-30T23:59:58,8.63,4.67
60430782,23163,11750,42.692,23.362,2019-06-30T23:59:59,11.40,5.13
60430783,2039,1025,42.686,23.350,2019-06-30T23:59:59,5.37,4.87


## Group by sensor and date

In [16]:
sofdata['date'] = pd.DatetimeIndex(sofdata['timestamp']).date

In [17]:
sofdata_grouped = sofdata.groupby(['sensor_id', 'date']).mean().reset_index()

In [18]:
sofdata_grouped

Unnamed: 0,sensor_id,date,location,lat,lon,P1,P2
0,739,2018-09-21,354.0,42.694,23.337,8.009815,4.999136
1,739,2018-09-22,354.0,42.694,23.337,15.673511,10.291997
2,739,2018-09-23,354.0,42.694,23.337,26.578480,18.875699
3,739,2018-09-24,354.0,42.694,23.337,18.994805,14.145674
4,739,2018-09-25,354.0,42.694,23.337,6.395000,5.132836
...,...,...,...,...,...,...,...
115458,28130,2019-06-27,15418.0,42.634,23.374,16.595138,9.271741
115459,28130,2019-06-28,15418.0,42.634,23.374,19.338834,11.369863
115460,28130,2019-06-29,15418.0,42.634,23.374,10.872718,5.135366
115461,28130,2019-06-30,15418.0,42.634,23.374,8.674697,4.358009


## Read in sensor-district dictionary

In [19]:
with open('SofiaSensorDistricts.json', 'r') as j:
     SensorDistrictDict = json.loads(j.read())

#### We create a district column and match each sensor to it's district

In [20]:
sofdata_grouped['district'] = sofdata_grouped['sensor_id'].astype(str)

In [21]:
sofdata_grouped

Unnamed: 0,sensor_id,date,location,lat,lon,P1,P2,district
0,739,2018-09-21,354.0,42.694,23.337,8.009815,4.999136,739
1,739,2018-09-22,354.0,42.694,23.337,15.673511,10.291997,739
2,739,2018-09-23,354.0,42.694,23.337,26.578480,18.875699,739
3,739,2018-09-24,354.0,42.694,23.337,18.994805,14.145674,739
4,739,2018-09-25,354.0,42.694,23.337,6.395000,5.132836,739
...,...,...,...,...,...,...,...,...
115458,28130,2019-06-27,15418.0,42.634,23.374,16.595138,9.271741,28130
115459,28130,2019-06-28,15418.0,42.634,23.374,19.338834,11.369863,28130
115460,28130,2019-06-29,15418.0,42.634,23.374,10.872718,5.135366,28130
115461,28130,2019-06-30,15418.0,42.634,23.374,8.674697,4.358009,28130


In [22]:
sofdata_grouped['district'] = sofdata_grouped['district'].replace(SensorDistrictDict)

#### Confirm all sensors were assigned districts

In [23]:
sofdata_grouped.groupby('district').size().sort_values()

district
Kremikovetsi        329
Ilinden            1040
Novi Iskar         1280
Sredets            2350
Vrabnits           2732
Vazrazhdane        2822
Iskar              2976
Nadezhda           3056
Izgrev             3329
Lyulin             3432
Studentski         3570
Serdika            3699
Oborishte          3919
Krasna Polyana     4356
Poduyane           4554
Pancharevo         5918
Krasno selo        6833
Lozenets           6990
Triaditsa          7331
Slatina            7997
Ovcha kupel        8197
Mladost           13087
Vitosha           15666
dtype: int64

In [24]:
sofdata_grouped = sofdata_grouped.rename(columns = {'P1' : 'PM10', 'P2' : 'PM2.5'})

In [25]:
sofdata_grouped

Unnamed: 0,sensor_id,date,location,lat,lon,PM10,PM2.5,district
0,739,2018-09-21,354.0,42.694,23.337,8.009815,4.999136,Sredets
1,739,2018-09-22,354.0,42.694,23.337,15.673511,10.291997,Sredets
2,739,2018-09-23,354.0,42.694,23.337,26.578480,18.875699,Sredets
3,739,2018-09-24,354.0,42.694,23.337,18.994805,14.145674,Sredets
4,739,2018-09-25,354.0,42.694,23.337,6.395000,5.132836,Sredets
...,...,...,...,...,...,...,...,...
115458,28130,2019-06-27,15418.0,42.634,23.374,16.595138,9.271741,Mladost
115459,28130,2019-06-28,15418.0,42.634,23.374,19.338834,11.369863,Mladost
115460,28130,2019-06-29,15418.0,42.634,23.374,10.872718,5.135366,Mladost
115461,28130,2019-06-30,15418.0,42.634,23.374,8.674697,4.358009,Mladost


### Create District Groups

In [44]:
sofdata_grouped = pd.read_csv("SofiaSensorDataCleaned.csv")

In [45]:
list(sofdata_grouped['district'].unique())

['Sredets',
 'Mladost',
 'Serdika',
 'Vazrazhdane',
 'Iskar',
 'Vitosha',
 'Lozenets',
 'Krasno selo',
 'Pancharevo',
 'Izgrev',
 'Studentski',
 'Poduyane',
 'Slatina',
 'Triaditsa',
 'Ilinden',
 'Oborishte',
 'Kremikovetsi',
 'Ovcha kupel',
 'Lyulin',
 'Krasna Polyana',
 'Vrabnits',
 'Nadezhda',
 'Novi Iskar']

In [35]:
dist_groups = {
    'Sredets' : 'City Center',
    'Vazrazhdane' : 'City Center',
    'Oborishte' : 'City Center',
    'Krasno selo' : 'Early to Mid 20th Century Districts',
    'Serdika' : 'Early to Mid 20th Century Districts',
    'Poduyane' : 'Early to Mid 20th Century Districts',
    'Slatina' : 'Early to Mid 20th Century Districts',
    'Izgrev' : 'Early to Mid 20th Century Districts',
    'Lozenets' : 'Early to Mid 20th Century Districts',
    'Triaditsa' : 'Early to Mid 20th Century Districts',
    'Krasna Polyana' : 'Early to Mid 20th Century Districts',
    'Ilinden' : 'Early to Mid 20th Century Districts',
    'Nadezhda' : 'Socialist Housing Districts',
    'Iskar' : 'Socialist Housing Districts',
    'Mladost' : 'Socialist Housing Districts',
    'Studentski' : 'Socialist Housing Districts',
    'Lyulin' : 'Socialist Housing Districts',
    'Kremikovetsi' : 'Industrial District',
    'Vitosha' : 'Peripheral Districts',
    'Ovcha kupel' : 'Peripheral Districts',
    'Vrabnits' : 'Peripheral Districts',
    'Novi Iskar' : 'Peripheral Districts',
    'Pancharevo' : 'Peripheral Districts',
    'Bankya' : 'Peripheral Districts'
}

In [47]:
# confirm only Bankya is missing
[distr for distr in list(dist_groups.keys()) if distr not in list(sofdata_grouped['district'].unique())]

['Bankya']

In [48]:
sofdata_grouped['district_group'] = sofdata_grouped['district']

In [49]:
sofdata_grouped['district_group'] = sofdata_grouped['district_group'].replace(dist_groups)

In [50]:
sofdata_grouped

Unnamed: 0,sensor_id,date,location,lat,lon,PM10,PM2.5,district,district_group
0,739,2018-09-21,354.0,42.694,23.337,8.009815,4.999136,Sredets,City Center
1,739,2018-09-22,354.0,42.694,23.337,15.673511,10.291997,Sredets,City Center
2,739,2018-09-23,354.0,42.694,23.337,26.578480,18.875699,Sredets,City Center
3,739,2018-09-24,354.0,42.694,23.337,18.994805,14.145674,Sredets,City Center
4,739,2018-09-25,354.0,42.694,23.337,6.395000,5.132836,Sredets,City Center
...,...,...,...,...,...,...,...,...,...
115458,28130,2019-06-27,15418.0,42.634,23.374,16.595138,9.271741,Mladost,Socialist Housing Districts
115459,28130,2019-06-28,15418.0,42.634,23.374,19.338834,11.369863,Mladost,Socialist Housing Districts
115460,28130,2019-06-29,15418.0,42.634,23.374,10.872718,5.135366,Mladost,Socialist Housing Districts
115461,28130,2019-06-30,15418.0,42.634,23.374,8.674697,4.358009,Mladost,Socialist Housing Districts


In [51]:
sofdata_grouped.to_csv("SofiaSensorDataCleaned.csv", index = False)