In [1]:
import pandas as pd
import geopandas as gpd

# Load Dataset

In [2]:
# Read the NO2 observations; both timestamp columns are parsed as datetimes.
sat_data = pd.read_csv(
    "no2_dataset.csv",
    parse_dates=["SatelliteDatetime", "SurfaceDatetime"],
)

# Attach one point geometry per row, built from the surface coordinates
# (x = SurfaceLongitude, y = SurfaceLatitude).
# NOTE(review): no CRS is passed here; if these are WGS84 degrees, consider
# setting crs="EPSG:4326" -- confirm against the dataset description.
sat_data = gpd.GeoDataFrame(
    sat_data,
    geometry=gpd.points_from_xy(sat_data["SurfaceLongitude"], sat_data["SurfaceLatitude"]),
)

In [3]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
sat_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1622453 entries, 0 to 1622452
Data columns (total 23 columns):
 #   Column                              Non-Null Count    Dtype                               
---  ------                              --------------    -----                               
 0   AirQualityStation                   1622453 non-null  object                              
 1   SatelliteDatetime                   1622453 non-null  datetime64[ns]                      
 2   TroposphericNO2ColumnNumberDensity  1622453 non-null  float64                             
 3   SatelliteLongitude                  1622453 non-null  float64                             
 4   SatelliteLatitude                   1622453 non-null  float64                             
 5   Source                              1622453 non-null  object                              
 6   geometry                            1622453 non-null  geometry                            
 7   SurfaceCon

# Covered Countries
* The `Countrycode` column holds each country as a two-letter ISO 3166-1 alpha-2 code
* See e.g. https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes

In [4]:
# Distinct country codes present in the dataset, in order of first appearance.
sat_data["Countrycode"].unique()

array(['DE', 'CZ', 'AT', 'SK', 'TR', 'IT', 'IE', 'FR', 'BE', 'HR', 'AL',
       'BG', 'EE', 'DK', 'CY', 'CH', 'FI', 'RO', 'SE', 'RS', 'SI', 'GR',
       'HU', 'LT', 'NL', 'MT', 'MK', 'LU', 'ME', 'LV', 'PL', 'GB', 'NO',
       'ES', 'PT'], dtype=object)

# Aggregate NO$_2$ Levels
* Compare surface and tropospheric NO$_2$ levels by:
    * Season (winter (Dec., Jan., Feb.) vs. summer (Jun., Jul., Aug.))
    * Area (urban vs. rural)
    * Population density (above or below 500 people/km$^2$)
    * Lockdown response measures (StayHomeOrder vs. None)
    * Dominant pollution source (traffic vs. background)
    * Temperature (above or below 20°C)
    * Precipitation (above or below 10mm)

In [5]:
# Month numbers that define each season in the comparison.
winter = [12, 1, 2]  # Dec, Jan, Feb
summer = [6, 7, 8]  # Jun, Jul, Aug

# Split points for the high/low subsets; a value equal to the threshold
# falls into the "high" group (the selections use >= and <).
population_density_threshold = 500  # people per km^2
temperature_threshold = 20  # degrees Celsius
precipitation_threshold = 10  # mm

In [6]:
# Baseline: every observation (this is the same object as sat_data, not a copy).
obs_all = sat_data

# Season, selected by the month number of the observation.
obs_winter = sat_data.loc[sat_data["Month"].isin(winter)]
obs_summer = sat_data.loc[sat_data["Month"].isin(summer)]

# Area type: exact matches only, so labels such as "suburban" or
# "rural-remote" belong to neither subset.
obs_urban = sat_data.loc[sat_data["AreaType"].eq("urban")]
obs_rural = sat_data.loc[sat_data["AreaType"].eq("rural")]

# Population density: the threshold value itself counts as "high".
obs_pop_d_high = sat_data.loc[sat_data["PopulationDensity"].ge(population_density_threshold)]
obs_pop_d_low = sat_data.loc[sat_data["PopulationDensity"].lt(population_density_threshold)]

# Lockdown response: full stay-home order vs. the literal label "None".
obs_lockdown_yes = sat_data.loc[sat_data["Lockdown"].eq("StayHomeOrder")]
obs_lockdown_no = sat_data.loc[sat_data["Lockdown"].eq("None")]

# Station type: traffic vs. background (other types are left out).
obs_traffic = sat_data.loc[sat_data["StationType"].eq("traffic")]
obs_background = sat_data.loc[sat_data["StationType"].eq("background")]

# Temperature: the threshold value itself counts as "high".
obs_temp_high = sat_data.loc[sat_data["Temperature"].ge(temperature_threshold)]
obs_temp_low = sat_data.loc[sat_data["Temperature"].lt(temperature_threshold)]

# Precipitation: the threshold value itself counts as "high".
obs_prec_high = sat_data.loc[sat_data["Precipitation"].ge(precipitation_threshold)]
obs_prec_low = sat_data.loc[sat_data["Precipitation"].lt(precipitation_threshold)]

In [7]:
# Row label -> observation subset, in the order the rows appear in the
# summary table.
# NOTE(review): obs_lockdown_no is defined with the other subsets but is not
# listed here, so the table has no "no lockdown" row -- confirm this is intended.
subsets = {
    "all": obs_all,
    "winter": obs_winter,
    "summer": obs_summer,
    "urban": obs_urban,
    "rural": obs_rural,
    "high population density": obs_pop_d_high,
    "low population density": obs_pop_d_low,
    "lockdown": obs_lockdown_yes,
    "traffic": obs_traffic,
    "background": obs_background,
    "high temperature": obs_temp_high,
    "low temperature": obs_temp_low,
    "high precipitation": obs_prec_high,
    "low precipitation": obs_prec_low,
}
# Two-level header of the summary table: (group, statistic) per column.
# The order must match the order in which each row's values are collected.
columns = [
    ("", "Observations"),
    ("Troposphere", "Mean"),
    ("Troposphere", "Std"),
    ("Surface", "Mean"),
    ("Surface", "Std"),
    ("", "Count"),
]

In [8]:
# Build one summary row per subset:
# [label, troposphere mean, troposphere std, surface mean, surface std, row count]
summary_rows = []
for name, observations in subsets.items():
    row = [name]
    for measurement_type in ("TroposphericNO2ColumnNumberDensity", "SurfaceConcentration"):
        values = observations[measurement_type]
        row += [values.mean(), values.std()]
    # The count is the number of rows in the subset (missing values included),
    # so it is the same for both measurements and is appended once.
    row.append(len(observations))
    summary_rows.append(row)

statistics = pd.DataFrame(data=summary_rows, columns=pd.MultiIndex.from_tuples(columns))

In [9]:
# Display the per-subset summary table.
statistics

Unnamed: 0_level_0,Unnamed: 1_level_0,Troposphere,Troposphere,Surface,Surface,Unnamed: 6_level_0
Unnamed: 0_level_1,Observations,Mean,Std,Mean,Std,Count
0,all,4.5e-05,4e-05,13.747349,14.204032,1622453
1,winter,6.4e-05,6.1e-05,21.486494,18.375081,249058
2,summer,3.3e-05,2.2e-05,10.855184,12.183615,525733
3,urban,4.8e-05,4.3e-05,17.410467,15.712448,934757
4,rural,3.8e-05,3.3e-05,6.346847,7.566445,214654
5,high population density,6.6e-05,5.5e-05,19.313643,17.325999,452283
6,low population density,3.7e-05,3e-05,11.595917,12.12865,1170170
7,lockdown,4e-05,2.9e-05,9.061628,8.720799,87508
8,traffic,4.8e-05,4.4e-05,23.721059,17.788665,455139
9,background,4.4e-05,3.9e-05,9.807432,10.146342,966830


# More Details

In [10]:
# Mean, standard deviation and non-null count of both NO2 measurements
# for every area type present in the data.
(
    sat_data[["AreaType", "TroposphericNO2ColumnNumberDensity", "SurfaceConcentration"]]
    .groupby("AreaType")
    .agg(["mean", "std", "count"])
)

Unnamed: 0_level_0,TroposphericNO2ColumnNumberDensity,TroposphericNO2ColumnNumberDensity,TroposphericNO2ColumnNumberDensity,SurfaceConcentration,SurfaceConcentration,SurfaceConcentration
Unnamed: 0_level_1,mean,std,count,mean,std,count
AreaType,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
rural,3.8e-05,3.3e-05,214654,6.346847,7.566445,214654
rural-nearcity,4.4e-05,3.5e-05,25287,7.620031,8.114452,25287
rural-regional,3.3e-05,2.1e-05,46025,4.900556,5.708919,46025
rural-remote,2.6e-05,1.7e-05,8986,2.564842,2.647838,8986
suburban,4.4e-05,3.8e-05,392744,10.760727,11.005446,392744
urban,4.8e-05,4.3e-05,934757,17.410467,15.712448,934757


In [11]:
# Mean, standard deviation and non-null count of both NO2 measurements
# for every station type present in the data.
(
    sat_data[["StationType", "TroposphericNO2ColumnNumberDensity", "SurfaceConcentration"]]
    .groupby("StationType")
    .agg(["mean", "std", "count"])
)

Unnamed: 0_level_0,TroposphericNO2ColumnNumberDensity,TroposphericNO2ColumnNumberDensity,TroposphericNO2ColumnNumberDensity,SurfaceConcentration,SurfaceConcentration,SurfaceConcentration
Unnamed: 0_level_1,mean,std,count,mean,std,count
StationType,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
background,4.4e-05,3.9e-05,966830,9.807432,10.146342,966830
industrial,4.2e-05,3.7e-05,200484,10.105187,10.195276,200484
traffic,4.8e-05,4.4e-05,455139,23.721059,17.788665,455139


In [12]:
# Mean, standard deviation and non-null count of both NO2 measurements
# for every lockdown response label present in the data.
(
    sat_data[["Lockdown", "TroposphericNO2ColumnNumberDensity", "SurfaceConcentration"]]
    .groupby("Lockdown")
    .agg(["mean", "std", "count"])
)

Unnamed: 0_level_0,TroposphericNO2ColumnNumberDensity,TroposphericNO2ColumnNumberDensity,TroposphericNO2ColumnNumberDensity,SurfaceConcentration,SurfaceConcentration,SurfaceConcentration
Unnamed: 0_level_1,mean,std,count,mean,std,count
Lockdown,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
,4.5e-05,4.1e-05,1412130,14.211906,14.5744,1412130
RegionalStayHomeOrder,3.1e-05,3.1e-05,1073,15.599942,14.246522,1073
RegionalStayHomeOrderPartial,6.7e-05,6.4e-05,15580,18.462677,16.444445,15580
StayHomeGen,4.8e-05,3.6e-05,27557,11.051504,10.75384,27557
StayHomeGenPartial,4.4e-05,3.5e-05,41076,10.647185,11.245395,41076
StayHomeOrder,4e-05,2.9e-05,87508,9.061628,8.720799,87508
StayHomeOrderPartial,3.3e-05,2.2e-05,20203,9.256276,10.077292,20203
StayHomeRiskG,2.5e-05,2.2e-05,15198,12.09779,11.240592,15198
StayHomeRiskGPartial,3.9e-05,3e-05,2128,11.870054,12.197437,2128
