In [4]:
import pandas as pd

# One CSV per measurement station; the order matches STATION_NAMES below.
FILE_PATHS = [
    "stations_2012_2021_NH3_NO2_PM10_PM25_formatted/NL10131_2012_2021.csv",
    "stations_2012_2021_NH3_NO2_PM10_PM25_formatted/NL10444_2012_2021.csv",
    "stations_2012_2021_NH3_NO2_PM10_PM25_formatted/NL10538_2012_2021.csv",
    "stations_2012_2021_NH3_NO2_PM10_PM25_formatted/NL10633_2012_2021.csv",
    "stations_2012_2021_NH3_NO2_PM10_PM25_formatted/NL10738_2012_2021.csv",
    "stations_2012_2021_NH3_NO2_PM10_PM25_formatted/NL10929_2012_2021.csv",
]

# Station identifiers, positionally aligned with FILE_PATHS.
STATION_NAMES = [
    "NL10131",
    "NL10444",
    "NL10538",
    "NL10633",
    "NL10738",
    "NL10929",
]

# Load every station file into its own DataFrame (same order as FILE_PATHS).
data = [pd.read_csv(csv_path) for csv_path in FILE_PATHS]

for station_frame in data:
    # Parse the "Begindatumtijd" strings once and derive the calendar
    # features from the parsed timestamps.
    timestamps = pd.to_datetime(station_frame["Begindatumtijd"])
    station_frame["date_time"] = timestamps
    station_frame["day_of_week"] = timestamps.dt.day_of_week  # pandas: Monday=0 ... Sunday=6
    station_frame["day_of_year"] = timestamps.dt.day_of_year
    station_frame["year"] = timestamps.dt.year
    station_frame["hour"] = timestamps.dt.hour

    # Fill missing NH3 readings by linear interpolation; limit_direction='both'
    # lets the fill also reach gaps at the start and end of the series.
    # Only the NH3 column is filled here.
    station_frame["NH3"] = station_frame["NH3"].interpolate(
        method="linear", limit_direction="both"
    )

# Display the first station's frame as the cell output.
data[0]

Unnamed: 0,Begindatumtijd,NH3,NO2,PM10,Lat,Lon,Station,date_time,day_of_week,day_of_year,year,hour
0,2012/01/01 00:00,5.160,8.33,26.810,51.54052,5.85307,NL10131,2012-01-01 00:00:00,6,1,2012,0
1,2012/01/01 01:00,5.160,7.89,23.558,51.54052,5.85307,NL10131,2012-01-01 01:00:00,6,1,2012,1
2,2012/01/01 02:00,5.160,6.03,18.314,51.54052,5.85307,NL10131,2012-01-01 02:00:00,6,1,2012,2
3,2012/01/01 03:00,5.160,4.81,7.070,51.54052,5.85307,NL10131,2012-01-01 03:00:00,6,1,2012,3
4,2012/01/01 04:00,5.160,4.53,10.070,51.54052,5.85307,NL10131,2012-01-01 04:00:00,6,1,2012,4
...,...,...,...,...,...,...,...,...,...,...,...,...
87667,2021/12/31 19:00,4.123,7.62,13.390,51.54052,5.85307,NL10131,2021-12-31 19:00:00,4,365,2021,19
87668,2021/12/31 20:00,3.795,7.46,13.390,51.54052,5.85307,NL10131,2021-12-31 20:00:00,4,365,2021,20
87669,2021/12/31 21:00,4.022,7.07,18.510,51.54052,5.85307,NL10131,2021-12-31 21:00:00,4,365,2021,21
87670,2021/12/31 22:00,5.088,7.18,13.390,51.54052,5.85307,NL10131,2021-12-31 22:00:00,4,365,2021,22


In [6]:
# Day-of-week numbers that make up the two samples to compare. They are
# matched against the "day_of_week" column and used as indices into WEEKDAYS,
# so 0 is Monday and 6 is Sunday.
# NOTE(review): with these values group 1 is Tuesday-Saturday and group 2 is
# Sunday + Monday -- confirm this split is intended (a weekday/weekend split
# would be [0, 1, 2, 3, 4] vs [5, 6]).
GROUP1_DAYS = [1, 2, 3, 4, 5]
GROUP2_DAYS = [6, 0]

# Display names, indexed by day-of-week number.
WEEKDAYS = [
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday",
]

# For every station, keep only the rows whose day of week belongs to the group.
# group1[n] / group2[n] correspond to data[n].
group1 = [frame[frame["day_of_week"].isin(GROUP1_DAYS)] for frame in data]
group2 = [frame[frame["day_of_week"].isin(GROUP2_DAYS)] for frame in data]

In [None]:
import scipy.stats as stats

# Significance level, used both in the explanatory text and in the
# "difference" column so the two can never disagree.
ALPHA = 0.05

print("---------- Mann Whitney U-test ----------")

# Names of the days in each group; each name is followed by a space so the
# printed line matches the original layout.
group1_names = "".join(WEEKDAYS[day] + " " for day in GROUP1_DAYS)
group2_names = "".join(WEEKDAYS[day] + " " for day in GROUP2_DAYS)
print(group1_names, "//", group2_names, "\n")

# The Mann-Whitney U test is rank-based and makes no normality assumption:
# a small p-value is evidence that the two samples come from different
# distributions.
print(f"(p-values below {ALPHA} indicate a significant difference between \nthe NH3 distributions of the two groups)", "\n")

# One test per station, comparing NH3 in group 1 against NH3 in group 2.
# NOTE(review): no multiple-comparison correction is applied across the
# stations; each p-value is compared to ALPHA on its own.
print("Station", "p-value","\t\t", "difference")
for station, sample1, sample2 in zip(STATION_NAMES, group1, group2):
    res = stats.mannwhitneyu(sample1['NH3'], sample2['NH3'])
    print(station, res.pvalue, "\t", bool(res.pvalue < ALPHA))