In [1]:
import pandas as pd

In [2]:
# Load the processed pollution/weather table and convert its 'date-time'
# column from text to datetime64 in a single chained expression.
data = pd.read_csv("../data/processed/pollution-weather.csv").assign(
    **{'date-time': lambda frame: pd.to_datetime(frame['date-time'])}
)

### Współczynnik korelacji
## $r_{XY} = \dfrac{\operatorname{cov}(X,Y)}{\sigma_X \sigma_Y}$

In [3]:
# Pearson correlation matrix of the measurement columns.
# `data` still carries the datetime64 'date-time' column; pandas >= 2.0 no
# longer drops non-numeric columns silently in corr(), so restrict the frame
# to numeric dtypes explicitly (works the same on older pandas versions).
data.select_dtypes(include='number').corr()

Unnamed: 0,pm2_5,wind-east,wind-north,temperature,pressure
pm2_5,1.0,0.216699,-0.122668,-0.330747,0.130883
wind-east,0.216699,1.0,-0.236909,0.018249,0.051069
wind-north,-0.122668,-0.236909,1.0,-0.050478,0.108393
temperature,-0.330747,0.018249,-0.050478,1.0,-0.035673
pressure,0.130883,0.051069,0.108393,-0.035673,1.0


In [4]:
# Aggregate to daily resolution: bin rows by calendar day of 'date-time' and
# take the mean of every remaining column within each bin.
# The `axis` keyword is omitted: 0 is the default and passing it to
# pd.Grouper is deprecated in recent pandas releases.
grouped = data.groupby(pd.Grouper(key='date-time', freq='D')).mean()
grouped

Unnamed: 0_level_0,pm2_5,wind-east,wind-north,temperature,pressure
date-time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-01,32.488333,0.082155,-1.431997,-0.125000,997.966667
2021-01-02,52.236250,1.364717,-0.704986,0.650000,1003.800000
2021-01-03,23.772083,3.315934,1.621786,1.487500,1003.950000
2021-01-04,34.702500,3.145808,1.146105,2.800000,1003.916667
2021-01-05,22.851667,3.702931,2.338863,3.087500,998.779167
...,...,...,...,...,...
2021-12-27,54.067500,3.680998,-0.518715,-9.870833,1004.283333
2021-12-28,28.182500,5.743422,-2.431569,-5.545833,998.512500
2021-12-29,40.446667,3.695867,-1.874503,-2.466667,995.537500
2021-12-30,42.038333,-0.206690,-1.164237,2.837500,999.454167


### Współczynniki korelacji - dane pogrupowane

In [5]:
# Pearson correlation matrix of the daily means ('date-time' is the index
# here, so every column takes part in the computation).
grouped.corr(method='pearson')

Unnamed: 0,pm2_5,wind-east,wind-north,temperature,pressure
pm2_5,1.0,0.256259,-0.166598,-0.357743,0.153946
wind-east,0.256259,1.0,-0.281239,0.043123,0.061173
wind-north,-0.166598,-0.281239,1.0,-0.041831,0.124447
temperature,-0.357743,0.043123,-0.041831,1.0,-0.019376
pressure,0.153946,0.061173,0.124447,-0.019376,1.0


### Współczynnik informacji wzajemnej
## $I(X;Y) = \sum_y \sum_x P_{(X,Y)} (x, y) \log \left( \dfrac{P_{(X,Y)} (x,y)}{P_X (x) P_Y (y)} \right)$
https://en.wikipedia.org/wiki/Mutual_information

In [6]:
from sklearn.feature_selection import mutual_info_regression

# The estimator does not accept missing values, so work on complete rows only.
no_null = data.dropna()

# Mutual information between each weather variable and PM2.5 (hourly data).
# The k-NN estimator adds a small random noise to continuous features, so
# `random_state` is fixed to make the result reproducible across runs.
mutual_info_regression(
    no_null[['wind-east', 'wind-north', 'temperature', 'pressure']],
    no_null['pm2_5'],
    n_neighbors=3,
    random_state=0,
)

array([0.0726591 , 0.05063169, 0.12908659, 0.1192571 ])

In [7]:
# Mutual information between each weather variable and PM2.5 (daily means).
# Daily bins without any observations would show up as NaN rows, which the
# estimator rejects, so they are dropped first (a no-op when every day has data).
grouped_no_null = grouped.dropna()

# `random_state` is fixed because the estimator perturbs continuous features
# with random noise; without it the numbers change on every run.
mutual_info_regression(
    grouped_no_null[['wind-east', 'wind-north', 'temperature', 'pressure']],
    grouped_no_null['pm2_5'],
    n_neighbors=3,
    random_state=0,
)

array([0.11989735, 0.0450831 , 0.09640267, 0.04470842])