In [312]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler

In [313]:
df = pd.read_csv('AirQuality.csv', sep=';')
#drop rows with all NaN
df = df.dropna(how='all')
df = df[["Time", "CO(GT)", "T", "RH", "AH"]]
df.rename(columns={"CO(GT)": "CO(mg/m^3)", "T": "Temperature(°C)", "RH": "Relative Humidity(%)", "AH": "Absolute Humidity(g/m^3)"}, inplace=True)
df["Time"] = df["Time"].str.split(".").str[0].astype(int)
df["CO(mg/m^3)"] = df["CO(mg/m^3)"].str.replace(",", ".").astype(float)
df["Temperature(°C)"] = df["Temperature(°C)"].str.replace(",", ".").astype(float)
df["Relative Humidity(%)"] = df["Relative Humidity(%)"].str.replace(",", ".").astype(float)
df["Absolute Humidity(g/m^3)"] = df["Absolute Humidity(g/m^3)"].str.replace(",", ".").astype(float)
# replace all -200 with NaN on the CO(GT) column
df["CO(mg/m^3)"] = df["CO(mg/m^3)"] .replace(-200, np.nan)
df["Temperature(°C)"] = df["Temperature(°C)"] .replace(-200, np.nan)
df["Relative Humidity(%)"] = df["Relative Humidity(%)"] .replace(-200, np.nan)
df["Absolute Humidity(g/m^3)"] = df["Absolute Humidity(g/m^3)"] .replace(-200, np.nan)
# fill NaN with the average of previous and next value
df["CO(mg/m^3)"] = df["CO(mg/m^3)"].fillna((df["CO(mg/m^3)"].fillna(method='ffill') + df["CO(mg/m^3)"].fillna(method='bfill'))/2)
df["Temperature(°C)"] = df["Temperature(°C)"].fillna((df["Temperature(°C)"].fillna(method='ffill') + df["Temperature(°C)"].fillna(method='bfill'))/2)
df["Relative Humidity(%)"] = df["Relative Humidity(%)"].fillna((df["Relative Humidity(%)"].fillna(method='ffill') + df["Relative Humidity(%)"].fillna(method='bfill'))/2)
df["Absolute Humidity(g/m^3)"] = df["Absolute Humidity(g/m^3)"].fillna((df["Absolute Humidity(g/m^3)"].fillna(method='ffill') + df["Absolute Humidity(g/m^3)"].fillna(method='bfill'))/2)
# add delta column
df["Delta CO(mg/m^3)"] = df["CO(mg/m^3)"].diff()
# drop first row
df = df.dropna()
time = df["Time"].values
df


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



Unnamed: 0,Time,CO(mg/m^3),Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3),Delta CO(mg/m^3)
1,19,2.0,13.3,47.7,0.7255,-0.6
2,20,2.2,11.9,54.0,0.7502,0.2
3,21,2.2,11.0,60.0,0.7867,0.0
4,22,1.6,11.2,59.6,0.7888,-0.6
5,23,1.2,11.2,59.2,0.7848,-0.4
...,...,...,...,...,...,...
9352,10,3.1,21.9,29.3,0.7568,-0.8
9353,11,2.4,24.3,23.7,0.7119,-0.7
9354,12,2.4,26.9,18.3,0.6406,0.0
9355,13,2.1,28.3,13.5,0.5139,-0.3


In [314]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9356 entries, 1 to 9356
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Time                      9356 non-null   int64  
 1   CO(mg/m^3)                9356 non-null   float64
 2   Temperature(°C)           9356 non-null   float64
 3   Relative Humidity(%)      9356 non-null   float64
 4   Absolute Humidity(g/m^3)  9356 non-null   float64
 5   Delta CO(mg/m^3)          9356 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 511.7 KB


In [315]:
# count of NaN values
df.isna().sum()

Time                        0
CO(mg/m^3)                  0
Temperature(°C)             0
Relative Humidity(%)        0
Absolute Humidity(g/m^3)    0
Delta CO(mg/m^3)            0
dtype: int64

In [316]:
# count nan of the column CO(mg/m^3)
df["CO(mg/m^3)"].isna().sum()

0

In [317]:
df.describe()

Unnamed: 0,Time,CO(mg/m^3),Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3),Delta CO(mg/m^3)
count,9356.0,9356.0,9356.0,9356.0,9356.0,9356.0
mean,11.497862,2.130553,18.233903,49.191417,1.019649,-4.3e-05
std,6.923225,1.413028,8.773972,17.151238,0.402093,0.747343
min,0.0,0.1,-1.9,9.2,0.1847,-5.1
25%,5.0,1.1,11.6,35.7,0.7326,-0.3
50%,11.0,1.8,17.6,49.7,0.9875,0.0
75%,17.25,2.95,24.3,62.2,1.30685,0.2
max,23.0,11.9,44.6,88.7,2.231,4.6


In [318]:
# normalize all the columns to be between 0 and 23 using min-max normalization (sklearn MinMaxScaler)
scaler = MinMaxScaler()
df_normalized = df.copy()
df_normalized = scaler.fit_transform(df)
df_normalized = pd.DataFrame(df_normalized, columns=df.columns)
df_normalized["Time"] = df["Time"].astype(str)
df_normalized

Unnamed: 0,Time,CO(mg/m^3),Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3),Delta CO(mg/m^3)
0,,0.161017,0.326882,0.484277,0.264282,0.463918
1,19,0.177966,0.296774,0.563522,0.276352,0.546392
2,20,0.177966,0.277419,0.638994,0.294190,0.525773
3,21,0.127119,0.281720,0.633962,0.295216,0.463918
4,22,0.093220,0.281720,0.628931,0.293261,0.484536
...,...,...,...,...,...,...
9351,9,0.254237,0.511828,0.252830,0.279578,0.443299
9352,10,0.194915,0.563441,0.182390,0.257636,0.453608
9353,11,0.194915,0.619355,0.114465,0.222792,0.525773
9354,12,0.169492,0.649462,0.054088,0.160876,0.494845


In [319]:
# plot all the curves with plotly
fig = px.line(df_normalized.iloc[:24], x="Time", y=["CO(mg/m^3)", "Delta CO(mg/m^3)"], markers=True, title="Normalized Air Quality Data")
fig.show()

In [320]:
# get dummy variables for time
df_dummy = pd.get_dummies(df, columns=["Time"], dtype=int)

In [321]:
corr = df_dummy.drop(columns=['Temperature(°C)', 'Relative Humidity(%)','Absolute Humidity(g/m^3)', 'CO(mg/m^3)']).corr(method='pearson')
time_deltaCO = corr.iloc[1:, 0].sort_values(ascending=False)
time_deltaCO

Time_8     0.242601
Time_7     0.204567
Time_18    0.149792
Time_17    0.128226
Time_19    0.070105
Time_6     0.048816
Time_16    0.048016
Time_9     0.036114
Time_13    0.009351
Time_5    -0.013119
Time_14   -0.017520
Time_15   -0.017972
Time_12   -0.020669
Time_0    -0.021528
Time_23   -0.024676
Time_4    -0.025320
Time_3    -0.050545
Time_20   -0.062496
Time_11   -0.072192
Time_1    -0.076593
Time_2    -0.084787
Time_10   -0.093517
Time_22   -0.148869
Time_21   -0.207405
Name: Delta CO(mg/m^3), dtype: float64

In [322]:
dict_correlations = time_deltaCO.to_dict()
dict_correlations_adjusted = {}
for key in dict_correlations:
    # take only the number of the time
    dict_correlations_adjusted[int(key.split("_")[1])] = dict_correlations[key]
dict_correlations_adjusted

{8: 0.24260088825085593,
 7: 0.2045666610068122,
 18: 0.14979224655126702,
 17: 0.12822617166505484,
 19: 0.07010511257490729,
 6: 0.04881596374122353,
 16: 0.04801613566799656,
 9: 0.03611403460516024,
 13: 0.0093505332142435,
 5: -0.01311935832786309,
 14: -0.017520308422893562,
 15: -0.017971754598453696,
 12: -0.020668955645354318,
 0: -0.021527677615116397,
 23: -0.02467632483757722,
 4: -0.02532036631489877,
 3: -0.050545324176658406,
 20: -0.06249587158917993,
 11: -0.07219227383107672,
 1: -0.07659322392610687,
 2: -0.08478686272091955,
 10: -0.09351720274683396,
 22: -0.1488689897144115,
 21: -0.2074052039865224}

In [323]:
# count number of positive and negative correlations
positive_correlations = 0
negative_correlations = 0
for key in dict_correlations_adjusted:
    if dict_correlations_adjusted[key] > 0:
        positive_correlations += 1
    else:
        negative_correlations += 1
positive_correlations, negative_correlations

(9, 15)

In [324]:
# convert to integers (persons)
pos_count = positive_correlations
#neg_count = negative_correlations -1
for key in dict_correlations_adjusted:
    if dict_correlations_adjusted[key] > 0:
        dict_correlations_adjusted[key] = pos_count
        pos_count -= 1
#    else:
#        dict_correlations_adjusted[key] = (negative_correlations - neg_count) * -1
#        negative_correlations += 1
dict_correlations_adjusted

{8: 9,
 7: 8,
 18: 7,
 17: 6,
 19: 5,
 6: 4,
 16: 3,
 9: 2,
 13: 1,
 5: -0.01311935832786309,
 14: -0.017520308422893562,
 15: -0.017971754598453696,
 12: -0.020668955645354318,
 0: -0.021527677615116397,
 23: -0.02467632483757722,
 4: -0.02532036631489877,
 3: -0.050545324176658406,
 20: -0.06249587158917993,
 11: -0.07219227383107672,
 1: -0.07659322392610687,
 2: -0.08478686272091955,
 10: -0.09351720274683396,
 22: -0.1488689897144115,
 21: -0.2074052039865224}

In [325]:
fig = px.line(time_deltaCO.values, markers=True, title="Correlation between Time and Delta CO(mg/m^3)")
fig.show()

In [326]:
# plot the distribution of the CO(mg/m^3) column and the Delta CO(mg/m^3) column
fig = px.histogram(df, x="CO(mg/m^3)", title="CO(mg/m^3) distribution")
fig.show()

In [327]:
fig = px.histogram(df, x="Delta CO(mg/m^3)", title="Delta CO(mg/m^3) distribution")
fig.show()

In [328]:
# map dict_correlations_adjusted to df column "Time"
df["Time"] = df["Time"].map(dict_correlations_adjusted)
df

Unnamed: 0,Time,CO(mg/m^3),Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3),Delta CO(mg/m^3)
1,5.000000,2.0,13.3,47.7,0.7255,-0.6
2,-0.062496,2.2,11.9,54.0,0.7502,0.2
3,-0.207405,2.2,11.0,60.0,0.7867,0.0
4,-0.148869,1.6,11.2,59.6,0.7888,-0.6
5,-0.024676,1.2,11.2,59.2,0.7848,-0.4
...,...,...,...,...,...,...
9352,-0.093517,3.1,21.9,29.3,0.7568,-0.8
9353,-0.072192,2.4,24.3,23.7,0.7119,-0.7
9354,-0.020669,2.4,26.9,18.3,0.6406,0.0
9355,1.000000,2.1,28.3,13.5,0.5139,-0.3


In [329]:
# create two new columns, one with the only positive values of Time and the other with the only negative values of Time, and drop the original Time column
df["People"] = df["Time"].apply(lambda x: x if x > 0 else 0)
df["Suction Pump"] = df["Time"].apply(lambda x: x if x < 0 else 0) * -1
df = df.drop(columns=["Time"])
df

Unnamed: 0,CO(mg/m^3),Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3),Delta CO(mg/m^3),People,Suction Pump
1,2.0,13.3,47.7,0.7255,-0.6,5.0,-0.000000
2,2.2,11.9,54.0,0.7502,0.2,0.0,0.062496
3,2.2,11.0,60.0,0.7867,0.0,0.0,0.207405
4,1.6,11.2,59.6,0.7888,-0.6,0.0,0.148869
5,1.2,11.2,59.2,0.7848,-0.4,0.0,0.024676
...,...,...,...,...,...,...,...
9352,3.1,21.9,29.3,0.7568,-0.8,0.0,0.093517
9353,2.4,24.3,23.7,0.7119,-0.7,0.0,0.072192
9354,2.4,26.9,18.3,0.6406,0.0,0.0,0.020669
9355,2.1,28.3,13.5,0.5139,-0.3,1.0,-0.000000


In [330]:
# add moving average column of Temperature(°C) and Relative Humidity(%) and Absolute Humidity(g/m^3)
df["Temperature(°C)"] = df["Temperature(°C)"].rolling(window=24).mean()
df["Relative Humidity(%)"] = df["Relative Humidity(%)"].rolling(window=24).mean()
df["Absolute Humidity(g/m^3)"] = df["Absolute Humidity(g/m^3)"].rolling(window=24).mean()
df = df.dropna()
df

Unnamed: 0,CO(mg/m^3),Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3),Delta CO(mg/m^3),People,Suction Pump
24,4.8,10.412500,62.208333,0.779629,1.9,7.0,-0.000000
25,6.9,10.262500,63.108333,0.784062,2.1,5.0,-0.000000
26,6.1,10.166667,63.683333,0.786692,-0.8,0.0,0.062496
27,3.9,10.087500,63.850000,0.784825,-2.2,0.0,0.207405
28,1.5,9.962500,64.008333,0.780729,-2.4,0.0,0.148869
...,...,...,...,...,...,...,...
9352,3.1,18.700000,37.941667,0.694262,-0.8,0.0,0.093517
9353,2.4,18.712500,38.125000,0.700287,-0.7,0.0,0.072192
9354,2.4,18.729167,38.204167,0.703671,0.0,0.0,0.020669
9355,2.1,18.712500,38.195833,0.702992,-0.3,1.0,-0.000000


In [331]:
df = df[['CO(mg/m^3)', 'Delta CO(mg/m^3)', 'People', 'Suction Pump',
            'Temperature(°C)', 'Relative Humidity(%)', 'Absolute Humidity(g/m^3)']]
df

Unnamed: 0,CO(mg/m^3),Delta CO(mg/m^3),People,Suction Pump,Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3)
24,4.8,1.9,7.0,-0.000000,10.412500,62.208333,0.779629
25,6.9,2.1,5.0,-0.000000,10.262500,63.108333,0.784062
26,6.1,-0.8,0.0,0.062496,10.166667,63.683333,0.786692
27,3.9,-2.2,0.0,0.207405,10.087500,63.850000,0.784825
28,1.5,-2.4,0.0,0.148869,9.962500,64.008333,0.780729
...,...,...,...,...,...,...,...
9352,3.1,-0.8,0.0,0.093517,18.700000,37.941667,0.694262
9353,2.4,-0.7,0.0,0.072192,18.712500,38.125000,0.700287
9354,2.4,0.0,0.0,0.020669,18.729167,38.204167,0.703671
9355,2.1,-0.3,1.0,-0.000000,18.712500,38.195833,0.702992


In [332]:
corr = df.corr(method='pearson')
corr

Unnamed: 0,CO(mg/m^3),Delta CO(mg/m^3),People,Suction Pump,Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3)
CO(mg/m^3),1.0,0.264423,0.196365,-0.023321,-0.089741,0.196593,0.008597
Delta CO(mg/m^3),0.264423,1.0,0.442992,-0.390275,-0.002147,-0.007466,-0.005922
People,0.196365,0.442992,1.0,-0.486616,-6e-06,9.1e-05,7.5e-05
Suction Pump,-0.023321,-0.390275,-0.486616,1.0,-0.000201,0.000196,-0.000133
Temperature(°C),-0.089741,-0.002147,-6e-06,-0.000201,1.0,-0.468969,0.792436
Relative Humidity(%),0.196593,-0.007466,9.1e-05,0.000196,-0.468969,1.0,0.111199
Absolute Humidity(g/m^3),0.008597,-0.005922,7.5e-05,-0.000133,0.792436,0.111199,1.0


In [333]:
# normalize all the columns to be between 0 and 23 using min-max normalization (sklearn MinMaxScaler)
scaler = MinMaxScaler()
df_normalized_2 = df.copy()
df_normalized_2 = scaler.fit_transform(df)
df_normalized_2 = pd.DataFrame(df_normalized_2, columns=df.columns)
df_normalized_2["Delta CO(mg/m^3)"] = df_normalized_2["CO(mg/m^3)"].diff()
df_normalized_2

Unnamed: 0,CO(mg/m^3),Delta CO(mg/m^3),People,Suction Pump,Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3)
0,0.398305,,0.777778,0.000000,0.291472,0.696824,0.306522
1,0.576271,0.177966,0.555556,0.000000,0.286807,0.711236,0.308999
2,0.508475,-0.067797,0.000000,0.301323,0.283826,0.720443,0.310468
3,0.322034,-0.186441,0.000000,1.000000,0.281363,0.723112,0.309425
4,0.118644,-0.203390,0.000000,0.717769,0.277475,0.725647,0.307136
...,...,...,...,...,...,...,...
9328,0.254237,-0.067797,0.000000,0.450891,0.549248,0.308247,0.258817
9329,0.194915,-0.059322,0.000000,0.348074,0.549637,0.311182,0.262184
9330,0.194915,0.000000,0.000000,0.099655,0.550156,0.312450,0.264075
9331,0.169492,-0.025424,0.111111,0.000000,0.549637,0.312317,0.263695


In [334]:
# plot all the curves with plotly
x= 0
fig = px.line(df_normalized_2.iloc[x:x+48], #x="Time", y=["CO(mg/m^3)", "Delta CO(mg/m^3)", ' People', 'Suction Pump'],
              markers=True, title="Normalized 2 Air Quality Data")
fig.show()

In [336]:
# restart the index from 0
df = df.reset_index(drop=True)
df

Unnamed: 0,CO(mg/m^3),Delta CO(mg/m^3),People,Suction Pump,Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3)
0,4.8,1.9,7.0,-0.000000,10.412500,62.208333,0.779629
1,6.9,2.1,5.0,-0.000000,10.262500,63.108333,0.784062
2,6.1,-0.8,0.0,0.062496,10.166667,63.683333,0.786692
3,3.9,-2.2,0.0,0.207405,10.087500,63.850000,0.784825
4,1.5,-2.4,0.0,0.148869,9.962500,64.008333,0.780729
...,...,...,...,...,...,...,...
9328,3.1,-0.8,0.0,0.093517,18.700000,37.941667,0.694262
9329,2.4,-0.7,0.0,0.072192,18.712500,38.125000,0.700287
9330,2.4,0.0,0.0,0.020669,18.729167,38.204167,0.703671
9331,2.1,-0.3,1.0,-0.000000,18.712500,38.195833,0.702992


In [338]:
# normalize with min-max the Suction Pump column beetwen 0 and 1
df["Suction Pump"] = (df["Suction Pump"] - df["Suction Pump"].min() ) / (df["Suction Pump"].max() - df["Suction Pump"].min())
df

Unnamed: 0,CO(mg/m^3),Delta CO(mg/m^3),People,Suction Pump,Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3)
0,4.8,1.9,7.0,0.000000,10.412500,62.208333,0.779629
1,6.9,2.1,5.0,0.000000,10.262500,63.108333,0.784062
2,6.1,-0.8,0.0,0.301323,10.166667,63.683333,0.786692
3,3.9,-2.2,0.0,1.000000,10.087500,63.850000,0.784825
4,1.5,-2.4,0.0,0.717769,9.962500,64.008333,0.780729
...,...,...,...,...,...,...,...
9328,3.1,-0.8,0.0,0.450891,18.700000,37.941667,0.694262
9329,2.4,-0.7,0.0,0.348074,18.712500,38.125000,0.700287
9330,2.4,0.0,0.0,0.099655,18.729167,38.204167,0.703671
9331,2.1,-0.3,1.0,0.000000,18.712500,38.195833,0.702992


In [339]:
df.describe()

Unnamed: 0,CO(mg/m^3),Delta CO(mg/m^3),People,Suction Pump,Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3)
count,9333.0,9333.0,9333.0,9333.0,9333.0,9333.0,9333.0
mean,2.131823,-7.5e-05,1.874638,0.188332,18.242706,49.192027,1.020341
std,1.414137,0.747945,2.891474,0.250947,7.886612,12.928774,0.389219
min,0.1,-5.1,0.0,0.0,1.041667,18.691667,0.231113
25%,1.1,-0.3,0.0,0.0,11.80625,38.983333,0.735113
50%,1.8,0.0,0.0,0.099655,17.825,47.9875,0.998054
75%,2.95,0.2,3.0,0.348074,24.829167,59.070833,1.29635
max,11.9,4.6,9.0,1.0,33.191667,81.141667,2.0206


In [340]:
df.to_csv("data.csv")