In [253]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler

In [254]:
df = pd.read_csv('AirQuality.csv', sep=';')
#drop rows with all NaN
df = df.dropna(how='all')
df = df[["Time", "CO(GT)", "T", "RH", "AH"]]
df.rename(columns={"CO(GT)": "CO(mg/m^3)", "T": "Temperature(°C)", "RH": "Relative Humidity(%)", "AH": "Absolute Humidity(g/m^3)"}, inplace=True)
df["Time"] = df["Time"].str.split(".").str[0].astype(int)
df["CO(mg/m^3)"] = df["CO(mg/m^3)"].str.replace(",", ".").astype(float)
df["Temperature(°C)"] = df["Temperature(°C)"].str.replace(",", ".").astype(float)
df["Relative Humidity(%)"] = df["Relative Humidity(%)"].str.replace(",", ".").astype(float)
df["Absolute Humidity(g/m^3)"] = df["Absolute Humidity(g/m^3)"].str.replace(",", ".").astype(float)
# replace all -200 with NaN on the CO(GT) column
df["CO(mg/m^3)"] = df["CO(mg/m^3)"] .replace(-200, np.nan)
df["Temperature(°C)"] = df["Temperature(°C)"] .replace(-200, np.nan)
df["Relative Humidity(%)"] = df["Relative Humidity(%)"] .replace(-200, np.nan)
df["Absolute Humidity(g/m^3)"] = df["Absolute Humidity(g/m^3)"] .replace(-200, np.nan)
# fill NaN with the average of previous and next value
df["CO(mg/m^3)"] = df["CO(mg/m^3)"].fillna((df["CO(mg/m^3)"].fillna(method='ffill') + df["CO(mg/m^3)"].fillna(method='bfill'))/2)
df["Temperature(°C)"] = df["Temperature(°C)"].fillna((df["Temperature(°C)"].fillna(method='ffill') + df["Temperature(°C)"].fillna(method='bfill'))/2)
df["Relative Humidity(%)"] = df["Relative Humidity(%)"].fillna((df["Relative Humidity(%)"].fillna(method='ffill') + df["Relative Humidity(%)"].fillna(method='bfill'))/2)
df["Absolute Humidity(g/m^3)"] = df["Absolute Humidity(g/m^3)"].fillna((df["Absolute Humidity(g/m^3)"].fillna(method='ffill') + df["Absolute Humidity(g/m^3)"].fillna(method='bfill'))/2)
# add delta column
df["Delta CO(mg/m^3)"] = df["CO(mg/m^3)"].diff()
# drop first row
df = df.dropna()
time = df["Time"].values
df


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.


Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.



Unnamed: 0,Time,CO(mg/m^3),Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3),Delta CO(mg/m^3)
1,19,2.0,13.3,47.7,0.7255,-0.6
2,20,2.2,11.9,54.0,0.7502,0.2
3,21,2.2,11.0,60.0,0.7867,0.0
4,22,1.6,11.2,59.6,0.7888,-0.6
5,23,1.2,11.2,59.2,0.7848,-0.4
...,...,...,...,...,...,...
9352,10,3.1,21.9,29.3,0.7568,-0.8
9353,11,2.4,24.3,23.7,0.7119,-0.7
9354,12,2.4,26.9,18.3,0.6406,0.0
9355,13,2.1,28.3,13.5,0.5139,-0.3


In [255]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9356 entries, 1 to 9356
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Time                      9356 non-null   int64  
 1   CO(mg/m^3)                9356 non-null   float64
 2   Temperature(°C)           9356 non-null   float64
 3   Relative Humidity(%)      9356 non-null   float64
 4   Absolute Humidity(g/m^3)  9356 non-null   float64
 5   Delta CO(mg/m^3)          9356 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 511.7 KB


In [256]:
# count of NaN values
df.isna().sum()

Time                        0
CO(mg/m^3)                  0
Temperature(°C)             0
Relative Humidity(%)        0
Absolute Humidity(g/m^3)    0
Delta CO(mg/m^3)            0
dtype: int64

In [257]:
# count nan of the column CO(mg/m^3)
df["CO(mg/m^3)"].isna().sum()

0

In [258]:
df.describe()

Unnamed: 0,Time,CO(mg/m^3),Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3),Delta CO(mg/m^3)
count,9356.0,9356.0,9356.0,9356.0,9356.0,9356.0
mean,11.497862,2.130553,18.233903,49.191417,1.019649,-4.3e-05
std,6.923225,1.413028,8.773972,17.151238,0.402093,0.747343
min,0.0,0.1,-1.9,9.2,0.1847,-5.1
25%,5.0,1.1,11.6,35.7,0.7326,-0.3
50%,11.0,1.8,17.6,49.7,0.9875,0.0
75%,17.25,2.95,24.3,62.2,1.30685,0.2
max,23.0,11.9,44.6,88.7,2.231,4.6


In [259]:
# normalize all the columns to be between 0 and 23 using min-max normalization (sklearn MinMaxScaler)
scaler = MinMaxScaler()
df_normalized = df.copy()
df_normalized = scaler.fit_transform(df)
df_normalized = pd.DataFrame(df_normalized, columns=df.columns)
df_normalized["Time"] = df["Time"].astype(str)
df_normalized

Unnamed: 0,Time,CO(mg/m^3),Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3),Delta CO(mg/m^3)
0,,0.161017,0.326882,0.484277,0.264282,0.463918
1,19,0.177966,0.296774,0.563522,0.276352,0.546392
2,20,0.177966,0.277419,0.638994,0.294190,0.525773
3,21,0.127119,0.281720,0.633962,0.295216,0.463918
4,22,0.093220,0.281720,0.628931,0.293261,0.484536
...,...,...,...,...,...,...
9351,9,0.254237,0.511828,0.252830,0.279578,0.443299
9352,10,0.194915,0.563441,0.182390,0.257636,0.453608
9353,11,0.194915,0.619355,0.114465,0.222792,0.525773
9354,12,0.169492,0.649462,0.054088,0.160876,0.494845


In [260]:
# plot all the curves with plotly
fig = px.line(df_normalized.iloc[:24], x="Time", y=["CO(mg/m^3)", "Delta CO(mg/m^3)"], markers=True, title="Normalized Air Quality Data")
fig.show()

In [261]:
# get dummy variables for time
df_dummy = pd.get_dummies(df, columns=["Time"], dtype=int)

In [262]:
corr = df_dummy.drop(columns=['Temperature(°C)', 'Relative Humidity(%)','Absolute Humidity(g/m^3)', 'Delta CO(mg/m^3)']).corr(method='pearson')
time_deltaCO = corr.iloc[1:, 0].sort_values(ascending=False)
time_deltaCO

Time_19    0.203256
Time_20    0.170196
Time_18    0.166210
Time_9     0.106252
Time_8     0.087158
Time_17    0.086992
Time_21    0.060494
Time_10    0.056785
Time_16    0.019180
Time_11    0.018596
Time_13    0.012597
Time_12    0.007658
Time_14    0.003325
Time_15   -0.006209
Time_22   -0.018249
Time_23   -0.031306
Time_7    -0.041146
Time_0    -0.042698
Time_1    -0.083214
Time_2    -0.128064
Time_6    -0.149334
Time_3    -0.154803
Time_4    -0.168202
Time_5    -0.175147
Name: CO(mg/m^3), dtype: float64

In [263]:
dict_correlations = time_deltaCO.to_dict()
dict_correlations_adjusted = {}
for key in dict_correlations:
    # take only the number of the time
    dict_correlations_adjusted[int(key.split("_")[1])] = dict_correlations[key]
dict_correlations_adjusted

{19: 0.20325577042136075,
 20: 0.17019571220152935,
 18: 0.16621013918990749,
 9: 0.10625184458572441,
 8: 0.08715762664421803,
 17: 0.08699208662411521,
 21: 0.06049384757624781,
 10: 0.05678476262428047,
 16: 0.019180221009914657,
 11: 0.01859632674126859,
 13: 0.012597449548545716,
 12: 0.007658310913527812,
 14: 0.0033247371686270875,
 15: -0.006208965892300527,
 22: -0.018248512041795853,
 23: -0.03130600498494658,
 7: -0.04114643444935001,
 0: -0.042698194480275196,
 1: -0.08321427040967465,
 2: -0.12806392008397485,
 6: -0.1493343868493396,
 3: -0.15480339476321017,
 4: -0.16820151795705165,
 5: -0.17514659029058288}

In [264]:
# count number of positive and negative correlations
positive_correlations = 0
negative_correlations = 0
for key in dict_correlations_adjusted:
    if dict_correlations_adjusted[key] > 0:
        positive_correlations += 1
    else:
        negative_correlations += 1
positive_correlations, negative_correlations

(13, 11)

In [265]:
# convert to integers (persons)
pos_count = positive_correlations
#neg_count = negative_correlations -1
for key in dict_correlations_adjusted:
    if dict_correlations_adjusted[key] > 0:
        dict_correlations_adjusted[key] = pos_count
        pos_count -= 1
#    else:
#        dict_correlations_adjusted[key] = (negative_correlations - neg_count) * -1
#        negative_correlations += 1
dict_correlations_adjusted

{19: 13,
 20: 12,
 18: 11,
 9: 10,
 8: 9,
 17: 8,
 21: 7,
 10: 6,
 16: 5,
 11: 4,
 13: 3,
 12: 2,
 14: 1,
 15: -0.006208965892300527,
 22: -0.018248512041795853,
 23: -0.03130600498494658,
 7: -0.04114643444935001,
 0: -0.042698194480275196,
 1: -0.08321427040967465,
 2: -0.12806392008397485,
 6: -0.1493343868493396,
 3: -0.15480339476321017,
 4: -0.16820151795705165,
 5: -0.17514659029058288}

In [266]:
fig = px.line(time_deltaCO.values, markers=True, title="Correlation between Time and Delta CO(mg/m^3)")
fig.show()

In [267]:
# plot the distribution of the CO(mg/m^3) column and the Delta CO(mg/m^3) column
fig = px.histogram(df, x="CO(mg/m^3)", title="CO(mg/m^3) distribution")
fig.show()

In [268]:
fig = px.histogram(df, x="Delta CO(mg/m^3)", title="Delta CO(mg/m^3) distribution")
fig.show()

In [269]:
# map dict_correlations_adjusted to df column "Time"
df["Time"] = df["Time"].map(dict_correlations_adjusted)
df

Unnamed: 0,Time,CO(mg/m^3),Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3),Delta CO(mg/m^3)
1,13.000000,2.0,13.3,47.7,0.7255,-0.6
2,12.000000,2.2,11.9,54.0,0.7502,0.2
3,7.000000,2.2,11.0,60.0,0.7867,0.0
4,-0.018249,1.6,11.2,59.6,0.7888,-0.6
5,-0.031306,1.2,11.2,59.2,0.7848,-0.4
...,...,...,...,...,...,...
9352,6.000000,3.1,21.9,29.3,0.7568,-0.8
9353,4.000000,2.4,24.3,23.7,0.7119,-0.7
9354,2.000000,2.4,26.9,18.3,0.6406,0.0
9355,3.000000,2.1,28.3,13.5,0.5139,-0.3


In [270]:
# create two new columns, one with the only positive values of Time and the other with the only negative values of Time, and drop the original Time column
df["People"] = df["Time"].apply(lambda x: x if x > 0 else 0)
df["Suction Pump"] = df["Time"].apply(lambda x: x if x < 0 else 0) * -1
df = df.drop(columns=["Time"])
df

Unnamed: 0,CO(mg/m^3),Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3),Delta CO(mg/m^3),People,Suction Pump
1,2.0,13.3,47.7,0.7255,-0.6,13.0,-0.000000
2,2.2,11.9,54.0,0.7502,0.2,12.0,-0.000000
3,2.2,11.0,60.0,0.7867,0.0,7.0,-0.000000
4,1.6,11.2,59.6,0.7888,-0.6,0.0,0.018249
5,1.2,11.2,59.2,0.7848,-0.4,0.0,0.031306
...,...,...,...,...,...,...,...
9352,3.1,21.9,29.3,0.7568,-0.8,6.0,-0.000000
9353,2.4,24.3,23.7,0.7119,-0.7,4.0,-0.000000
9354,2.4,26.9,18.3,0.6406,0.0,2.0,-0.000000
9355,2.1,28.3,13.5,0.5139,-0.3,3.0,-0.000000


In [271]:
# add moving average column of Temperature(°C) and Relative Humidity(%) and Absolute Humidity(g/m^3)
df["Temperature(°C)"] = df["Temperature(°C)"].rolling(window=24).mean()
df["Relative Humidity(%)"] = df["Relative Humidity(%)"].rolling(window=24).mean()
df["Absolute Humidity(g/m^3)"] = df["Absolute Humidity(g/m^3)"].rolling(window=24).mean()
df = df.dropna()
df

Unnamed: 0,CO(mg/m^3),Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3),Delta CO(mg/m^3),People,Suction Pump
24,4.8,10.412500,62.208333,0.779629,1.9,11.0,-0.000000
25,6.9,10.262500,63.108333,0.784062,2.1,13.0,-0.000000
26,6.1,10.166667,63.683333,0.786692,-0.8,12.0,-0.000000
27,3.9,10.087500,63.850000,0.784825,-2.2,7.0,-0.000000
28,1.5,9.962500,64.008333,0.780729,-2.4,0.0,0.018249
...,...,...,...,...,...,...,...
9352,3.1,18.700000,37.941667,0.694262,-0.8,6.0,-0.000000
9353,2.4,18.712500,38.125000,0.700287,-0.7,4.0,-0.000000
9354,2.4,18.729167,38.204167,0.703671,0.0,2.0,-0.000000
9355,2.1,18.712500,38.195833,0.702992,-0.3,3.0,-0.000000


In [272]:
df = df[['CO(mg/m^3)', 'Delta CO(mg/m^3)', 'People', 'Suction Pump',
            'Temperature(°C)', 'Relative Humidity(%)', 'Absolute Humidity(g/m^3)']]
df

Unnamed: 0,CO(mg/m^3),Delta CO(mg/m^3),People,Suction Pump,Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3)
24,4.8,1.9,11.0,-0.000000,10.412500,62.208333,0.779629
25,6.9,2.1,13.0,-0.000000,10.262500,63.108333,0.784062
26,6.1,-0.8,12.0,-0.000000,10.166667,63.683333,0.786692
27,3.9,-2.2,7.0,-0.000000,10.087500,63.850000,0.784825
28,1.5,-2.4,0.0,0.018249,9.962500,64.008333,0.780729
...,...,...,...,...,...,...,...
9352,3.1,-0.8,6.0,-0.000000,18.700000,37.941667,0.694262
9353,2.4,-0.7,4.0,-0.000000,18.712500,38.125000,0.700287
9354,2.4,0.0,2.0,-0.000000,18.729167,38.204167,0.703671
9355,2.1,-0.3,3.0,-0.000000,18.712500,38.195833,0.702992


In [273]:
corr = df.corr(method='pearson')
corr

Unnamed: 0,CO(mg/m^3),Delta CO(mg/m^3),People,Suction Pump,Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3)
CO(mg/m^3),1.0,0.264423,0.4516818,-0.430632,-0.089741,0.196593,0.00859655
Delta CO(mg/m^3),0.264423,1.0,0.1476703,-0.065887,-0.002147,-0.007466,-0.005921642
People,0.451682,0.14767,1.0,-0.573652,-6.9e-05,0.00012,-7.817127e-08
Suction Pump,-0.430632,-0.065887,-0.5736525,1.0,-0.000201,0.000215,-0.0001061231
Temperature(°C),-0.089741,-0.002147,-6.87579e-05,-0.000201,1.0,-0.468969,0.7924361
Relative Humidity(%),0.196593,-0.007466,0.000120397,0.000215,-0.468969,1.0,0.1111985
Absolute Humidity(g/m^3),0.008597,-0.005922,-7.817127e-08,-0.000106,0.792436,0.111199,1.0


In [274]:
# normalize all the columns to be between 0 and 23 using min-max normalization (sklearn MinMaxScaler)
scaler = MinMaxScaler()
df_normalized_2 = df.copy()
df_normalized_2 = scaler.fit_transform(df)
df_normalized_2 = pd.DataFrame(df_normalized_2, columns=df.columns)
df_normalized_2["Delta CO(mg/m^3)"] = df_normalized_2["CO(mg/m^3)"].diff()
df_normalized_2

Unnamed: 0,CO(mg/m^3),Delta CO(mg/m^3),People,Suction Pump,Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3)
0,0.398305,,0.846154,0.00000,0.291472,0.696824,0.306522
1,0.576271,0.177966,1.000000,0.00000,0.286807,0.711236,0.308999
2,0.508475,-0.067797,0.923077,0.00000,0.283826,0.720443,0.310468
3,0.322034,-0.186441,0.538462,0.00000,0.281363,0.723112,0.309425
4,0.118644,-0.203390,0.000000,0.10419,0.277475,0.725647,0.307136
...,...,...,...,...,...,...,...
9328,0.254237,-0.067797,0.461538,0.00000,0.549248,0.308247,0.258817
9329,0.194915,-0.059322,0.307692,0.00000,0.549637,0.311182,0.262184
9330,0.194915,0.000000,0.153846,0.00000,0.550156,0.312450,0.264075
9331,0.169492,-0.025424,0.230769,0.00000,0.549637,0.312317,0.263695


In [285]:
# plot all the curves with plotly
x= 0
fig = px.line(df.iloc[:], #x="Time", y=["CO(mg/m^3)", "Delta CO(mg/m^3)", ' People', 'Suction Pump'],
              markers=True, title="Normalized 2 Air Quality Data")
fig.show()

In [276]:
# restart the index from 0
df = df.reset_index(drop=True)
df

Unnamed: 0,CO(mg/m^3),Delta CO(mg/m^3),People,Suction Pump,Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3)
0,4.8,1.9,11.0,-0.000000,10.412500,62.208333,0.779629
1,6.9,2.1,13.0,-0.000000,10.262500,63.108333,0.784062
2,6.1,-0.8,12.0,-0.000000,10.166667,63.683333,0.786692
3,3.9,-2.2,7.0,-0.000000,10.087500,63.850000,0.784825
4,1.5,-2.4,0.0,0.018249,9.962500,64.008333,0.780729
...,...,...,...,...,...,...,...
9328,3.1,-0.8,6.0,-0.000000,18.700000,37.941667,0.694262
9329,2.4,-0.7,4.0,-0.000000,18.712500,38.125000,0.700287
9330,2.4,0.0,2.0,-0.000000,18.729167,38.204167,0.703671
9331,2.1,-0.3,3.0,-0.000000,18.712500,38.195833,0.702992


In [277]:
# normalize with min-max the Suction Pump column beetwen 0 and 1
df["Suction Pump"] = (df["Suction Pump"] - df["Suction Pump"].min() ) / (df["Suction Pump"].max() - df["Suction Pump"].min())
df

Unnamed: 0,CO(mg/m^3),Delta CO(mg/m^3),People,Suction Pump,Temperature(°C),Relative Humidity(%),Absolute Humidity(g/m^3)
0,4.8,1.9,11.0,0.00000,10.412500,62.208333,0.779629
1,6.9,2.1,13.0,0.00000,10.262500,63.108333,0.784062
2,6.1,-0.8,12.0,0.00000,10.166667,63.683333,0.786692
3,3.9,-2.2,7.0,0.00000,10.087500,63.850000,0.784825
4,1.5,-2.4,0.0,0.10419,9.962500,64.008333,0.780729
...,...,...,...,...,...,...,...
9328,3.1,-0.8,6.0,0.00000,18.700000,37.941667,0.694262
9329,2.4,-0.7,4.0,0.00000,18.712500,38.125000,0.700287
9330,2.4,0.0,2.0,0.00000,18.729167,38.204167,0.703671
9331,2.1,-0.3,3.0,0.00000,18.712500,38.195833,0.702992


In [278]:
df = df.drop(columns=["Delta CO(mg/m^3)",'Temperature(°C)', 'Relative Humidity(%)', 'Absolute Humidity(g/m^3)'])
df

Unnamed: 0,CO(mg/m^3),People,Suction Pump
0,4.8,11.0,0.00000
1,6.9,13.0,0.00000
2,6.1,12.0,0.00000
3,3.9,7.0,0.00000
4,1.5,0.0,0.10419
...,...,...,...
9328,3.1,6.0,0.00000
9329,2.4,4.0,0.00000
9330,2.4,2.0,0.00000
9331,2.1,3.0,0.00000


In [279]:
# add a row with People and Suction Pump = 0 and CO(mg/m^3) = 0.1 + random noise
df_neutral = pd.DataFrame({'CO(mg/m^3)': [0.05 for i in range(1000)], 'People': [0.0 for i in range(1000)], 'Suction Pump': [0.0 for i in range(1000)]})
# add random noise to the CO(mg/m^3) column
df_neutral["CO(mg/m^3)"] = df_neutral["CO(mg/m^3)"] + np.random.normal(0, 0.05, 1000)
# negative values to 0
df_neutral["CO(mg/m^3)"] = df_neutral["CO(mg/m^3)"].apply(lambda x: x if x > 0 else 0)
df_neutral

Unnamed: 0,CO(mg/m^3),People,Suction Pump
0,0.075034,0.0,0.0
1,0.003099,0.0,0.0
2,0.022322,0.0,0.0
3,0.066063,0.0,0.0
4,0.074102,0.0,0.0
...,...,...,...
995,0.028895,0.0,0.0
996,0.011514,0.0,0.0
997,0.081240,0.0,0.0
998,0.048168,0.0,0.0


In [280]:
# replace Success Pump values with random numbers between 0 and 1
df["Suction Pump"] = df["Suction Pump"].apply(lambda x: np.random.uniform(0, 1) if x == 0 else x)
# decrese the CO(mg/m^3) values a percentage determined by the Suction Pump values
df["CO(mg/m^3)"] = df["CO(mg/m^3)"] * (1 - df["Suction Pump"] * 0.1)

In [281]:
df = pd.concat([df_neutral, df], ignore_index=True)
df

Unnamed: 0,CO(mg/m^3),People,Suction Pump
0,0.075034,0.0,0.000000
1,0.003099,0.0,0.000000
2,0.022322,0.0,0.000000
3,0.066063,0.0,0.000000
4,0.074102,0.0,0.000000
...,...,...,...
10328,3.023261,6.0,0.247546
10329,2.323334,4.0,0.319441
10330,2.313576,2.0,0.360099
10331,1.992810,3.0,0.510426


In [282]:
df.describe()

Unnamed: 0,CO(mg/m^3),People,Suction Pump
count,10333.0,10333.0,10333.0
mean,1.840099,3.424562,0.458219
std,1.412174,4.370125,0.340231
min,0.0,0.0,0.0
25%,0.780497,0.0,0.158373
50%,1.576571,0.0,0.441858
75%,2.646801,7.0,0.792993
max,11.074714,13.0,1.0


In [283]:
df.to_csv("../data/data.csv")