In [20]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler

In [21]:
# Function to add random noise
def add_noise(x):
    noise = (np.random.rand(1)[0]*2 -1)* 0.05 *x  # Generating random noise (5% of each element)
    return x + noise

In [22]:
# create 10000 random numbers between 0 and 5.725 (mg/m^3) CO
np.random.seed(0)
size = 1000000
CO = np.random.rand(size) * 10.305
mq = np.random.rand(size) * 20 + 10 # 1m^2 per person = 3m^3 per person
df = pd.DataFrame(CO, columns=["CO(mg/m^3)_initial"])
df["Volume(m^3)"] = mq
df["N_people_MA_prev_10min"] = np.random.rand(size) * 10
df["N_people_MA_prev_10min"] = df["N_people_MA_prev_10min"].apply(lambda x: 0.0 if np.random.rand(1)[0] > 0.5 else x)
df["CO_produced(mg)"] = (df["N_people_MA_prev_10min"] * ( (np.random.rand(size) * 0.38 + 0.02)/60 ) * 1144)
df["CO_produced(mg/m^3)"] = df["CO_produced(mg)"]  / df["Volume(m^3)"]
df["Ambient-Air-Pump(L/min)"] = (np.random.rand(size) * 20 + 5) * 28.3168
df["Ambient-Air-Pump_power(%)"] = np.random.rand(size) * 100
df["Ambient-Air-Pump_number"] = np.random.randint(1, 9, size)
# half values of Ambient-Air-Pump_power(%) to zero
df["Ambient-Air-Pump_power(%)"] = df["Ambient-Air-Pump_power(%)"].apply(lambda x: 0.0 if np.random.rand(1)[0] > 0.5 else x)
# random value between 0.02 and 0.38 (m^3)CO person per hour
df["CO_removed(mg)"] =  (df["Ambient-Air-Pump(L/min)"] * (df["Ambient-Air-Pump_power(%)"] / 100) * 0.001) * (df["CO(mg/m^3)_initial"]+df["CO_produced(mg/m^3)"])
# add noise to CO_removed(mg)
df["CO_removed(mg)"] = df["CO_removed(mg)"].apply(lambda x: add_noise(x))
df["CO_removed(mg)"] = df["CO_removed(mg)"] *  df["Ambient-Air-Pump_number"]
df["CO(mg/m^3)_final"] = (df["CO(mg/m^3)_initial"] * df["Volume(m^3)"] + df["CO_produced(mg)"] - df["CO_removed(mg)"])/df["Volume(m^3)"]
# add noise to CO(mg/m^3)_final
df["CO(mg/m^3)_final"] = df["CO(mg/m^3)_final"].apply(lambda x: add_noise(x))
df["CO(mg/m^3)_final"] = df["CO(mg/m^3)_final"].apply(lambda x: 0 if x <0 else x)
df.head(20)

Unnamed: 0,CO(mg/m^3)_initial,Volume(m^3),N_people_MA_prev_10min,CO_produced(mg),CO_produced(mg/m^3),Ambient-Air-Pump(L/min),Ambient-Air-Pump_power(%),Ambient-Air-Pump_number,CO_removed(mg),CO(mg/m^3)_final
0,5.655523,23.791121,7.895086,12.436262,0.522727,676.278801,57.561797,8,18.406894,5.556082
1,7.370026,29.49034,0.0,0.0,0.0,364.445631,72.454714,5,9.452181,7.176442
2,6.211477,26.161183,9.27092,17.216334,0.658087,165.086766,46.293581,8,4.022264,6.754297
3,5.615021,13.616384,0.0,0.0,0.0,300.197701,0.0,7,0.0,5.546475
4,4.365763,27.108231,0.0,0.0,0.0,467.855438,0.0,3,0.0,4.397795
5,6.655939,15.593005,6.200802,28.44776,1.824392,289.336001,0.0,6,0.0,8.13049
6,4.509336,29.114635,0.0,0.0,0.0,352.923606,0.0,8,0.0,4.496858
7,9.189721,10.731916,8.543815,45.67896,4.256366,514.00508,73.638183,2,9.892345,12.528656
8,9.930545,22.186985,0.0,0.0,0.0,195.893533,73.595811,2,2.949218,9.308475
9,3.951365,21.950645,1.644188,9.916397,0.451759,317.488119,0.0,2,0.0,4.471221


In [23]:
df[["CO_produced(mg)","CO_removed(mg)"]].describe()

Unnamed: 0,CO_produced(mg),CO_removed(mg)
count,1000000.0,1000000.0
mean,10.001244,2.724004
std,15.473836,5.448318
min,0.0,0.0
25%,0.0,0.0
50%,0.011424,0.0
75%,15.390168,2.990771
max,76.220435,77.063526


In [24]:
max(df["CO_produced(mg)"] - df["CO_removed(mg)"])

76.05329409062759

In [25]:
min(df["CO_produced(mg)"] - df["CO_removed(mg)"])

-56.40117678944809

In [26]:
max(df["CO(mg/m^3)_final"] - df["CO(mg/m^3)_initial"])

7.7902553137535495

In [27]:
min(df["CO(mg/m^3)_final"] - df["CO(mg/m^3)_initial"])

-4.648211965656768

In [28]:
df[df["CO(mg/m^3)_final"] < df["CO(mg/m^3)_initial"]].count()

CO(mg/m^3)_initial           391117
Volume(m^3)                  391117
N_people_MA_prev_10min       391117
CO_produced(mg)              391117
CO_produced(mg/m^3)          391117
Ambient-Air-Pump(L/min)      391117
Ambient-Air-Pump_power(%)    391117
Ambient-Air-Pump_number      391117
CO_removed(mg)               391117
CO(mg/m^3)_final             391117
dtype: int64

In [29]:
df[df["CO(mg/m^3)_final"] < 0.0].count()

CO(mg/m^3)_initial           0
Volume(m^3)                  0
N_people_MA_prev_10min       0
CO_produced(mg)              0
CO_produced(mg/m^3)          0
Ambient-Air-Pump(L/min)      0
Ambient-Air-Pump_power(%)    0
Ambient-Air-Pump_number      0
CO_removed(mg)               0
CO(mg/m^3)_final             0
dtype: int64

In [30]:
# count values of CO(mg/m^3)_final > 10.305
df[df["CO(mg/m^3)_final"] > 10.305].count()

CO(mg/m^3)_initial           48236
Volume(m^3)                  48236
N_people_MA_prev_10min       48236
CO_produced(mg)              48236
CO_produced(mg/m^3)          48236
Ambient-Air-Pump(L/min)      48236
Ambient-Air-Pump_power(%)    48236
Ambient-Air-Pump_number      48236
CO_removed(mg)               48236
CO(mg/m^3)_final             48236
dtype: int64

In [31]:
# count values of CO(mg/m^3)_final > 10.305
df[df["CO(mg/m^3)_final"] > 6.87].count()

CO(mg/m^3)_initial           366857
Volume(m^3)                  366857
N_people_MA_prev_10min       366857
CO_produced(mg)              366857
CO_produced(mg/m^3)          366857
Ambient-Air-Pump(L/min)      366857
Ambient-Air-Pump_power(%)    366857
Ambient-Air-Pump_number      366857
CO_removed(mg)               366857
CO(mg/m^3)_final             366857
dtype: int64

In [32]:
# plot all the curves with plotly
x= 0
fig = px.line(df.iloc[:100],
              markers=True, title="Normalized 2 Air Quality Data")
fig.show()

In [33]:
df.describe()

Unnamed: 0,CO(mg/m^3)_initial,Volume(m^3),N_people_MA_prev_10min,CO_produced(mg),CO_produced(mg/m^3),Ambient-Air-Pump(L/min),Ambient-Air-Pump_power(%),Ambient-Air-Pump_number,CO_removed(mg),CO(mg/m^3)_final
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,5.156495,19.997888,2.498624,10.001244,0.549499,424.779264,25.003064,4.499885,2.724004,5.55477
std,2.975014,5.776837,3.224169,15.473836,0.911822,163.353961,32.284723,2.291805,5.448318,3.046797
min,7e-06,10.000027,0.0,0.0,0.0,141.585044,0.0,1.0,0.0,6.1e-05
25%,2.579592,14.988283,0.0,0.0,0.0,283.30842,0.0,3.0,0.0,3.012592
50%,5.160827,20.006333,0.004633,0.011424,0.000614,424.89582,0.0,4.0,0.0,5.539935
75%,7.737146,24.99722,4.993513,15.390168,0.789957,566.168646,50.007289,7.0,2.990771,8.041822
max,10.304997,29.999934,9.999948,76.220435,7.461639,707.919908,99.999663,8.0,77.063526,17.616381


In [34]:
df.columns

Index(['CO(mg/m^3)_initial', 'Volume(m^3)', 'N_people_MA_prev_10min',
       'CO_produced(mg)', 'CO_produced(mg/m^3)', 'Ambient-Air-Pump(L/min)',
       'Ambient-Air-Pump_power(%)', 'Ambient-Air-Pump_number',
       'CO_removed(mg)', 'CO(mg/m^3)_final'],
      dtype='object')

In [35]:
df.drop(columns=['CO_produced(mg)', 'CO_produced(mg/m^3)','CO_removed(mg)', 'CO_removed(mg)/m^3'], inplace=True)
df

KeyError: "['CO_removed(mg)/m^3'] not found in axis"

In [None]:
portion = int(size/5)
df_neutral = df.iloc[:portion].copy()
df_neutral["CO(mg/m^3)_initial"] = np.random.rand(portion) * 0.1
df_neutral["N_people_MA_prev_10min"] = np.random.rand(portion) * 0
df_neutral["CO(mg/m^3)_final"] = np.random.rand(portion) * 0.1
df_neutral

Unnamed: 0,CO(mg/m^3)_initial,Volume(m^3),N_people_MA_prev_10min,Ambient-Air-Pump(L/min),Ambient-Air-Pump_power(%),Ambient-Air-Pump_number,CO(mg/m^3)_final
0,0.061924,23.791121,0.0,676.278801,57.561797,8,0.016401
1,0.077828,29.490340,0.0,364.445631,72.454714,5,0.027245
2,0.021968,26.161183,0.0,165.086766,46.293581,8,0.072359
3,0.054172,13.616384,0.0,300.197701,0.000000,7,0.084852
4,0.091142,27.108231,0.0,467.855438,0.000000,3,0.061149
...,...,...,...,...,...,...,...
199995,0.023162,19.710060,0.0,236.547986,0.000000,4,0.041481
199996,0.026660,22.955374,0.0,392.463541,0.000000,8,0.084421
199997,0.085897,11.771147,0.0,518.231715,0.000000,1,0.001508
199998,0.038675,11.564071,0.0,530.232776,0.000000,4,0.080127


In [None]:
df = pd.concat([df, df_neutral])
df

Unnamed: 0,CO(mg/m^3)_initial,Volume(m^3),N_people_MA_prev_10min,Ambient-Air-Pump(L/min),Ambient-Air-Pump_power(%),Ambient-Air-Pump_number,CO(mg/m^3)_final
0,5.655523,23.791121,7.895086,676.278801,57.561797,8,5.556082
1,7.370026,29.490340,0.000000,364.445631,72.454714,5,7.176442
2,6.211477,26.161183,9.270920,165.086766,46.293581,8,6.754297
3,5.615021,13.616384,0.000000,300.197701,0.000000,7,5.546475
4,4.365763,27.108231,0.000000,467.855438,0.000000,3,4.397795
...,...,...,...,...,...,...,...
199995,0.023162,19.710060,0.000000,236.547986,0.000000,4,0.041481
199996,0.026660,22.955374,0.000000,392.463541,0.000000,8,0.084421
199997,0.085897,11.771147,0.000000,518.231715,0.000000,1,0.001508
199998,0.038675,11.564071,0.000000,530.232776,0.000000,4,0.080127


In [None]:
df.to_csv("../data/data.csv")