In [42]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler

In [43]:
# Function to add random noise
def add_noise(x):
    noise = (np.random.rand(1)[0]*2 -1)* 0.05 *x  # Generating random noise (5% of each element)
    return x + noise

In [44]:
np.random.seed(0)
size = 100000
CO = np.random.rand(size) * 10.305
mq = np.random.rand(size) * 20 + 10 # 1m^2 per person = 3m^3 per person
df = pd.DataFrame(CO, columns=["CO(mg/m^3)"])
df["Volume(m^3)"] = mq
df["N_people"] = np.random.rand(size) * 10
df["N_people"] = df["N_people"].apply(lambda x: 0.0 if np.random.rand(1)[0] > 0.5 else x)
df["CO_produced(mg)"] = (df["N_people"] * ( (np.random.rand(size) * 0.38 + 0.02)/60 ) * 1144)
df["CO_produced(mg/m^3)"] = df["CO_produced(mg)"]  / df["Volume(m^3)"]
df["Ambient-Air-Pump(L/min)"] = (np.random.rand(size) * 20 + 5) * 28.3168
df["Ambient-Air-Pump_power(%)"] = np.random.rand(size) * 100
df["Ambient-Air-Pump_number"] = np.random.randint(1, 9, size)
# half values of Ambient-Air-Pump_power(%) to zero
df["Ambient-Air-Pump_power(%)"] = df["Ambient-Air-Pump_power(%)"].apply(lambda x: 0.0 if np.random.rand(1)[0] > 0.5 else x)
# random value between 0.02 and 0.38 (m^3)CO person per hour
df["CO_removed(mg)"] =  (df["Ambient-Air-Pump(L/min)"] * (df["Ambient-Air-Pump_power(%)"] / 100) * 0.001) * (df["CO(mg/m^3)"]+df["CO_produced(mg/m^3)"])
# add noise to CO_removed(mg)
df["CO_removed(mg)"] = df["CO_removed(mg)"].apply(lambda x: add_noise(x))
df["CO_removed(mg)"] = df["CO_removed(mg)"] *  df["Ambient-Air-Pump_number"]
df["CO(mg/m^3)_Dt"] = (df["CO(mg/m^3)"] * df["Volume(m^3)"] + df["CO_produced(mg)"] - df["CO_removed(mg)"])/df["Volume(m^3)"]
# add noise to CO(mg/m^3)_final
df["CO(mg/m^3)_Dt"] = df["CO(mg/m^3)_Dt"].apply(lambda x: add_noise(x))
df["CO(mg/m^3)_Dt"] = df["CO(mg/m^3)_Dt"].apply(lambda x: 0 if x <0 else x)
df.head(20)

Unnamed: 0,CO(mg/m^3),Volume(m^3),N_people,CO_produced(mg),CO_produced(mg/m^3),Ambient-Air-Pump(L/min),Ambient-Air-Pump_power(%),Ambient-Air-Pump_number,CO_removed(mg),CO(mg/m^3)_Dt
0,5.655523,20.705141,0.0,0.0,0.0,418.161628,29.781933,3,2.036879,5.350641
1,7.370026,28.080885,5.872497,25.241117,0.898872,333.9775,63.226408,3,4.978281,7.72108
2,6.211477,20.047931,0.0,0.0,0.0,608.09363,86.656211,8,26.448582,5.113339
3,5.615021,12.0174,9.50867,41.027221,3.413985,698.808887,86.251549,8,44.486658,5.135903
4,4.365763,20.55164,3.429715,10.477506,0.509814,562.04957,20.594074,4,2.358848,4.977466
5,6.655939,24.224579,0.0,0.0,0.0,189.129873,0.0,4,0.0,6.458114
6,4.509336,16.259086,5.857313,12.309248,0.757069,634.707358,0.0,6,0.0,5.357209
7,9.189721,11.006507,0.0,0.0,0.0,234.860143,68.459957,6,8.996612,8.306562
8,9.930545,12.465641,8.532363,53.404426,4.28413,697.523011,99.929402,6,58.922566,9.358962
9,3.951365,25.593815,5.303135,4.871354,0.190333,613.32125,0.0,5,0.0,4.021957


In [45]:
df[["CO_produced(mg)","CO_removed(mg)"]].describe()

Unnamed: 0,CO_produced(mg),CO_removed(mg)
count,100000.0,100000.0
mean,9.970757,2.690544
std,15.395838,5.381303
min,0.0,0.0
25%,0.0,0.0
50%,0.099907,0.0
75%,15.414612,2.959772
max,76.125072,69.315856


In [46]:
max(df["CO_produced(mg)"] - df["CO_removed(mg)"])

76.12507219548829

In [47]:
min(df["CO_produced(mg)"] - df["CO_removed(mg)"])

-51.06793514980636

In [48]:
max(df["CO(mg/m^3)_Dt"] - df["CO(mg/m^3)"])

7.456573686977912

In [49]:
min(df["CO(mg/m^3)_Dt"] - df["CO(mg/m^3)"])

-4.063558295372724

In [50]:
df[df["CO(mg/m^3)_Dt"] < df["CO(mg/m^3)"]].count()

CO(mg/m^3)                   39163
Volume(m^3)                  39163
N_people                     39163
CO_produced(mg)              39163
CO_produced(mg/m^3)          39163
Ambient-Air-Pump(L/min)      39163
Ambient-Air-Pump_power(%)    39163
Ambient-Air-Pump_number      39163
CO_removed(mg)               39163
CO(mg/m^3)_Dt                39163
dtype: int64

In [51]:
df[df["CO(mg/m^3)_Dt"] < 0.0].count()

CO(mg/m^3)                   0
Volume(m^3)                  0
N_people                     0
CO_produced(mg)              0
CO_produced(mg/m^3)          0
Ambient-Air-Pump(L/min)      0
Ambient-Air-Pump_power(%)    0
Ambient-Air-Pump_number      0
CO_removed(mg)               0
CO(mg/m^3)_Dt                0
dtype: int64

In [52]:
# count values of CO(mg/m^3)_final > 10.305
df[df["CO(mg/m^3)_Dt"] > 10.305].count()

CO(mg/m^3)                   4738
Volume(m^3)                  4738
N_people                     4738
CO_produced(mg)              4738
CO_produced(mg/m^3)          4738
Ambient-Air-Pump(L/min)      4738
Ambient-Air-Pump_power(%)    4738
Ambient-Air-Pump_number      4738
CO_removed(mg)               4738
CO(mg/m^3)_Dt                4738
dtype: int64

In [53]:
# count values of CO(mg/m^3)_final > 10.305
df[df["CO(mg/m^3)_Dt"] > 6.87].count()

CO(mg/m^3)                   36543
Volume(m^3)                  36543
N_people                     36543
CO_produced(mg)              36543
CO_produced(mg/m^3)          36543
Ambient-Air-Pump(L/min)      36543
Ambient-Air-Pump_power(%)    36543
Ambient-Air-Pump_number      36543
CO_removed(mg)               36543
CO(mg/m^3)_Dt                36543
dtype: int64

In [54]:
# plot all the curves with plotly
x= 0
fig = px.line(df.iloc[:100],
              markers=True, title="Normalized 2 Air Quality Data")
fig.show()

In [55]:
df.describe()

Unnamed: 0,CO(mg/m^3),Volume(m^3),N_people,CO_produced(mg),CO_produced(mg/m^3),Ambient-Air-Pump(L/min),Ambient-Air-Pump_power(%),Ambient-Air-Pump_number,CO_removed(mg),CO(mg/m^3)_Dt
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,5.147348,20.045674,2.49166,9.970757,0.546575,425.034166,25.013895,4.484,2.690544,5.544861
std,2.978533,5.77402,3.215172,15.395838,0.905888,163.68391,32.272862,2.295072,5.381303,3.046434
min,3.4e-05,10.000271,0.0,0.0,0.0,141.584755,0.0,1.0,0.0,3.3e-05
25%,2.556112,15.040729,0.0,0.0,0.0,283.370453,0.0,2.0,0.0,2.999801
50%,5.145504,20.086929,0.041643,0.099907,0.005004,425.73062,0.0,4.0,0.0,5.528771
75%,7.727479,25.064673,4.977854,15.414612,0.787751,567.177713,49.925584,6.0,2.959772,8.04226
max,10.304773,29.99998,9.999944,76.125072,7.297462,707.913997,99.997702,8.0,69.315856,17.488292


In [56]:
df.columns

Index(['CO(mg/m^3)', 'Volume(m^3)', 'N_people', 'CO_produced(mg)',
       'CO_produced(mg/m^3)', 'Ambient-Air-Pump(L/min)',
       'Ambient-Air-Pump_power(%)', 'Ambient-Air-Pump_number',
       'CO_removed(mg)', 'CO(mg/m^3)_Dt'],
      dtype='object')

In [57]:
df.drop(columns=['CO_produced(mg)', 'CO_produced(mg/m^3)','CO_removed(mg)'], inplace=True)
df

Unnamed: 0,CO(mg/m^3),Volume(m^3),N_people,Ambient-Air-Pump(L/min),Ambient-Air-Pump_power(%),Ambient-Air-Pump_number,CO(mg/m^3)_Dt
0,5.655523,20.705141,0.000000,418.161628,29.781933,3,5.350641
1,7.370026,28.080885,5.872497,333.977500,63.226408,3,7.721080
2,6.211477,20.047931,0.000000,608.093630,86.656211,8,5.113339
3,5.615021,12.017400,9.508670,698.808887,86.251549,8,5.135903
4,4.365763,20.551640,3.429715,562.049570,20.594074,4,4.977466
...,...,...,...,...,...,...,...
99995,7.480617,17.373262,3.972694,536.936272,0.000000,1,8.572045
99996,5.229999,16.101459,3.442116,164.472045,64.028070,1,6.990580
99997,8.348419,14.292490,0.000000,569.316123,0.000000,7,8.552708
99998,5.678388,15.915532,3.359384,262.680757,53.566043,4,6.975994


In [58]:
portion = int(size/5)
df_neutral = df.iloc[:portion].copy()
df_neutral["CO(mg/m^3)"] = np.random.rand(portion) * 0.1
df_neutral["N_people"] = np.random.rand(portion) * 0
df_neutral["CO(mg/m^3)_Dt"] = np.random.rand(portion) * 0.1
df_neutral

Unnamed: 0,CO(mg/m^3),Volume(m^3),N_people,Ambient-Air-Pump(L/min),Ambient-Air-Pump_power(%),Ambient-Air-Pump_number,CO(mg/m^3)_Dt
0,0.094053,20.705141,0.0,418.161628,29.781933,3,0.034328
1,0.075952,28.080885,0.0,333.977500,63.226408,3,0.049857
2,0.053361,20.047931,0.0,608.093630,86.656211,8,0.091733
3,0.009153,12.017400,0.0,698.808887,86.251549,8,0.082107
4,0.018269,20.551640,0.0,562.049570,20.594074,4,0.043626
...,...,...,...,...,...,...,...
19995,0.026110,14.521764,0.0,504.581243,68.026447,4,0.063048
19996,0.061070,21.680377,0.0,306.288852,11.767788,6,0.098633
19997,0.099518,26.692921,0.0,257.708970,32.729522,7,0.077012
19998,0.066477,11.676167,0.0,533.694343,0.000000,8,0.097035


In [59]:
df = pd.concat([df, df_neutral])
df

Unnamed: 0,CO(mg/m^3),Volume(m^3),N_people,Ambient-Air-Pump(L/min),Ambient-Air-Pump_power(%),Ambient-Air-Pump_number,CO(mg/m^3)_Dt
0,5.655523,20.705141,0.000000,418.161628,29.781933,3,5.350641
1,7.370026,28.080885,5.872497,333.977500,63.226408,3,7.721080
2,6.211477,20.047931,0.000000,608.093630,86.656211,8,5.113339
3,5.615021,12.017400,9.508670,698.808887,86.251549,8,5.135903
4,4.365763,20.551640,3.429715,562.049570,20.594074,4,4.977466
...,...,...,...,...,...,...,...
19995,0.026110,14.521764,0.000000,504.581243,68.026447,4,0.063048
19996,0.061070,21.680377,0.000000,306.288852,11.767788,6,0.098633
19997,0.099518,26.692921,0.000000,257.708970,32.729522,7,0.077012
19998,0.066477,11.676167,0.000000,533.694343,0.000000,8,0.097035


In [60]:
df.to_csv("variables.csv")