In [62]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import pickle

In [63]:
# Read the variables table (first CSV column becomes the index) and discard the
# "CO(mg/m^3)_Dt" column in the same expression, so `df` is bound exactly once.
df = (
    pd.read_csv('variables.csv', index_col=0)
    .drop(columns=["CO(mg/m^3)_Dt"])
)
df

Unnamed: 0,CO(mg/m^3),Volume(m^3),N_people,Ambient-Air-Pump(L/min),Ambient-Air-Pump_power(%),Ambient-Air-Pump_number
0,5.655523,20.705141,0.000000,418.161628,29.781933,3
1,7.370026,28.080885,5.872497,333.977500,63.226408,3
2,6.211477,20.047931,0.000000,608.093630,86.656211,8
3,5.615021,12.017400,9.508670,698.808887,86.251549,8
4,4.365763,20.551640,3.429715,562.049570,20.594074,4
...,...,...,...,...,...,...
19995,0.026110,14.521764,0.000000,504.581243,68.026447,4
19996,0.061070,21.680377,0.000000,306.288852,11.767788,6
19997,0.099518,26.692921,0.000000,257.708970,32.729522,7
19998,0.066477,11.676167,0.000000,533.694343,0.000000,8


In [64]:
# Disabled standardization step: with this cell commented out, the detector
# below is fitted on the raw, unscaled `df.values`.
# NOTE(review): if re-enabled, this rebinds `df` to the scaled frame (and drops
# the original index), so every later cell that reads `df` — including the
# `df.iloc[0]` sample passed to the predictor — would see scaled values.
#columns = df.columns.to_list()
#scaler = StandardScaler()
#df = scaler.fit_transform(df.values)
#df = pd.DataFrame(df, columns=columns)
#df

In [65]:
#from pyod.models.ecod import ECOD
#from pyod.models.knn import KNN
from pyod.models.iforest import IForest

# Isolation forest is a randomized ensemble: without a fixed seed the scores,
# the labels and the pickled model differ on every Restart & Run All.
RANDOM_STATE = 42

# One column per fitted model:
#   "<name>_p" -> binary outlier label for each training row
#   "<name>_s" -> raw anomaly score for each training row
df_results_pred = pd.DataFrame()
df_results_scores = pd.DataFrame()

models_dict = {
    #'ECOD': ECOD(),
    #'KNN': KNN(),
    'IForest': IForest(random_state=RANDOM_STATE),
}

for model_name, model in models_dict.items():
    print(f'Fitting {model_name}...')
    model.fit(df.values)
    # fit() already stores the training-set labels and scores; reading them
    # avoids a second scoring pass over the full dataset and keeps the labels
    # consistent with the scores stored in the sibling column.
    predictions = model.labels_
    scores = model.decision_scores_
    df_results_pred[model_name+"_p"] = predictions
    df_results_scores[model_name+"_s"] = scores

Fitting IForest...


In [66]:
# Flag the rows that every fitted model labelled as an outlier.
# Only the per-model prediction columns ("<name>_p") are summed, so re-running
# this cell does not fold the derived 'sum' / 'outlier' columns back into the
# vote count.
prediction_columns = [f"{name}_p" for name in models_dict]
df_results_pred['sum'] = df_results_pred[prediction_columns].sum(axis=1)
# A row is a consensus outlier when its vote count equals the number of models.
df_results_pred['outlier'] = df_results_pred['sum'] == len(models_dict)
df_results_pred

Unnamed: 0,IForest_p,sum,outlier
0,0,0,False
1,0,0,False
2,0,0,False
3,1,1,True
4,0,0,False
...,...,...,...
119995,0,0,False
119996,0,0,False
119997,0,0,False
119998,0,0,False


In [67]:
# How many rows received each number of outlier votes.
df_results_pred.loc[:, "sum"].value_counts()

sum
0    108000
1     12000
Name: count, dtype: int64

In [68]:
# Min-max normalize each model's score column to [0, 1], then average the
# normalized scores per row.
# Only the raw per-model columns ("<name>_s") take part, so derived columns
# such as 'average_score' never feed back into the normalization or the mean
# when this cell is executed again.
score_columns = [f"{name}_s" for name in models_dict]
raw_scores = df_results_scores[score_columns]
score_min = raw_scores.min()
score_range = raw_scores.max() - score_min
# A constant column gives 0/0 = NaN; those (and any other NaN) are set to 0.
normalized_scores = ((raw_scores - score_min) / score_range).fillna(0)
normalized_scores["average_score"] = normalized_scores.mean(axis=1).round(3)
df_results_scores = normalized_scores
df_results_scores

Unnamed: 0,IForest_s,average_score
0,0.141933,0.142
1,0.475742,0.476
2,0.525590,0.526
3,0.854518,0.855
4,0.222027,0.222
...,...,...
119995,0.196949,0.197
119996,0.125697,0.126
119997,0.302068,0.302
119998,0.309553,0.310


In [69]:
# Keep only the score rows whose index was flagged as a consensus outlier.
# The 'outlier' column is already boolean, so it serves directly as the mask.
is_outlier = df_results_pred['outlier']
outliers_index = df_results_pred.loc[is_outlier].index
df_results_scores = df_results_scores.loc[outliers_index]
df_results_scores

Unnamed: 0,IForest_s,average_score
3,0.854518,0.855
8,0.895783,0.896
10,0.626179,0.626
18,0.729052,0.729
34,0.597499,0.597
...,...,...
119352,0.639375,0.639
119566,0.589465,0.589
119677,0.620298,0.620
119740,0.682702,0.683


In [70]:
# Scatter plot (plotly) of the averaged score for the retained rows: row index
# on the x axis, average score on the y axis and as the marker size.
scatter_kwargs = dict(
    x=df_results_scores.index,
    y="average_score",
    title="Average score all",
    size="average_score",
    size_max=20,
    opacity=0.5,
)
fig = px.scatter(df_results_scores, **scatter_kwargs)
fig.show()

In [71]:
# Add a moving-average column: the mean of average_score over a 50-row rolling
# window. Rows before the window is full are left as NaN.
score_window = df_results_scores['average_score'].rolling(window=50)
df_results_scores['mov_avg'] = score_window.mean()
df_results_scores

Unnamed: 0,IForest_s,average_score,mov_avg
3,0.854518,0.855,
8,0.895783,0.896,
10,0.626179,0.626,
18,0.729052,0.729,
34,0.597499,0.597,
...,...,...,...
119352,0.639375,0.639,0.63200
119566,0.589465,0.589,0.63110
119677,0.620298,0.620,0.63052
119740,0.682702,0.683,0.63060


In [72]:
# Line chart (plotly) of the moving-average score.
# Rows are selected by "mov_avg is defined" rather than by a hard-coded
# position: a 50-row rolling window yields its first value at position 49, so
# slicing from position 50 silently dropped one valid point and duplicated the
# window size as a magic number.
mov_avg_defined = df_results_scores.dropna(subset=["mov_avg"])
fig = px.line(mov_avg_defined, x=mov_avg_defined.index,
                  y="mov_avg",
                  title="Moving average score for each label")
fig.show()

In [73]:
# Save the fitted IForest model to a pickle file.
# The model is written once, inside a context manager so the file handle is
# always closed; the former second dump re-wrote the same file through an
# `open()` handle that was never closed.
filename = 'iforest_model_variables.pkl'
with open(filename, 'wb') as file:
    pickle.dump(models_dict['IForest'], file)

In [74]:
# Load the previously pickled predictor.
# NOTE(review): unpickling can execute arbitrary code — only load
# 'predictor.pkl' if it comes from a trusted source.
# The context manager closes the file handle, which the bare `open()` never did.
with open('predictor.pkl', 'rb') as predictor_file:
    linear_regression_model = pickle.load(predictor_file)

In [75]:
# Display the first row of the variables table as a Series.
df.iloc[0, :]

CO(mg/m^3)                     5.655523
Volume(m^3)                   20.705141
N_people                       0.000000
Ambient-Air-Pump(L/min)      418.161628
Ambient-Air-Pump_power(%)     29.781933
Ambient-Air-Pump_number        3.000000
Name: 0, dtype: float64

In [76]:
# Turn the first row into a plain {column name: value} dict used as the sample
# "status" in the cells below.
first_row = df.iloc[0]
dict_status = first_row.to_dict()
dict_status

{'CO(mg/m^3)': 5.655523157971081,
 'Volume(m^3)': 20.70514146559748,
 'N_people': 0.0,
 'Ambient-Air-Pump(L/min)': 418.1616283044987,
 'Ambient-Air-Pump_power(%)': 29.781933167111653,
 'Ambient-Air-Pump_number': 3.0}

In [77]:
# Wrap the status dict in a one-row DataFrame (columns named after its keys),
# run the loaded predictor on it and take the single value at [0][0].
status_frame = pd.DataFrame([dict_status.values()], columns=dict_status.keys())
linear_regression_model.predict(status_frame)[0][0]

5.516447640798208

In [78]:
# The status values as a 2-D array with a single row (shape (1, n_values)).
np.array([list(dict_status.values())])

array([[  5.65552316,  20.70514147,   0.        , 418.1616283 ,
         29.78193317,   3.        ]])

In [79]:
# Score the status sample with the fitted IForest: the values are shaped as a
# single-row 2-D array and the first (only) returned score is displayed.
status_row = np.atleast_2d(list(dict_status.values()))
models_dict['IForest'].decision_function(status_row)[0]

-0.10839061492791285