In [36]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [37]:
# Read the input table; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer='variables.csv', index_col=0)
df

Unnamed: 0,CO(mg/m^3),Volume(m^3),N_people,Ambient-Air-Pump(L/min),Ambient-Air-Pump_power(%),Ambient-Air-Pump_number,CO(mg/m^3)_Dt
0,5.655523,20.705141,0.000000,418.161628,29.781933,3,5.350641
1,7.370026,28.080885,5.872497,333.977500,63.226408,3,7.721080
2,6.211477,20.047931,0.000000,608.093630,86.656211,8,5.113339
3,5.615021,12.017400,9.508670,698.808887,86.251549,8,5.135903
4,4.365763,20.551640,3.429715,562.049570,20.594074,4,4.977466
...,...,...,...,...,...,...,...
19995,0.026110,14.521764,0.000000,504.581243,68.026447,4,0.063048
19996,0.061070,21.680377,0.000000,306.288852,11.767788,6,0.098633
19997,0.099518,26.692921,0.000000,257.708970,32.729522,7,0.077012
19998,0.066477,11.676167,0.000000,533.694343,0.000000,8,0.097035


In [38]:
# Disabled step: standardize every column with StandardScaler and rebuild the
# DataFrame under the original column names. While these lines stay commented
# out, `df` keeps the raw (unscaled) values read from the CSV.
#columns = df.columns.to_list()
#scaler = StandardScaler()
#df = scaler.fit_transform(df.values)
#df = pd.DataFrame(df, columns=columns)
#df

In [39]:
#from pyod.models.ecod import ECOD
#from pyod.models.knn import KNN
from pyod.models.iforest import IForest

# Isolation Forest relies on random sub-sampling and random splits, so an
# unseeded model can flag a different set of rows on every run. Pinning the
# seed keeps the predictions, the scores and the pickled model reproducible.
RANDOM_STATE = 42

# Detectors to fit, keyed by the name that prefixes their result columns.
models_dict = {
    #'ECOD': ECOD(),
    #'KNN': KNN(),
    'IForest': IForest(random_state=RANDOM_STATE),
}

# Gather one column per model ("<name>_p" = binary prediction, "<name>_s" = raw
# training score) and build each result frame once after the loop.
predictions_by_model = {}
scores_by_model = {}
for model_name, model in models_dict.items():
    print(f'Fitting {model_name}...')
    model.fit(df.values)
    predictions_by_model[model_name+"_p"] = model.predict(df.values)
    # decision_scores_ is populated by fit() for the rows the model was fitted on.
    scores_by_model[model_name+"_s"] = model.decision_scores_

df_results_pred = pd.DataFrame(predictions_by_model)
df_results_scores = pd.DataFrame(scores_by_model)

Fitting IForest...


In [40]:
# Count, per row, how many detectors predicted "outlier" and flag the rows on
# which every detector agrees.
# Only the per-model prediction columns are summed: summing the whole frame
# would also add the derived 'sum' and 'outlier' columns whenever this cell is
# re-run, inflating the vote count and breaking the consensus test.
prediction_columns = [f"{model_name}_p" for model_name in models_dict]
df_results_pred['sum'] = df_results_pred[prediction_columns].sum(axis=1)
df_results_pred['outlier'] = df_results_pred['sum'] == len(models_dict)
df_results_pred

Unnamed: 0,IForest_p,sum,outlier
0,0,0,False
1,0,0,False
2,0,0,False
3,1,1,True
4,0,0,False
...,...,...,...
119995,0,0,False
119996,0,0,False
119997,0,0,False
119998,0,0,False


In [47]:
# Number of rows for each vote count stored in the 'sum' column.
df_results_pred.loc[:, "sum"].value_counts()

sum
0    108000
1     12000
Name: count, dtype: int64

In [42]:
# Min-max normalize each model's score column to [0, 1], then average the
# normalized scores across models (rounded to 3 decimals).
# Only the per-model "<name>_s" columns are used, so re-running this cell does
# not fold a previously computed 'average_score' back into the mean.
score_columns = [f"{model_name}_s" for model_name in models_dict]
raw_scores = df_results_scores[score_columns]
score_min = raw_scores.min()
score_range = raw_scores.max() - score_min
# A constant column has zero range and produces NaN (0/0); those become 0.
normalized_scores = ((raw_scores - score_min) / score_range).fillna(0)
df_results_scores = normalized_scores.assign(
    average_score=normalized_scores.mean(axis=1).round(3)
)
df_results_scores

Unnamed: 0,IForest_s,average_score
0,0.112481,0.112
1,0.430378,0.430
2,0.430422,0.430
3,0.869972,0.870
4,0.236649,0.237
...,...,...
119995,0.311084,0.311
119996,0.199052,0.199
119997,0.386290,0.386
119998,0.385580,0.386


In [43]:
# Keep only the score rows whose index label is flagged in the boolean
# 'outlier' column of the prediction frame.
outliers_index = df_results_pred.loc[df_results_pred['outlier']].index
df_results_scores = df_results_scores.loc[outliers_index]
df_results_scores

Unnamed: 0,IForest_s,average_score
3,0.869972,0.870
8,0.846877,0.847
18,0.650938,0.651
52,0.622428,0.622
65,0.615936,0.616
...,...,...
119677,0.644344,0.644
119740,0.759263,0.759
119753,0.701689,0.702
119845,0.646923,0.647


In [44]:
# Scatter plot of the average score against the row index; marker size also
# encodes the score.
fig = px.scatter(
    df_results_scores,
    x=df_results_scores.index,
    y="average_score",
    size="average_score",
    size_max=20,
    opacity=0.5,
    title="Average score for each label",
)
fig.show()

In [45]:
# Add a 50-row moving average of 'average_score'. The first 49 rows have no
# complete window, so their moving average is NaN.
df_results_scores['mov_avg'] = (
    df_results_scores['average_score']
    .rolling(window=50)
    .mean()
)
df_results_scores

Unnamed: 0,IForest_s,average_score,mov_avg
3,0.869972,0.870,
8,0.846877,0.847,
18,0.650938,0.651,
52,0.622428,0.622,
65,0.615936,0.616,
...,...,...,...
119677,0.644344,0.644,0.62710
119740,0.759263,0.759,0.62928
119753,0.701689,0.702,0.63168
119845,0.646923,0.647,0.63148


In [46]:
# Line chart of the moving-average score against the row index.
# rolling(50) yields its first value at position 49, so slicing with iloc[50:]
# silently skipped one valid point. Dropping the NaN rows keeps every computed
# value and avoids repeating the window size here.
mov_avg_valid = df_results_scores.dropna(subset=["mov_avg"])
fig = px.line(mov_avg_valid, x=mov_avg_valid.index,
              y="mov_avg",
              title="Moving average score for each label")
fig.show()

In [48]:
# Serialize the fitted IForest detector to a pickle file in the working directory.
# Only unpickle this file from a trusted location: loading a pickle can execute
# arbitrary code.
import pickle

filename = 'iforest_model_variables.pkl'
with open(filename, mode='wb') as model_file:
    pickle.dump(models_dict['IForest'], model_file)