In [14]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [15]:
# Load the detection table: one column per object class (person, bicycle, ...),
# one row per sample. Values are presumably per-class detection counts.
df = pd.read_csv(filepath_or_buffer='detected.csv')
df

Unnamed: 0,person,bicycle,car,motorcycle,airplane,bus,train,truck,boat,traffic light,...,toaster,sink,refrigerator,book,clock,vase,scissors,teddy bear,hair drier,toothbrush
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,7,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,8,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,7,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,7,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,7,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1926,1,0,0,0,0,0,0,0,0,0,...,0,0,0,5,0,0,0,0,0,0
1927,1,0,0,0,0,0,0,0,0,0,...,0,0,0,5,0,0,0,0,0,0
1928,1,0,0,0,0,0,0,0,0,0,...,0,0,0,4,0,0,0,0,0,0
1929,1,0,0,0,0,0,0,0,0,0,...,0,0,0,5,0,0,0,0,0,0


In [16]:
# Optional feature standardization, currently disabled: with these lines
# commented out, the model in the next cell is fitted on the raw values of df.
# NOTE(review): re-enabling this overwrites df with the scaled frame, so the
# original counts are no longer available afterwards - confirm that is intended.
#columns = df.columns.to_list()
#scaler = StandardScaler()
#df = scaler.fit_transform(df.values)
#df = pd.DataFrame(df, columns=columns)
#df

In [17]:
#from pyod.models.ecod import ECOD
#from pyod.models.knn import KNN
from pyod.models.iforest import IForest

# Fixed seed: Isolation Forest builds randomized trees, so without it the
# flagged rows, the scores and the pickled model differ on every run.
RANDOM_STATE = 42

# Result tables, one column per model, with rows aligned to df:
#   df_results_pred   -> "<model>_p": binary prediction (1 = flagged)
#   df_results_scores -> "<model>_s": anomaly score from the fitted model
df_results_pred = pd.DataFrame(index=df.index)
df_results_scores = pd.DataFrame(index=df.index)

# Detectors to run; uncomment the extra entries (and their imports) to build
# an ensemble.
models_dict = {
    #'ECOD': ECOD(),
    #'KNN': KNN(),
    'IForest': IForest(random_state=RANDOM_STATE),
}

for model_name, model in models_dict.items():
    print(f'Fitting {model_name}...')
    # Fit and predict on the same data: there is no separate hold-out set here.
    model.fit(df.values)
    predictions = model.predict(df.values)
    scores = model.decision_scores_
    df_results_pred[model_name+"_p"] = predictions
    df_results_scores[model_name+"_s"] = scores

Fitting IForest...


In [18]:
# Count, per row, how many models flagged it, and mark a row as an outlier
# only when every model agrees.
# Only the per-model "<name>_p" columns are summed, so re-running this cell
# does not fold the derived 'sum' / 'outlier' columns back into the vote.
pred_cols = [f'{model_name}_p' for model_name in models_dict]
df_results_pred['sum'] = df_results_pred[pred_cols].sum(axis=1)
df_results_pred['outlier'] = df_results_pred['sum'] == len(models_dict)
df_results_pred

Unnamed: 0,IForest_p,sum,outlier
0,0,0,False
1,0,0,False
2,0,0,False
3,0,0,False
4,0,0,False
...,...,...,...
1926,0,0,False
1927,0,0,False
1928,0,0,False
1929,0,0,False


In [19]:
# Distribution of the per-row vote count (how many rows got 0, 1, ... flags).
df_results_pred.loc[:, "sum"].value_counts()

sum
0    1740
1     191
Name: count, dtype: int64

In [20]:
# Min-max normalization of the per-model scores is currently disabled:
#df_results_scores = (df_results_scores - df_results_scores.min()) / (df_results_scores.max() - df_results_scores.min())
# Replace missing scores with 0.
df_results_scores = df_results_scores.fillna(0)
# Average only the per-model "<name>_s" columns, rounded to 3 decimals, so
# re-running this cell does not average in the derived columns
# ('average_score', and 'mov_avg' once it exists).
score_cols = [f'{model_name}_s' for model_name in models_dict]
df_results_scores["average_score"] = df_results_scores[score_cols].mean(axis=1).round(3)
df_results_scores

Unnamed: 0,IForest_s,average_score
0,-0.150288,-0.150
1,-0.148937,-0.149
2,-0.150288,-0.150
3,-0.150288,-0.150
4,-0.150288,-0.150
...,...,...
1926,-0.146530,-0.147
1927,-0.146530,-0.147
1928,-0.136505,-0.137
1929,-0.146530,-0.147


In [21]:
# Index labels of the rows flagged as outliers; 'outlier' is already boolean,
# so it is used directly as the row mask.
outliers_index = df_results_pred.loc[df_results_pred['outlier']].index
# From here on df_results_scores holds the flagged rows only.
df_results_scores = df_results_scores.loc[outliers_index, :]
df_results_scores

Unnamed: 0,IForest_s,average_score
137,0.022751,0.023
149,0.044559,0.045
150,0.044559,0.045
151,0.093820,0.094
164,0.022751,0.023
...,...,...
1779,0.004953,0.005
1791,0.004953,0.005
1798,0.023397,0.023
1802,0.004953,0.005


In [22]:
# Scatter of the average anomaly score of each remaining row against its
# index label; marker size also encodes the score.
fig = px.scatter(
    data_frame=df_results_scores,
    x=df_results_scores.index,
    y="average_score",
    size="average_score",
    size_max=20,
    opacity=0.5,
    title="Average score for each label",
)
fig.show()

In [23]:
# Show only the non-zero entries of the row at position 1539.
df.iloc[1539].loc[lambda row: row != 0]

person    1
chair     2
remote    3
book      8
Name: 1539, dtype: int64

In [24]:
# Add a moving-average column: mean of 'average_score' over a 50-row window.
# The leading rows are NaN until the window is full.
df_results_scores['mov_avg'] = (
    df_results_scores['average_score']
    .rolling(window=50)
    .mean()
)
df_results_scores

Unnamed: 0,IForest_s,average_score,mov_avg
137,0.022751,0.023,
149,0.044559,0.045,
150,0.044559,0.045,
151,0.093820,0.094,
164,0.022751,0.023,
...,...,...,...
1779,0.004953,0.005,0.03654
1791,0.004953,0.005,0.03596
1798,0.023397,0.023,0.03606
1802,0.004953,0.005,0.03582


In [25]:
# Line chart of the moving-average score.
# Drop exactly the warm-up rows whose rolling window is incomplete
# (mov_avg is NaN). A 50-row window yields its first value at position 49,
# so a fixed slice starting at position 50 would lose that first point.
mov_avg_valid = df_results_scores.dropna(subset=["mov_avg"])
fig = px.line(mov_avg_valid, x=mov_avg_valid.index,
                  y="mov_avg",
                  title="Moving average score")
fig.show()

In [26]:
# Persist the fitted IForest model to disk as a pickle.
# NOTE: only unpickle this file from a trusted location; loading a pickle
# executes arbitrary code.
import pickle
filename = 'iforest_model_detected.pkl'
with open(filename, 'wb') as file:
    file.write(pickle.dumps(models_dict['IForest']))