In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from adtk.visualization import plot
from adtk.data import validate_series

In [None]:
# Read the semicolon-separated export with explicit column names; because no
# header row is declared, the file's first line lands in row 0 (presumably the
# original header - confirm against the file) and is discarded right away.
data = pd.read_csv('TEMP_selected.csv', delimiter=';', names=['muestra', 'temp'])
data = data.drop(index=data.index[0])

# Index the readings by the period parsed from 'muestra', at 10-minute frequency.
data = data.set_index(pd.PeriodIndex(np.array(data.muestra), freq="10min"))

In [None]:
# 'muestra' is not used as a column below, so drop it.
data = data.drop(columns="muestra")

# Readings use a decimal comma: swap it for a dot, then convert the text to float.
data['temp'] = data['temp'].str.replace(',', '.').astype(float)

# Convert the index to timestamps via to_timestamp().
data.index = data.index.to_timestamp()

In [None]:
# Missing readings are replaced with 0 before any detector sees the series.
data['temp'] = data['temp'].fillna(value=0)

# Report how many nulls remain in 'temp' after the fill.
print(data['temp'].isna().sum())

# Visual check of the cleaned series using the adtk plotting helper.
plot(data)

# Snapshot of the cleaned frame; each detector section below starts from a copy of it.
orig_data = data.copy(deep=True)

## Árboles de Decisión: Isolation Forest

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# Work on a copy so the shared `orig_data` snapshot is left untouched.
data = orig_data.copy()
orig_index = data.index

# Fraction of samples the forest is told to treat as outliers.
outlier_frac = 0.05

# Standardise the readings before fitting, keeping the original time index.
scaler = StandardScaler()
np_scaled = scaler.fit_transform(data.values.reshape(-1, 1))
data = pd.DataFrame(np_scaled, index=orig_index, columns=['temp'])

# Isolation Forest builds randomised trees; without a fixed random_state the
# set of flagged points changes on every run, so the seed is pinned here.
model = IsolationForest(contamination=outlier_frac, random_state=42)
model.fit(data)

# Rows predicted as -1 are the ones highlighted as anomalies below.
data['anomaly'] = model.predict(data)

fig, ax = plt.subplots(figsize=(10,6))
a = data.loc[data['anomaly'] == -1, ['temp']]  # anomalous rows only
ax.plot(data.index, data['temp'], color='black', label = 'Normal')
ax.scatter(a.index,a['temp'], color='red', label = 'Anomaly')
plt.legend()
plt.show()

## Predicción: fbprophet

In [None]:
from fbprophet import Prophet

def detect_anomalies(forecast):
    """Flag observations that fall outside the forecast's uncertainty band.

    Parameters
    ----------
    forecast : pandas.DataFrame
        Must contain the columns 'ds', 'trend', 'yhat', 'yhat_lower',
        'yhat_upper' and 'fact' (the observed value). It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of those six columns plus:

        * 'anomaly': 1 where fact > yhat_upper, -1 where fact < yhat_lower,
          0 otherwise.
        * 'importance': for flagged rows, the distance between the observation
          and the violated bound divided by the observation; 0.0 elsewhere.
          A flagged observation of exactly 0 yields an infinite importance.
    """
    forecasted = forecast[['ds', 'trend', 'yhat', 'yhat_lower', 'yhat_upper', 'fact']].copy()

    above = forecasted['fact'] > forecasted['yhat_upper']
    below = forecasted['fact'] < forecasted['yhat_lower']

    forecasted['anomaly'] = 0
    forecasted.loc[above, 'anomaly'] = 1
    forecasted.loc[below, 'anomaly'] = -1

    # Create the column as float from the start: the ratios assigned below are
    # floats, and writing them into an integer column is a deprecated upcast in
    # recent pandas releases.
    forecasted['importance'] = 0.0
    forecasted.loc[above, 'importance'] = \
        (forecasted['fact'] - forecasted['yhat_upper']) / forecasted['fact']
    forecasted.loc[below, 'importance'] = \
        (forecasted['yhat_lower'] - forecasted['fact']) / forecasted['fact']

    return forecasted

# Build the two-column frame handed to Prophet: timestamps in 'ds', readings in 'y'.
data = pd.DataFrame({'ds': orig_data.index})
data['y'] = orig_data.values

# Settings forwarded to the Prophet constructor below.
interval_width = 0.99
changepoint_range = 0.95

# Additive model with all built-in seasonalities switched off.
m = Prophet(
    seasonality_mode='additive',
    yearly_seasonality=False,
    weekly_seasonality=False,
    daily_seasonality=False,
    interval_width=interval_width,
    changepoint_range=changepoint_range,
).fit(data)

# Predict on the training frame itself and attach the observed value as 'fact'
# so that detect_anomalies can compare it against the forecast band.
pred = m.predict(data)
pred['fact'] = data['y'].reset_index(drop=True)
pred = detect_anomalies(pred)

# Number of rows flagged in either direction.
print((pred['anomaly'] != 0).sum())

# Forecast columns only, for Prophet's own plot.
norm = pred[['ds', 'trend', 'yhat', 'yhat_lower', 'yhat_upper']].copy()
m.plot(norm)

# Observations (black), flagged observations (red) and forecast (cyan).
a = pred.loc[pred['anomaly'] != 0, ['ds', 'fact']]  # flagged rows only
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(pred['ds'], pred['fact'], label='Normal', color='black')
ax.scatter(a['ds'], a['fact'], label='Anomaly', color='red')
ax.scatter(pred['ds'], pred['yhat'], label='Forecasted', color='cyan')
ax.legend()
plt.show()

## Agrupaciones
### K-Means

In [None]:
from sklearn.cluster import KMeans

def getDistanceByPoint(data, model):
    """Return each sample's Euclidean distance to the centroid of its own cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per sample, one column per feature; the same rows, in the same
        order, that produced ``model.labels_``.
    model : fitted clustering model
        Must expose ``cluster_centers_`` (one row per cluster) and ``labels_``
        (the 0-based cluster index of every sample).

    Returns
    -------
    pandas.Series
        Distances, indexed like ``data``.

    Raises
    ------
    ValueError
        If ``model.labels_`` does not have one entry per row of ``data``.
    """
    points = np.asarray(data, dtype=float)
    if points.ndim == 1:
        points = points.reshape(-1, 1)

    labels = np.asarray(model.labels_)
    if len(labels) != len(points):
        raise ValueError(
            "model.labels_ has %d entries but data has %d rows"
            % (len(labels), len(points))
        )

    # labels_ are already 0-based indices into cluster_centers_, so they are
    # used directly; subtracting 1 would pair every sample with the wrong
    # centroid (and wrap label 0 around to the last one).
    centroids = np.asarray(model.cluster_centers_, dtype=float)[labels]

    distances = np.linalg.norm(points - centroids, axis=1)
    return pd.Series(distances, index=data.index)

# Work on a copy so the shared `orig_data` snapshot is left untouched.
data = orig_data.copy()
outlier_frac = 0.05

# Candidate cluster counts for the elbow curve.
n_cluster = range(1,20)

# K-Means starts from random centroids; a fixed random_state keeps the elbow
# curve and the flagged points identical between runs.
kmeans = [KMeans(n_clusters=i, random_state=42).fit(data.values) for i in n_cluster]
scores = [km.score(data.values) for km in kmeans]

fig, ax = plt.subplots(figsize=(10,6))
ax.plot(n_cluster, scores)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.show()

# Cluster count used for the anomaly step; looking it up in n_cluster keeps the
# list position (index 7) tied to the value it stands for (8 clusters).
chosen_k = 8
chosen_kmeans = kmeans[n_cluster.index(chosen_k)]

labels = chosen_kmeans.predict(data.values)
uniq_elem, count_elem = np.unique(labels, return_counts=True)

# Cluster ids (row 0) and their sizes (row 1), kept for inspection.
clusters = np.asanyarray((uniq_elem,count_elem))

distance = getDistanceByPoint(data, chosen_kmeans)

# The outlier_frac share of points farthest from their centroid is flagged.
num_outlier = int(outlier_frac*len(distance))

threshold = distance.nlargest(num_outlier).min()

data['anomaly'] = (distance >= threshold).astype(int)

fig, ax = plt.subplots(figsize=(10,6))
a = data.loc[data['anomaly'] == 1, ['temp']]  # anomalous rows only
ax.plot(data.index, data['temp'], color='black', label = 'Normal')
ax.scatter(a.index,a['temp'], color='red', label = 'Anomaly')
plt.legend()
plt.show()

### DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

# Fresh copy of the cleaned series for this detector.
data = orig_data.copy()

# Fit DBSCAN on the raw values; rows it labels -1 are treated as anomalies here.
dbscan = DBSCAN(min_samples=10, eps=0.05)
dbscan.fit(data.values)
labels = dbscan.labels_
outlier_pos = np.flatnonzero(labels == -1)  # positional row indices

a = data.iloc[outlier_pos]

# All readings in black, with the flagged ones drawn over them in red.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data.index, data['temp'], label='Normal', color='black')
ax.scatter(a.index, a['temp'], label='Anomaly', color='red')
ax.legend()
plt.show()