In [None]:
from datetime import datetime, timedelta
from tokenize import group

import polars as pl
from matplotlib import pyplot as plt

# Load the raw readings and derive the columns used by the rest of the notebook.
data = pl.read_csv("../data/data.csv")
date_format = "%Y-%m-%d %H:%M:%S"
data = data.with_columns(
    # Parse the textual timestamp into a Datetime column.
    pl.col("timestamp").str.strptime(pl.Datetime, format=date_format),
    # Motor_State is 0.0 while Motor_current is below 0.05 and 1.0 otherwise.
    # Written as a native expression instead of a per-row Python lambda;
    # "not (x < 0.05)" mirrors the lambda's else-branch exactly.
    (~(pl.col("Motor_current") < 0.05)).cast(pl.Float64).alias("Motor_State"),
)

data.head(5)

In [None]:
# Summary statistics restricted to the Float64 columns.
(
    data
    .select(pl.col(pl.Float64))
    .describe()
)

In [None]:
# (start, end) intervals treated as anomalies when labelling rows.
# NOTE(review): presumably the failure periods reported for the dataset - confirm the source.
anomalys = [
    (datetime(2020, 4, 12, 11, 50, 0), datetime(2020, 4, 12, 23, 30, 0)),
    (datetime(2020, 4, 18, 0, 0, 0), datetime(2020, 4, 18, 23, 59, 0)),
    (datetime(2020, 4, 19, 0, 0, 0), datetime(2020, 4, 19, 1, 30, 0)),
    (datetime(2020, 4, 29, 3, 20, 0), datetime(2020, 4, 29, 4, 0, 0)),
    (datetime(2020, 4, 29, 22, 0, 0), datetime(2020, 4, 29, 22, 20, 0)),
    (datetime(2020, 5, 13, 14, 0, 0), datetime(2020, 5, 13, 23, 59, 0)),
    (datetime(2020, 5, 18, 5, 0, 0), datetime(2020, 5, 18, 5, 30, 0)),
    (datetime(2020, 5, 19, 10, 10, 0), datetime(2020, 5, 19, 11, 0, 0)),
    # NOTE(review): start == end here, and is_anomaly compares with strict < and >,
    # so this entry can never match a row - confirm the intended end time.
    (datetime(2020, 5, 20, 0, 0, 0), datetime(2020, 5, 20, 0, 0, 0)),
    (datetime(2020, 5, 23, 9, 50, 0), datetime(2020, 5, 23, 10, 10, 0)),
    (datetime(2020, 5, 29, 23, 30, 0), datetime(2020, 5, 29, 23, 59, 0)),
    (datetime(2020, 5, 30, 0, 0, 0), datetime(2020, 5, 30, 6, 0, 0)),
    (datetime(2020, 6, 1, 15, 0, 0), datetime(2020, 6, 1, 15, 40, 0)),
    (datetime(2020, 6, 3, 10, 0, 0), datetime(2020, 6, 3, 11, 0, 0)),
    (datetime(2020, 6, 5, 10, 0, 0), datetime(2020, 6, 5, 23, 59, 0)),
    (datetime(2020, 6, 6, 0, 0, 0), datetime(2020, 6, 6, 23, 59, 0)),
    (datetime(2020, 6, 7, 0, 0, 0), datetime(2020, 6, 7, 14, 30, 0)),
    (datetime(2020, 7, 8, 17, 30, 0), datetime(2020, 7, 8, 21, 0, 0)),
    (datetime(2020, 7, 15, 14, 30, 0), datetime(2020, 7, 15, 19, 0, 0)),
    (datetime(2020, 7, 17, 4, 30, 0), datetime(2020, 7, 17, 5, 30, 0)),
]

# "raros" = odd-looking intervals.
raros = [
    # DOES NOT APPEAR AS AN ANOMALY
    # NOTE(review): presumably means these are absent from the official anomaly list - confirm.
    (datetime(2020, 3, 6, 21, 42, 15), datetime(2020, 3, 6, 23, 14, 16)),
    (datetime(2020, 3, 11, 5, 15, 10), datetime(2020, 3, 11, 6, 25, 33)),
    (datetime(2020, 3, 12, 0, 15, 56), datetime(2020, 3, 12, 11, 59, 24)),
    (datetime(2020, 3, 26, 4, 0, 20), datetime(2020, 3, 26, 5, 20, 57)),
    (datetime(2020, 3, 27, 7, 12, 0), datetime(2020, 3, 27, 12, 1, 18)),
    (datetime(2020, 4, 17, 8, 50, 28), datetime(2020, 4, 17, 23, 59, 36)),
    (datetime(2020, 4, 25, 0, 7, 15), datetime(2020, 4, 25, 1, 10, 51)),
    (datetime(2020, 5, 19, 1, 35, 28), datetime(2020, 5, 19, 2, 40, 33)),
    (datetime(2020, 6, 12, 1, 41, 7), datetime(2020, 6, 12, 17, 6, 6)),
    (datetime(2020, 7, 21, 13, 32, 48), datetime(2020, 7, 21, 22, 3, 16)),
    (datetime(2020, 7, 22, 6, 40, 46), datetime(2020, 7, 22, 13, 10, 37)),
    (datetime(2020, 7, 31, 0, 57, 33), datetime(2020, 7, 31, 2, 9, 4))
]

# The odd intervals are appended in place, so from here on they are labelled as
# anomalies too; indices 0-19 of anomalys remain the entries listed above.
anomalys.extend(raros)

In [None]:
def is_anomaly(instance_date, anomalys_intevals: list[tuple[datetime, datetime]]):
    """
    Tell whether a moment falls strictly inside any of the given intervals.

    :param instance_date: moment to classify; it is compared directly against the interval bounds
    :param anomalys_intevals: sequence of (start, end) pairs
    :return: True when start < instance_date < end holds for at least one pair, False otherwise.
        Both bounds are exclusive and an empty sequence yields False.
    """
    # The intervals are read from the argument. The previous body indexed the
    # module-level list `anomalys`, silently ignoring whatever the caller passed.
    return any(
        interval[0] < instance_date < interval[1]
        for interval in anomalys_intevals
    )


# Flag every row whose timestamp lies inside one of the labelled intervals.
# return_dtype pins the new column to Boolean instead of leaving the dtype to be
# inferred from the first value the Python function returns.
data = data.with_columns(
    pl.col("timestamp")
    .map_elements(lambda x: is_anomaly(x, anomalys), return_dtype=pl.Boolean)
    .alias("is_anomaly")
)

In [None]:
# Keep the anomaly flag of the following row next to each row and add a row index.
# next_is_anomaly is retained only so the frame keeps the same columns as before.
data_no_anomaly = data.with_columns(
    pl.col("is_anomaly").shift(-1).alias("next_is_anomaly"),
).with_row_count("Id")

# Mark the FIRST row of every run of equal is_anomaly values. The previous version
# marked the last row of a run (comparison with the next row), so after the cumulative
# sum that row was counted in the following run: the last anomalous row of each
# interval was kept and the last normal row before each interval was dropped.
data_no_anomaly = data_no_anomaly.with_columns(
    (pl.col("is_anomaly") != pl.col("is_anomaly").shift(1)).fill_null(False).alias("group")
)

# Running count of run starts = id of the run the row belongs to.
data_no_anomaly = data_no_anomaly.with_columns(
    pl.col("group").cumsum().name.keep()
)

# Keep only non-anomalous rows. With correct run ids this equals keeping the even
# runs whenever the data starts outside an anomaly, and it stays right if it does not.
data_no_anomaly = data_no_anomaly.filter(~pl.col("is_anomaly"))

# ---- Split each anomaly-free run again wherever consecutive rows are more than
# one minute apart, so that every resulting "group" is contiguous in time.
data_no_jumps = (
    data_no_anomaly
    # Previous row's timestamp and run id, aligned with the current row.
    .with_columns(
        pl.col("timestamp").shift(1).alias("last_date"),
        pl.col("group").shift(1).alias("last_group"),
    )
    # Only rows whose predecessor is in the same run are kept (this drops the
    # first row of every run, whose predecessor belongs elsewhere or is missing).
    .filter(pl.col("last_group") == pl.col("group"))
    # Time elapsed since the previous row.
    .with_columns((pl.col("timestamp") - pl.col("last_date")).alias("tiempo_salto"))
    # A "salto" (jump) is a gap longer than one minute.
    .with_columns((pl.col("tiempo_salto") > timedelta(minutes=1)).alias("salto"))
    # Number of jumps seen up to and including each row.
    .with_columns(pl.col("salto").cumsum().alias("salto_cumsum"))
    # Adding the jump count to the run id gives a new id after every jump.
    .with_columns((pl.col("group") + pl.col("salto_cumsum")).alias("group"))
)

# ------ Motor cycles: keep the rows where the motor is off and the next row has it
# on, i.e. the instants right before each start.
# NOTE(review): shift(-1) is taken over the whole frame, so the "next" row of the last
# row of a group is the first row of the following group - confirm this is acceptable.
data_cicles = data_no_jumps.with_columns(
    pl.col("Motor_State").shift(-1).alias("next_motor_state"),
).filter(
    (pl.col("Motor_State") == 0) & (pl.col("next_motor_state") == 1)
)

# Previous start instant and its group, aligned with the current start.
data_cicles = data_cicles.with_columns(
    pl.col("timestamp").shift(1).alias("last_moment"),
    pl.col("group").shift(1).alias("last_group"),
)

# A cycle is measured only between two starts of the same contiguous group.
data_cicles = data_cicles.filter(pl.col("group") == pl.col("last_group")).with_columns(
    (pl.col("timestamp") - pl.col("last_moment")).alias("tiempo_ciclo"))

# Despite the name ("tiempo medio"), this is the MEDIAN cycle duration.
# The unused "longest cycle" filter and the commented-out inspection code that used
# to sit here were removed: their results were never displayed or stored.
tiempo_medio_ciclos = data_cicles.select("tiempo_ciclo").median().row(0)[0]
tiempo_medio_ciclos


In [None]:
# Inspect the anomaly-free rows recorded in a window on 2020-04-29.
data_no_anomaly.filter(
    pl.col("timestamp").is_between(
        datetime(2020, 4, 29, 3, 25, 19),
        datetime(2020, 4, 29, 4, 32, 13),
    )
)

In [None]:
import seaborn as sns
import plotly.express as px


def plot_ts(data_to_plot: pl.DataFrame, columns: "str | list[str]", time_window: "tuple[datetime, datetime]",
            title="Series temporales"):
    """
    Plot one or more time series over a time range as an interactive Plotly figure.

    The series are drawn on the same axes, and the figure is shown immediately.

    Parameters
    ---
        :param data_to_plot: frame to plot; it must contain a "timestamp" column
        :param columns: name, or list of names, of the columns to draw on the y axis
        :param time_window: (start, end) pair; only rows whose "timestamp" passes
            is_between(start, end) are plotted
        :param title: title of the figure

    Returns
    ---
        None. The figure is displayed with fig.show() and not returned.
    """

    start_time, end_time = time_window
    data_filtered = data_to_plot.filter(pl.col('timestamp').is_between(start_time, end_time))

    # Plotly gives the interactive (zoomable) figure. The earlier plt.figure(1) call
    # was dropped: it only opened an empty matplotlib figure unrelated to this plot.
    fig = px.line(data_filtered, x="timestamp", y=columns, title=title)
    fig.show()


# Plot the main signals over the second labelled interval (anomalys[1]).
plot_ts(
    data,
    columns=["TP3", "Motor_current", "MPG", "COMP", "DV_eletric"],
    time_window=anomalys[1],
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Center every Float64 column: with_mean=True subtracts the column mean and
# with_std=False leaves the spread untouched.
float_frame = data.select(pl.col(pl.Float64))
col_names = float_frame.columns

Scaler = StandardScaler(with_mean=True, with_std=False)
Scaler.fit(X=float_frame)

# The transform returns a plain array, so the column names are restored by hand.
data_scaled = pl.DataFrame(Scaler.transform(float_frame))
data_scaled.columns = col_names

# Put the untouched (non-float) columns first and the centered ones after them.
data_scaled = data.select(pl.exclude(col_names)).hstack(data_scaled)

In [None]:

import datetime

data_day_one = data_scaled.filter(pl.col("timestamp") < datetime.datetime(2020, 2, 2, 0, 0, 0))
plot_ts(data_day_one, columns=["TP3", "Motor_current", "MPG", "COMP", "DV_eletric"], time_window=data_day_one.shape)


In [None]:

for a in raros[0:2]:
    data_anomaly = data.filter(pl.col("timestamp").is_between(
        a[0],
        a[1],
    ))

    size = min(data_anomaly.shape[0], 1024)

    plot_ts(data, columns=["TP3", "Motor_current", "MPG", "COMP", "DV_eletric"],
            time_window=a)


In [None]:
data_cicles.select("timestamp", "Motor_current", "Motor_State", "next_motor_state")

In [None]:
data_anomaly = data_scaled.filter(
    pl.col("timestamp").is_between(datetime.datetime(2020, 6, 7, 00, 00, 00), datetime.datetime(2020, 6, 7, 14, 30, 0)))
plot_ts(data_anomaly, columns=["TP3", "Motor_current", "MPG", "COMP", "DV_eletric"], time_window=data_anomaly.shape)


In [None]:
# Width of one window: twice the median cycle duration, in seconds.
window_seconds = tiempo_medio_ciclos.total_seconds() * 2

df = (
    data_scaled
    # Time elapsed since the first row (null on the first row itself).
    .with_columns(pl.col("timestamp").diff().cumsum().alias("diff"))
    # Index of the fixed-width window each row falls into.
    .with_columns((pl.col("diff").dt.seconds() // window_seconds).alias("id"))
    # Integer ids; the first row has no elapsed time and is assigned to window 0.
    .with_columns(pl.coalesce(pl.col("id").cast(pl.UInt64), 0).alias("id"))
)

# NOTE(review): start / end are not used by the plot below.
start = 50000
end = start + 5000

# Motor current of the anomalous rows, one colour per window id.
px.line(df.filter(pl.col("is_anomaly")), x="timestamp", y="Motor_current", color="id")

In [None]:
# Signals summarised per window.
feature_columns = ['TP2',
                   'TP3',
                   'H1',
                   'DV_pressure',
                   'Reservoirs',
                   'Oil_temperature',
                   'Motor_current',
                   'COMP',
                   'DV_eletric',
                   'Towers',
                   'MPG',
                   'LPS',
                   'Pressure_switch',
                   'Oil_level',
                   'Caudal_impulses',
                   'Motor_State']

# Four statistics per signal. The *_max, *_min and *_median columns used to be
# computed with .mean() as well, so all four columns of a signal were identical.
aggregations = []
for c in feature_columns:
    aggregations.extend([
        pl.col(c).mean().alias(f"{c}_mean"),
        pl.col(c).max().alias(f"{c}_max"),
        pl.col(c).min().alias(f"{c}_min"),
        pl.col(c).median().alias(f"{c}_median"),
    ])

# One row per window id, with every statistic computed in a single aggregation
# (same columns, in the same order, as joining one aggregate per signal).
df_X = df.group_by("id").agg(aggregations)

# A window is anomalous as soon as it contains one anomalous row; windows without
# any anomalous row get no match in the join and are filled afterwards.
df_X = df_X.join(df.filter(pl.col("is_anomaly") == True).group_by("id").first().select("id","is_anomaly"),how="left",on="id")

# NOTE(review): this fills a Boolean column with the integer 0 - confirm the
# resulting dtype is the one expected downstream.
df_X = df_X.with_columns(pl.col("is_anomaly").fill_null(0))
df_X.write_csv("datos_procesados.csv")

In [None]:
#Separamos anomalias de no anomalias
df_no_anomaly = df_X.filter(pl.col("is_anomaly") == False)
df_anomaly = df_X.filter(pl.col("is_anomaly") == True)

#Agrupamos las que pertenecen a una misma anomalia
df_anomaly = df_anomaly.sort("id").with_columns(pl.col("id").shift(1).fill_null(0).alias("last_id"))

df_anomaly = df_anomaly.with_columns(((pl.col("id")-pl.col("last_id")) > 2).alias("group"))

df_anomaly = df_anomaly.with_columns(pl.col("group").cumsum())
df_anomaly = df_anomaly.join(df_anomaly.select("id","group").group_by("group").count(),
                             on="group",how="inner")

#Creamos grupos de los grupos con un mismo tamaño para un k prefijado
k = 3
test_anomaly_size = int(df_anomaly.shape[0] / k)
df_groups = df_anomaly.group_by("group").first().select("group","count")
df_groups = df_groups.sample(fraction=1,seed=0)

groups = []
for i in df_groups.to_dicts():
    index = 0
    flag_assigned = False
    while index < len(groups) and not flag_assigned:
        if groups[index]["count"] + i["count"] < test_anomaly_size:
            groups[index]["group"].append(i["group"])
            groups[index]["count"] += i["count"]
            flag_assigned = True
            index = 0
        else:
            index += 1
    if not flag_assigned and len(groups) >= index:
        if len(groups) < k:
            groups.append({"count":i["count"],"group":[i["group"]]})
        else:
            min_size = min([a["count"] for a in groups])
            for index in range(0,k):
                if groups[index]["count"] == min_size:
                    groups[index]["group"].append(i["group"])
                    groups[index]["count"] += i["count"]
                    flag_assigned = True
                    index = 0
                    
groups

In [None]:
#Repetimos el proceso para las no anomalias
df_no_anomaly = df_no_anomaly.sort("id").with_columns(pl.col("id").shift(1).fill_null(0).alias("last_id"))

df_no_anomaly = df_no_anomaly.with_columns(((pl.col("id")-pl.col("last_id")) > 2).alias("group"))

df_no_anomaly = df_no_anomaly.with_columns(pl.col("group").cumsum())
df_no_anomaly = df_no_anomaly.join(df_no_anomaly.select("id","group").group_by("group").count(),
                             on="group",how="inner")

#Creamos grupos de los grupos con un mismo tamaño para un k prefijado
k = 3
test_anomaly_size = int(df_no_anomaly.shape[0] / k)
df_groups_no_anomaly = df_no_anomaly.group_by("group").first().select("group","count")
df_groups_no_anomaly = df_groups_no_anomaly.sample(fraction=1,seed=0)

groups_no_anomaly = []
for i in df_groups_no_anomaly.to_dicts():
    index = 0
    flag_assigned = False
    while index < len(groups_no_anomaly) and not flag_assigned:
        if groups_no_anomaly[index]["count"] + i["count"] < test_anomaly_size:
            groups_no_anomaly[index]["group"].append(i["group"])
            groups_no_anomaly[index]["count"] += i["count"]
            flag_assigned = True
            index = 0
        else:
            index += 1
    if not flag_assigned and len(groups_no_anomaly) >= index:
        if len(groups_no_anomaly) < k:
            groups_no_anomaly.append({"count":i["count"],"group":[i["group"]]})
        else:
            min_size = min([a["count"] for a in groups_no_anomaly])
            for index in range(0,k):
                if groups_no_anomaly[index]["count"] == min_size:
                    groups_no_anomaly[index]["group"].append(i["group"])
                    groups_no_anomaly[index]["count"] += i["count"]
                    flag_assigned = True
                    index = 0


In [None]:
# Start from an empty "split" column and fill it one fold at a time.
df_X = df_X.with_columns(pl.lit(None).alias("split"))

# Fold number i takes the i-th anomalous bucket and the i-th normal bucket.
# NOTE(review): zip stops at the shorter list, so windows of unpaired buckets keep a
# null split - confirm both lists always have the same length.
for split, (anomaly_bucket, normal_bucket) in enumerate(zip(groups, groups_no_anomaly)):
    split_ids = (
        df_anomaly.filter(pl.col("group").is_in(anomaly_bucket["group"]))["id"].to_list()
        + df_no_anomaly.filter(pl.col("group").is_in(normal_bucket["group"]))["id"].to_list()
    )
    # The debug print of split_ids[-1] was removed: it raised IndexError whenever a
    # pair of buckets selected no window.
    df_X = df_X.with_columns(
        pl.when(pl.col("id").is_in(split_ids)).then(split).otherwise(pl.col("split")).alias("split")
    )

In [None]:
# Number of windows per (split, is_anomaly) pair, to check how balanced the folds are.
(
    df_X
    .group_by("split", "is_anomaly")
    .count()
    .sort("split", "is_anomaly")
)

In [179]:
# Write the per-window table to the same file as earlier in the notebook,
# overwriting it with the current contents of df_X.
df_X.write_csv("datos_procesados.csv")