# IMPORT LIB

In [1]:
import import_ipynb
from AE_LIB import *

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)


import seaborn as sns

import matplotlib.style as mplstyle  # ask Kudret for optimizations
mplstyle.use('fast')

ModuleNotFoundError: No module named 'import_ipynb'

# EXTRA FUNCS

In [None]:
def get_aggregated_df(anomaly_df,time_column,error_column_names,statiscital_func_list):
    """
    Summarise the error columns of one cycle with a list of statistical functions.

    Rows that contain a missing value in any of the error columns are dropped
    before the statistics are computed.

    Arguments:
        anomaly_df: Dataframe that contains the rows of one cycle. It must also
            contain a "cycle_num" column and the column named by time_column.
        time_column: Column whose first and last values are reported as the
            "start" and "end" of the cycle (the rows are used in the order given,
            so the caller is expected to sort by this column beforehand).
        error_column_names: List of columns to which the statistical functions are applied.
        statiscital_func_list: List of aggregations accepted by pandas
            DataFrame.aggregate, ex: "sum","mean","median","std","min","max",
            "var","sem","skew","kurt". Callables can be passed as well.

    Returns:
        A new dataframe with one row per statistical function (the index holds the
        function names) plus an extra "max_min_diff" row (max - min), one column per
        error column, and the columns "cycle_num", "start" and "end" repeated on
        every row.

    Raises:
        KeyError: if a requested column is missing from anomaly_df.
        IndexError: if anomaly_df has no rows.
    """
    error_values=anomaly_df[error_column_names].dropna(axis=0, how='any')
    aggregated_df=error_values.aggregate(statiscital_func_list)
    # Computed straight from the data so the row is available even when "max"
    # and/or "min" are not part of statiscital_func_list.
    aggregated_df.loc["max_min_diff"]=error_values.max()-error_values.min()
    aggregated_df["cycle_num"]=anomaly_df["cycle_num"].iloc[0]
    aggregated_df["start"]= anomaly_df[time_column].iloc[0]
    aggregated_df["end"]= anomaly_df[time_column].iloc[-1]
    return aggregated_df

In [None]:
def save_figure(bucket,key,save_format,figure=None):
    """
    Render a matplotlib figure in memory and upload it to an S3 bucket.

    Arguments:
        bucket: Name of S3 bucket
        key: key in S3
        save_format: File type of the saved figure. i.e pdf...
        figure: Figure to save. When omitted, the notebook-level variable `fig`
            is used, which keeps existing three-argument calls working.

    Returns:
       Does not have a return value, saves figure in the specified place

    Raises:
        NameError: if no figure is passed and no global `fig` exists.
    """
    import mimetypes  # local import: only needed to pick the upload content type

    figure_to_save = fig if figure is None else figure

    # Use the registered MIME type of the format (e.g. application/pdf for pdf,
    # image/svg+xml for svg); fall back to the former "image/<format>" value
    # for formats the registry does not know.
    content_type = mimetypes.guess_type("figure.{}".format(save_format))[0]
    if content_type is None:
        content_type = 'image/{}'.format(save_format)

    print(bucket,key)
    img_data = io.BytesIO()
    figure_to_save.savefig(img_data, format=save_format)
    img_data.seek(0)  # rewind so the upload reads the buffer from its beginning
    # NOTE(review): `bt` is presumably the boto3 alias exported by AE_LIB - confirm.
    s3_bucket = bt.resource('s3').Bucket(bucket)
    s3_bucket.put_object(Body=img_data, ContentType=content_type, Key=key)

# 1) READ MODEL INFORMATION TABLE

In [None]:
# Load the table that lists the available models (one row per model) from a
# parquet file located next to this notebook, then display it.
all_model_info = pd.read_parquet(
    "all_models_info_table_v2.parquet"
)
all_model_info

# 2) CHOOSE

### 2.1) CHOOSE ONE ROW FROM MODEL INFO TABLE 

In [None]:
# Filter template: set a value for every feature the wanted model must match
# and leave the others as None (None means "do not filter on this column").
wanted_features_dictt={}
dictt={
    "cust":"nurol",
    "country":None,
    "location":None,
    "asset":None,
    "subasset":None,
    "device_type":None,
    "ID":None,
    "window_size":128,
    "sample_time":None,
    "rolling_window_time":None,
    "bottleneck":None,
    "wideneck":None,
    "scaler_type":None,
    "layers_type":None,
    "column_num":None,
    "loss_type":None,
    "epoch_num":None,
    "total_process_num":None,
    "date_of_save":None,
    "train_data_note":None,
    "tools_path":None,
    "model_name":None,
    "scaler_name":None,
    "is_live":None,
    "author":None
    }
for a,b in dictt.items():
    if b is not None:
        print(a,b)
        wanted_features_dictt[a]=b

# Start from the full table so the result is defined even when no filter is
# set, then keep only the rows that match every requested feature.
model_info_with_wanted_features=all_model_info
for feature_key,feature_value in wanted_features_dictt.items():
    model_info_with_wanted_features=model_info_with_wanted_features.loc[
        model_info_with_wanted_features[feature_key]==feature_value
    ]

model_info_with_wanted_features

In [None]:
# Keep only the first matching model, still as a one-row dataframe (not a Series).
one_model_info_df = model_info_with_wanted_features.head(1)
one_model_info_df

# 3) GET OPTIONS OF CHOSEN MODEL

In [None]:
# Turn the selected one-row model table into a dictionary of training options.
# NOTE(review): get_train_info_dict / get_cust_features presumably come from AE_LIB - confirm.
train_info_dict = get_train_info_dict(one_model_info_df)

# Customer related settings, looked up from the read path stored in the train info.
cust_features_dict = get_cust_features(train_info_dict["read_path"])

# One error column per trained column: the template is filled with the column
# name and the loss type.
error_column_names = [
    train_info_dict["error_column_template"].format(train_col, train_info_dict["loss_type"])
    for train_col in train_info_dict["train_cols"]
]

# One reconstructed column per trained column: the template is filled with the
# column name only.
reconstructed_column_names = [
    train_info_dict["reconstructed_column_template"].format(train_col)
    for train_col in train_info_dict["train_cols"]
]

# 4) PARAMETERS

In [None]:
# S3 prefix the model outputs are read from.
# Alternative prefix kept for reference:
#   s3://test-data-eng/anomaly_detection/autoencoder/outputs/
read_path_prefix = "s3://test-data-eng/anomaly_detection/autoencoder/outputs_teoman/"

# Customer feature columns ("device_type" and "ID" are deliberately left out).
cust_features_column_names = [
    "cust",
    "country",
    "location",
    "asset",
    "subasset",
]

# Statistics computed for every error column of every cycle.
statiscital_func_list = [
    "sum",
    "mean",
    "median",
    "std",
    "min",
    "max",
    "var",
    "sem",
    "skew",
    "kurt",
]

# Switches telling which per-file results are collected into lists:
add_aggregated_df_to_list = True   # the aggregated summary of each cycle
add_anomaly_df_to_list = False     # the full (non-aggregated) anomaly dataframe of each cycle


# 5) CREATE STATISTICAL DF

In [None]:
anomaly_dfs=[]
aggregated_dfs=[]
failed_file_paths=[]  # files that were read but could not be sorted/aggregated


read_path=read_path_prefix+cust_features_dict["cust_path"]+train_info_dict["model_name"]
file_paths = wr.s3.list_objects(read_path)

# Each parquet file holds a single cycle, i.e. one shift, one day, one month, one operation cycle
for file_path in file_paths:
    anomaly_df=pd.read_parquet(file_path)
    try:
        anomaly_df=anomaly_df.sort_values(train_info_dict["time_column"]).reset_index(drop=True)
        aggregated_df=get_aggregated_df(anomaly_df,train_info_dict["time_column"],error_column_names,statiscital_func_list)
    except Exception as error:
        # Best effort: a broken cycle must not stop the whole run, but it is
        # reported instead of being dropped silently.
        failed_file_paths.append(file_path)
        print("Skipping {}: {!r}".format(file_path,error))
        continue

    if add_aggregated_df_to_list:
        aggregated_dfs.append(aggregated_df)
    if add_anomaly_df_to_list:
        anomaly_dfs.append(anomaly_df)

# concatenate the multiple dataframes
if add_aggregated_df_to_list:
    if not aggregated_dfs:
        raise ValueError("No cycle could be aggregated from {} ({} file(s) listed, {} failed)".format(read_path,len(file_paths),len(failed_file_paths)))
    aggregated_df=pd.concat(aggregated_dfs)
if add_anomaly_df_to_list:
    if not anomaly_dfs:
        raise ValueError("No anomaly dataframe could be collected from {} ({} file(s) listed, {} failed)".format(read_path,len(file_paths),len(failed_file_paths)))
    anomaly_df=pd.concat(anomaly_dfs)


In [None]:
# Stack the per-cycle summaries on top of each other: the index keeps the
# statistic names, so every statistic appears once per cycle.
cycles_statistics = pd.concat(aggregated_dfs, axis=0)

In [None]:
# Display the stacked per-cycle statistics for a visual check.
cycles_statistics

In [None]:
# absurd data may generate multiple of the same rows
# The index (the statistic name) is part of the comparison: two different
# statistics of one cycle can hold identical values (e.g. "std" and "var" both
# 0 for a constant signal) and must not be treated as duplicates of each other.
duplicated_rows=cycles_statistics.reset_index().duplicated().to_numpy()
cycles_statistics=cycles_statistics.loc[~duplicated_rows]

In [None]:
# The original cut dropped the first 528 rows, i.e. 48 cycles x 11 statistic
# rows (10 statistical functions + "max_min_diff"). Cutting by cycle keeps the
# same rows when every cycle has all 11 rows and never splits a cycle otherwise.
# NOTE: re-running this cell drops another 48 cycles; rebuild cycles_statistics first.
N_INITIAL_CYCLES_TO_SKIP=48

# Stable sort: rows sharing a start keep their relative (statistic) order.
cycles_statistics=cycles_statistics.sort_values("start",kind="stable")

cycle_starts=cycles_statistics["start"].drop_duplicates()
kept_cycle_starts=cycle_starts.iloc[N_INITIAL_CYCLES_TO_SKIP:]
cycles_statistics=cycles_statistics.loc[cycles_statistics["start"].isin(kept_cycle_starts)]

In [None]:
# "max_min_diff" is added as an extra row by get_aggregated_df, so it is put in
# front of the list of statistics to plot. The guard keeps the cell idempotent:
# re-running it must not add the entry (and therefore a plot column) twice.
if "max_min_diff" not in statiscital_func_list:
    statiscital_func_list=["max_min_diff"]+statiscital_func_list

# 6) READ MAINTENANCE DF AND RECIPE DF (OPTIONAL, NUROL-SPECIFIC)

In [None]:
# Maintenance records of the fct2 asset. "Fiili btş.trm." is the date drawn
# later as "maintenance finish date".
# NOTE(review): "Giriş tarihi" presumably is the entry date and "Kısa metin" a
# short description - confirm against the source table.
maintenance_info = (
    pd.read_parquet("s3://test-data-eng/anomaly_detection/autoencoder/sources/preprocessed/nurol/tr/golbasi/fct2/fct2_maintenance_infos.parquet")
    # keep the records whose entry date is not later than their finish date
    .loc[lambda records: records["Giriş tarihi"] <= records["Fiili btş.trm."]]
    # one row per finish date, keeping the smallest short text of that date
    .groupby("Fiili btş.trm.")
    .aggregate({"Kısa metin": "min"})
    .reset_index()
)
maintenance_info

In [None]:
#project name and recipe used
used_recipesand_projects=pd.read_parquet("s3://test-data-eng/anomaly_detection/autoencoder/sources/preprocessed/nurol/tr/golbasi/fct2/fct2-used_recipes.parquet")


def _first_rows_of_runs(frame, column):
    """Return the rows of `frame` where `column` differs from the previous row.

    The first row starts the first run and is therefore returned too (unless
    its value is missing). Consecutive missing values form one run, so a
    missing value following another missing value is not a transition.
    """
    values = frame[column]
    previous = values.shift()
    # Comparisons against a missing value may yield <NA> on nullable dtypes:
    # count them as a change and settle the missing/missing case explicitly.
    changed = values.ne(previous).fillna(True).astype(bool)
    both_missing = values.isna() & previous.isna()
    return frame.loc[changed & ~both_missing]


# Rows without a recipe are ignored for the recipe transitions. The former
# used_recipesand_projects["used_recipe"].dropna(inplace=True) only touched a
# temporary column and never removed those rows from the dataframe.
transitions_recipes=_first_rows_of_runs(used_recipesand_projects.dropna(subset=["used_recipe"]),"used_recipe")
transitions_projects=_first_rows_of_runs(used_recipesand_projects,"project")
transitions_projects

# 7) PRINT PLOTS 

In [None]:
# Where and in which format the figure is stored in S3.
plot_save_format="pdf"
plot_save_bucket="test-data-eng"
plot_save_path_prefix="anomaly_detection/autoencoder/vizualization_plots/"+cust_features_dict["cust_path"]+train_info_dict["model_name"]+"/"
# The plot is named after the local time at which this cell is run.
plot_name= time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())



# Size (in inches) of ONE subplot; the figure holds one subplot per
# (error column, statistic) pair.
figsize_x=20
figsize_y=12
# Upper y-limit of each subplot: this quantile of the plotted values.
plot_quantile_threshold=0.9

plt.rcParams["figure.figsize"] = (figsize_x,figsize_y)
# squeeze=False keeps `ax` a 2-D array (rows: error columns, columns:
# statistics) even when there is a single row and/or a single column, so
# ax[j,i] is always valid.
fig, ax = plt.subplots(len(error_column_names),len(statiscital_func_list),figsize=(figsize_x*len(statiscital_func_list), figsize_y*len(error_column_names)),squeeze=False)

In [None]:
# create a plot in the local folder and save to s3
import numpy as np


def _draw_event_lines(axis, events, date_column, label, color):
    """Draw a faint vertical line on `axis` for every date in events[date_column].

    Only the first line carries `label`, so the legend shows a single entry.
    Nothing is drawn when `events` is None or has no `date_column` column.
    """
    if events is None or date_column not in getattr(events, "columns", ()):
        return
    for position, event_date in enumerate(events[date_column]):
        line_kwargs = {"label": label} if position == 0 else {}
        axis.axvline(x=event_date, color=color, alpha=0.2, **line_kwargs)


# Fixed dates highlighted in red on every subplot.
REFERENCE_DATES=("2020-11-08","2023-02-22")

colors =plt.rcParams['axes.prop_cycle'].by_key()['color']

# plt.subplots may return a single Axes, a 1-D array or a 2-D array; always
# work with a (n_error_columns, n_statistics) grid so that ax[j,i] is valid.
ax=np.asarray(ax).reshape(len(error_column_names),len(statiscital_func_list))

# The maintenance / recipe / project tables are optional (customer specific):
# when their cells were not run, the matching lines are simply left out.
maintenance_events=globals().get("maintenance_info")
recipe_events=globals().get("transitions_recipes")
project_events=globals().get("transitions_projects")

for i,statistical_func in enumerate(statiscital_func_list):
    # rows of this statistic, one per cycle
    stat_rows=cycles_statistics.loc[cycles_statistics.index==statistical_func]
    for j,col in enumerate(error_column_names):
        maes=stat_rows[col]
        # x values are taken from the same rows as the y values, so both
        # always have the same length and order
        cycle_starts=stat_rows["start"]
        ax[j,i].bar(cycle_starts,maes, color = colors[1])
        ax[j,i].plot(cycle_starts,maes, color = colors[0])
        # clip the y-axis at the chosen quantile so outliers do not flatten the rest
        ax[j,i].set_ylim(maes.min(),maes.quantile(plot_quantile_threshold))

        _draw_event_lines(ax[j,i],maintenance_events,"Fiili btş.trm.","maintenance finish date",colors[2])
        _draw_event_lines(ax[j,i],recipe_events,"start","change date of recipe",colors[3])
        _draw_event_lines(ax[j,i],project_events,"start","change date of project",colors[6])

        for reference_date in REFERENCE_DATES:
            ax[j,i].axvline(x = pd.to_datetime(reference_date),  color = "red",alpha=0.8,label=reference_date)

        ax[j,i].set_title(col+"--"+statistical_func)
        ax[j,i].set_xlabel('time')
        ax[j,i].set_ylabel('mse_error')
        ax[j,i].legend()

# keep a local copy and upload the same figure to S3
fig.savefig( train_info_dict["model_name"]+"_"+plot_name+"."+plot_save_format)
save_figure(plot_save_bucket,plot_save_path_prefix+plot_name+"."+plot_save_format,plot_save_format)