In [51]:
import plotly.express as px
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go

#import csv
pd.set_option('display.max_rows', None)  # Display all rows


In [52]:
#if not os.path.exists("images"):
#    os.mkdir("images")

dataset = "rfid"

def process_file(file_number, date):
    folder_path = dataset + '_' + date + '/'
    file_path = os.path.join(folder_path, f'rfid_{date}_{file_number}.csv')
    if os.path.exists(file_path):
        #print(file_path)
        if file_number == "final":
            #file_path = os.path.join("rfid_fabrica", f'rfid_final.csv')
            df = pd.read_csv(file_path, parse_dates=['Stamp'])
            if df.shape[1] == 6:
                df = df.drop(df.columns[0], axis=1)
                df = df.drop(df.columns[1], axis=1)
                df = df.drop(df.columns[2], axis=1)


            if df.shape[1] == 4:
                df = df.drop(df.columns[3], axis=1)
                df = df.drop(df.columns[1], axis=1)
            df.to_csv(file_path, index = False)

        else:
            df = pd.read_csv(file_path, parse_dates=['ts'])

            # Remove first empty column if it exists
            if df.shape[1] == 3:
                df = df.drop(df.columns[0], axis=1)
            # Remove entries with 'gabarit' equal to 0
            df= df[df['gabarit'] != 0]

            # Write the updated DataFrame back to the CSV file
            df.to_csv(file_path, index = False)
            # Reset the index of the DataFrame
            df = df.reset_index(drop = True) 
    else:
        return pd.DataFrame

    return df

day = "10"
month = "10"
date = day + "_" + month #change date accordingly
date_year = "2023-" + month + "-" + day

df1 = process_file("1",date)
df2 = process_file("2",date)
df3 = process_file("3",date)
df4 = process_file("final",date)
df4['Model'] = df4['Model'].replace({'SPT130023W': 'Y', 'SPT120018W': 'X', 'SPT140034W': 'Z'})




In [53]:
def remove_top_n_outliers(df, column_name, n):
    # Calculate the z-score for the specified column
    z_scores = (df[column_name] - df[column_name].mean()) / df[column_name].std()

    df_cleaned = df[z_scores < z_scores.nlargest(n).iloc[-1]]

    return df_cleaned

df4['Time_Diff'] = df4['Stamp'].diff().dt.total_seconds()
df4 = remove_top_n_outliers(df4, 'Time_Diff', 1) #lunch break
df4.loc[:,'Time_Diff'] = round(df4['Time_Diff']/60,2)

In [54]:
# calculates time and writes it in dataframe
def calc_time_in_station(df, df_in, index_i, index_f, cnt):
    if index_f - index_i == 0:
        time_in_station = 1
    else: 
        time_in_station = ((df.loc[index_f, 'ts'] - df.loc[index_i, 'ts']).total_seconds() + 1)

    df_in.loc[cnt,'ts'] = df.loc[index_i, 'ts']
    df_in.loc[cnt,'gabarit'] = df.loc[index_i, 'gabarit']
    df_in.loc[cnt,'time_in_station'] = time_in_station

    cnt += 1

    return df_in, cnt


def time_in_station_fun(df):
    """
    Calculates and records the time spent in a station for each occurrence of a 'gabarit' in a DataFrame in seconds.

    Parameters:
        - df: DataFrame containing timestamps and 'gabarit' values.

    Returns:
        DataFrame with an added 'time_in_station' column representing the time
        spent in the station for each occurrence (in seconds).
    """
    
    curr_gabarit = 0
    index_i = 0  # Initialize index_i outside the loop
    index_f = 0  # Initialize index_f outside the loop
    cnt = 0
    df_in = pd.DataFrame(columns=['ts', 'gabarit', 'time_in_station'])
    #df_in['ts'] = pd.to_datetime(df['ts'])

    for i in range(df.shape[0]): #0 to 129
        if i != 0: # not first index
            if ((df.loc[i, 'gabarit'] != curr_gabarit)):
                # routine to present final result
                df_in, cnt = calc_time_in_station(df, df_in, index_i, index_f, cnt)
            
          
            # new gabarit is found, i = f
            if df.loc[i, 'gabarit'] != df.loc[i-1, 'gabarit']:
                curr_gabarit = df.loc[i, 'gabarit']
                index_i = i # starts the interval
                index_f = i
        else: # for first entry, i = f
            curr_gabarit = df.loc[i, 'gabarit']
            index_i = i
            index_f = i

        
        if i != df.shape[0] - 1: # not last index 
            if df.loc[i, 'gabarit'] == df.loc[i+1, 'gabarit']:
                index_f = i+1
        else:
            index_f = i           
            df_in, cnt = calc_time_in_station(df, df_in, index_i, index_f, cnt)

    return df_in


def plot_histogram(df, station,color):
    df = df[df['time_in_station'] != 0]

    df['time_in_station'] = df['time_in_station'] /60
    
    fig2 = px.histogram(df, x='time_in_station', range_x=[0, 2], range_y = [0,25], nbins = 20,
                        labels={'time_in_station': 'Time in Station (minutes)','count': 'Frequency'},
                        title = f'Frequency of gabarit\'s dwell time in station {station}',
                        width = 900, color_discrete_sequence=[color])
    fig2.update_traces(marker=dict(line=dict(width=0.2)))
    fig2.show()

    fig2.write_image("images/histogram_" + dataset + "_" + day + "_" + month + "_" + str(station) + ".pdf")



In [55]:
def remove_outlier_quart(df, value, quantile = 0.1):
    # Calculate the first quartile (Q1) and third quartile (Q3)
    filtered_df = pd.DataFrame
    Q1 = df[value].quantile(quantile)
    Q3 = df[value].quantile(1-quantile)

    # Calculate the Interquartile Range (IQR)
    IQR = Q3 - Q1

    # Define the lower and upper bounds to filter outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    if value == 'time_in_station':
        filtered_df = df[(df[value] <= upper_bound)]
    else:
        filtered_df = df[(df[value] >= lower_bound) & (df[value] <= upper_bound)]

    return(filtered_df)

In [56]:
plot = 1
display_df = 0
df_in_1 = time_in_station_fun(df1)
df_in_2 = time_in_station_fun(df2)
df_in_3 = time_in_station_fun(df3)

#
df_in_1_filtered = remove_outlier_quart(df_in_1,'time_in_station', quantile = 0.05)
df_in_2_filtered = remove_outlier_quart(df_in_2,'time_in_station', quantile = 0.05)
df_in_3_filtered = remove_outlier_quart(df_in_3,'time_in_station', quantile = 0.11)
#df_in_1_filtered = df_in_1_filtered[df_in_1_filtered!= 1]
#df_in_2_filtered = df_in_2_filtered[df_in_2_filtered!= 1]
#df_in_3_filtered = df_in_3_filtered[df_in_3_filtered!= 1]

if display_df:
    display(df_in_1.loc[0:50])
    display(df_in_2.loc[0:50])
    display(df_in_3.loc[0:50])

if plot:
    #plot_histogram(df_in_1,1)
    plot_histogram(df_in_1_filtered,1,'darkblue')
    #plot_histogram(df_in_2, station)
    plot_histogram(df_in_2_filtered,2,'dodgerblue')
    #plot_histogram(df_in_3,3)
    plot_histogram(df_in_3_filtered,3,'lightskyblue')



In [57]:
def time_between_station(df1, df2, station1, station2, max_time):
    """
    Finds and records the time difference between occurrences of the same 'gabarit' in two DataFrames,
    df1 and df2, within a specified time window.

    Parameters:
        - df1: DataFrame containing timestamps and 'gabarit' values for station1.
        - df2: DataFrame containing timestamps and 'gabarit' values for station2.
        - station1: Identifier for the first station.
        - station2: Identifier for the second station.
        - max_time: Maximum time window to search for matching occurrences (in minutes).
    
    Returns:
        DataFrame with columns 'gabarit', 'ts' (timestamp), and 'station1_station2'
        representing the time difference (in minutes) between corresponding occurrences in
        station1 and station2 within the specified time window.
    """
    
    df_bet = pd.DataFrame(columns=['gabarit', f'_{station1}_{station2}'])
    df_bet['ts'] = pd.NaT  # initialize as datetime type

 
    if (station1 != 3 and station2 != 4): ##1 2 or 2 3 
        column_order = ['ts', 'gabarit', f'_{station1}_{station2}']
        df_bet = df_bet[column_order]
        df_bet = find_matching_gabarit(df1, df2, station1, station2, max_time, df_bet)
    else:
        df_bet['model'] = ''
        df_bet['state'] = ''
        df_bet['ts_final'] = ''
        column_order = ['ts', 'ts_final', 'gabarit', 'model', f'_{station1}_{station2}','state']
        df_bet = df_bet[column_order]
        df_bet = find_matching_gabarit_final(df1, df2, df_bet)

    return df_bet


def find_matching_gabarit(df1,df2,station1, station2, max_time, df_bet):

    cnt = 0
            
    for index in range(df1.shape[0]):
        start_time = df1.loc[index, 'ts']
        end_time = start_time + pd.Timedelta(minutes=max_time)
        matching_rows = df2[(df2['ts'] >= start_time) & 
                            (df2['ts'] <= end_time) 
                                & (df2['gabarit'] == df1.loc[index,'gabarit'])]
        if not matching_rows.empty:
            first_occurrence = matching_rows.iloc[0]
            
            # df1: initial time stamp, time_in_station_1
            # df2: initial time stamp, time_in_station_2
            if (station1 == 1 and station2 == 2):
                time_bet = (first_occurrence['time_in_station'] +
                        (first_occurrence['ts'] - df1.loc[index, 'ts']).total_seconds()) # in seconds
            else:
                time_bet = ((first_occurrence['ts'] - df1.loc[index, 'ts'] # in seconds
                            - pd.Timedelta(seconds = df1.loc[index, 'time_in_station'])).total_seconds())

    
            df_bet.loc[cnt, 'ts'] = df1.loc[index, 'ts']  # Copy the timestamp for reference
            df_bet.loc[cnt, 'gabarit'] = first_occurrence['gabarit']
            df_bet.loc[cnt, f'_{station1}_{station2}'] = time_bet #round,2
            cnt += 1
        
    return df_bet

def find_matching_gabarit_final(df1, df2, df_bet):

    cnt = 0
            
    for index in range(df1.shape[0]):
        start_time = (df1.loc[index, 'ts'] + pd.Timedelta(seconds = df1.loc[index, 'time_in_station'])
                      + pd.Timedelta(seconds=30)) # not possible to have a test that takes less than 30 seconds
        matching_rows = df2[(df2['Stamp'] >= start_time)]


        if not matching_rows.empty:
            first_occurrence = matching_rows.iloc[0]
            
            # df1: initial time stamp, time_in_station_1
            # df2: initial time stamp, time_in_station_2
            time_bet = ((first_occurrence['Stamp'] - df1.loc[index, 'ts']).total_seconds()) # in minutes

            df_bet.loc[cnt, 'ts'] = df1.loc[index, 'ts']  # Copy the timestamp for reference
            df_bet.loc[cnt, 'ts_final'] = first_occurrence['Stamp']
            df_bet.loc[cnt, 'gabarit'] = df1.loc[index,'gabarit']
            #print(cnt)
            #print(type(first_occurrence['Model']))
            #print(first_occurrence['Model'])
            df_bet.loc[cnt, 'model'] = str(first_occurrence['Model'])
            df_bet.loc[cnt, 'state'] = str(first_occurrence['Status'])
            df_bet.loc[cnt, f'_3_4'] = time_bet
            cnt += 1
        
    return df_bet



display_df_ = 1

df_bet_12 = time_between_station(df_in_1, df_in_2, 1, 2, max_time = 5) #(max time in minutes)
df_bet_12 = df_bet_12[df_bet_12["_1_2"] > 10]
df_bet_12.reset_index(drop=True, inplace=True)

df_bet_23 = time_between_station(df_in_2, df_in_3, 2, 3, max_time = 20)
df_bet_34 = time_between_station(df_in_3, df4, 3, 4, max_time = 0)
df_bet_34['_3_4'] = df_bet_34['_3_4'].astype(int)
df_bet_34 = remove_top_n_outliers (df_bet_34,'_3_4',3)

if display_df_:
    display(df_bet_12.loc[:20])
    display(df_bet_23.loc[:20])
    display(df_bet_34.loc[:20])





Unnamed: 0,ts,gabarit,_1_2
0,2023-10-10 09:08:20,24,34.0
1,2023-10-10 09:30:29,5,198.0
2,2023-10-10 09:33:00,6,76.0
3,2023-10-10 09:34:15,11,25.0
4,2023-10-10 09:39:10,10,36.0
5,2023-10-10 09:40:18,2,96.0
6,2023-10-10 09:41:55,3,32.0
7,2023-10-10 09:43:09,12,55.0
8,2023-10-10 09:45:45,8,40.0
9,2023-10-10 09:47:17,6,31.0


Unnamed: 0,ts,gabarit,_2_3
0,2023-10-10 09:08:41,24,916.0
1,2023-10-10 09:12:29,11,768.0
2,2023-10-10 09:13:26,22,828.0
3,2023-10-10 09:22:14,10,502.0
4,2023-10-10 09:23:53,3,511.0
5,2023-10-10 09:30:46,5,130.0
6,2023-10-10 09:39:45,10,196.0
7,2023-10-10 09:42:26,3,168.0
8,2023-10-10 09:44:03,12,122.0
9,2023-10-10 09:46:18,8,196.0


Unnamed: 0,ts,ts_final,gabarit,model,_3_4,state
0,2023-10-10 09:24:10,2023-10-10 09:27:53,24,Y,223,OK
1,2023-10-10 09:25:18,2023-10-10 09:27:53,11,Y,155,OK
2,2023-10-10 09:27:20,2023-10-10 09:27:53,22,Y,33,OK
3,2023-10-10 09:30:37,2023-10-10 09:31:15,10,Y,38,OK
4,2023-10-10 09:32:25,2023-10-10 09:34:30,3,Y,125,OK
5,2023-10-10 09:35:57,2023-10-10 09:37:04,5,Y,67,OK
6,2023-10-10 09:36:48,2023-10-10 09:37:59,8,Y,71,OK
7,2023-10-10 09:39:50,2023-10-10 09:42:35,15,Y,165,OK
8,2023-10-10 09:41:06,2023-10-10 09:42:35,19,Y,89,OK
9,2023-10-10 09:43:02,2023-10-10 09:44:55,10,Y,113,OK


In [58]:
def perc_of_gabarits_found(df_in, df4, station):
    gab_scanned = df_in.shape[0]
    gab_factory = df4.shape[0]
    perc = gab_scanned / gab_factory * 100
    perc = round(perc,2)
    
    if station == 4:
        #display(df4)
        #display(df_in) 
        ok_count = df_in['state'].value_counts().get('OK', 0)
        #print(f'ok: {ok_count}')
        ko_count =gab_scanned - ok_count
        perc_ok = round(ok_count / gab_scanned *100)
        perc_ko = round(100 - perc_ok)
        print(f"Percentage of gabarits scanned in every sensor: {perc}%, which of those occured OK:{ok_count} and KO:{ko_count}")
    else: 
        print(f"Percentage of gabarits scanned in sensor {station}: {perc}%")
    
    return perc


perc1 = perc_of_gabarits_found(df_in_1, df4, 1)
perc2 = perc_of_gabarits_found(df_in_2, df4, 2)
perc3 = perc_of_gabarits_found(df_in_3, df4, 3)


Percentage of gabarits scanned in sensor 1: 85.26%
Percentage of gabarits scanned in sensor 2: 45.83%
Percentage of gabarits scanned in sensor 3: 62.82%


In [67]:

def all_time_between_station(df12, df23, df34):
    #df_bet_all = pd.DataFrame(columns = ['ts1','_12','ts2', 'ts2','_23','ts3','time_bet_34','gabarit','model'])
    df_bet_all = pd.DataFrame(columns = ['ts_1','ts_2', 'ts_3','ts_4','gabarit', 'model','_1_2','_2_3','_3_4','state']) 
    #display(df12)
    #display(df23)
    #display(df34)

    cnt = 0
    for index in range(df12.shape[0]):

        start_time = (df12.loc[index, 'ts'] + pd.Timedelta(seconds = df12.loc[index, f'_1_2'])
                       - pd.Timedelta(seconds=2))
        end_time = start_time + pd.Timedelta(seconds=3)

        matching_rows = df23[(df23['ts'] >= start_time) & (df23['ts'] <= end_time) 
                                & (df23['gabarit'] == df12.loc[index,'gabarit'])]
        if not matching_rows.empty:
            
            first_occurrence = matching_rows.iloc[0]

            start_time2 = (first_occurrence['ts'] + pd.Timedelta(seconds = first_occurrence[f'_2_3']) 
                          - pd.Timedelta(seconds=1))
            end_time2 = start_time2 +   pd.Timedelta(seconds=3)
            matching_rows2 = df34[(df34['ts'] >= start_time2) & (df34['ts'] <= end_time2) 
                                & (df34['gabarit'] == first_occurrence['gabarit'])]
            
            if not matching_rows2.empty:

                first_occurrence2 = matching_rows2.iloc[0]
                df_bet_all.loc[cnt, 'ts_1'] = df12.loc[index, 'ts']  # Copy the timestamp for reference
                df_bet_all.loc[cnt, 'gabarit'] = first_occurrence['gabarit']
                df_bet_all.loc[cnt, '_1_2'] = df12.loc[index, '_1_2'] 
                df_bet_all.loc[cnt, '_2_3'] = first_occurrence['_2_3'] 
                df_bet_all.loc[cnt, 'ts_2'] = first_occurrence['ts']
                df_bet_all.loc[cnt, 'ts_3'] = first_occurrence2['ts']
                df_bet_all.loc[cnt, 'ts_4'] = first_occurrence2['ts_final']
                df_bet_all.loc[cnt, '_3_4'] = first_occurrence2['_3_4']
                df_bet_all.loc[cnt, 'model'] = str(first_occurrence2['model'])
                df_bet_all.loc[cnt, 'state'] = str(first_occurrence2['state'])
            
                cnt += 1

    return df_bet_all



def plot_time_in_station(df):
    #display(px.data.medals_wide())

    df['_1_2'] = df['_1_2'].astype(int)
    df['_2_3'] = df['_2_3'].astype(int)
    df['_3_4'] = df['_3_4'].astype(int)
    melted_df = pd.melt(df, id_vars=['ts_1','ts_2','ts_3','ts_4','model','state'], value_vars=['_1_2','_2_3','_3_4'],var_name='process', value_name='time')    
    melted_df['color'] = melted_df['model'] + melted_df['process']
    color_discrete_map = {"Y_1_2": 'darkblue',
                          "Y_2_3": 'dodgerblue', 
                          "Y_3_4": 'lightskyblue',

                          "X_1_2": 'maroon',
                          "X_2_3":'red',
                          "X_3_4": 'lightcoral',

                          "Z_1_2": 'forestgreen',
                          "Z_2_3": 'limegreen', 
                          "Z_3_4": 'lightgreen',}
    
    
    melted_df["time"] = melted_df["time"] /60
    fig = px.bar(melted_df, x="ts_1", y="time", 
                labels={"ts_1": "Timestamp", "time": "Time (minutes)"},
                color = "color",
                color_discrete_map = color_discrete_map,
                title="Time Between Stations")
    fig.update_traces(width=30000)
    
    fig.update_layout(legend_title_text="Model_time between stations")
    #fig.update_layout(legend=dict( yanchor="top", y=0.95,xanchor="left",x=0.70))
    fig.update_layout(yaxis_range=[0, 25], xaxis_range = [date_year + ' 15:00:00', date_year + ' 18:00:00'])#y_max + 1])
    fig.update_xaxes(showgrid=True) 
    fig.update_yaxes(range=[0,10]) 
    fig.update_layout(font=dict(size=14))
    fig.update_layout(width=1050, height = 500) 
    
    fig.write_image("images/time_in_stat_" + dataset + "_" + day + "_" + month + "_" + ".svg")
    display(melted_df.loc[:10])
    average_time_1_2 = melted_df[melted_df['process'] == '_1_2']['time'].mean()
    average_time_2_3 = melted_df[melted_df['process'] == '_2_3']['time'].mean()
    average_time_2_3 += average_time_1_2
    average_time_3_4 = melted_df[melted_df['process'] == '_3_4']['time'].mean()
    average_time_3_4 += average_time_2_3

    fig.add_trace(go.Scatter(x=[melted_df['ts_1'].iloc[0], melted_df['ts_1'].iloc[-1] + pd.to_timedelta("00:10:00")], y=[average_time_1_2, average_time_1_2], mode='lines', 
                             name='Average time process 1', line_color='maroon', line=dict(dash='dot')))
    fig.add_trace(go.Scatter(x=[melted_df['ts_1'].iloc[0], melted_df['ts_1'].iloc[-1] + pd.to_timedelta("00:10:00")], y=[average_time_2_3, average_time_2_3], mode='lines', 
                             name='Average time process 2', line_color='red', line=dict(dash='dot')))
    
    fig.add_trace(go.Scatter(x=[melted_df['ts_1'].iloc[0], melted_df['ts_1'].iloc[-1] + pd.to_timedelta("00:10:00")], y=[average_time_3_4, average_time_3_4], mode='lines', 
                             name='Average time process 3', line_color='lightcoral', line=dict(dash='dot')))
    
    
    fig.show()


def plot_time_horizontal(df):
    #display(px.data.medals_wide())

    df['_1_2'] = df['_1_2'].astype(int)
    df['_2_3'] = df['_2_3'].astype(int)
    df['_3_4'] = df['_3_4'].astype(int)
    melted_df = pd.melt(df, id_vars=['ts_1','ts_2','ts_3','ts_4','model','state'], value_vars=['_1_2','_2_3','_3_4'],var_name='process', value_name='time')    

    melted_df['color'] = melted_df['model'] + melted_df['process']
    #print(melted_df)
    color_discrete_map = {"Y_1_2": 'orangered',
                          "Y_2_3": 'orange', 
                          "Y_3_4": '#FFD68A',

                          "X_1_2": 'darkblue',
                          "X_2_3":'dodgerblue',
                          "X_3_4": 'lightskyblue',

                          "Z_1_2": 'forestgreen',
                          "Z_2_3": 'limegreen', 
                          "Z_3_4": 'lightgreen',}
    
    
    melted_df["time"] = melted_df["time"] /60
    #melted_df = melted_df.reset_index(drop=True)
    #melted_df['index'] = (melted_df.index // 3)

    fig = px.bar(melted_df, x="ts_1", y= "time" , 
                labels={'ts_1': "Timestamp", "time": "Time (minutes)"},
                color = "color",
                color_discrete_map = color_discrete_map,
                title="Time of processes")#, orientation = 'h')
    
    #melted_df["base"] = (melted_df['ts_1'] - melted_df['ts_1'].shift()).dt.total_seconds() / 60
    #melted_df['base'] = melted_df['base'].cumsum()
    #display(melted_df.loc[:40])
    #fig.update_traces(base=melted_df["base"])
    fig.update_traces(width=20000)
    fig.update_layout(legend_title_text="Model_time between stations")
    fig.update_layout(legend=dict( yanchor="top", y=-0.1,xanchor="left",x=0.75))
    fig.update_layout(xaxis_range = [date_year + ' 11:38:00', date_year + ' 12:50:00'])#y_max + 1])
    fig.update_xaxes(showgrid=True) 
    fig.update_yaxes(range=[0,10]) 
    fig.update_layout(font=dict(size=14))
    fig.update_layout(width=1050, height = 600) 
    average_time_1_2 = melted_df[melted_df['process'] == '_1_2']['time'].mean()
    average_time_2_3 = melted_df[melted_df['process'] == '_2_3']['time'].mean()
    average_time_2_3 += average_time_1_2
    average_time_3_4 = melted_df[melted_df['process'] == '_3_4']['time'].mean()
    average_time_3_4 += average_time_2_3

    fig.add_trace(go.Scatter(x=[melted_df['ts_1'].iloc[0], melted_df['ts_1'].iloc[-1] + pd.to_timedelta("00:10:00")], y=[average_time_1_2, average_time_1_2], mode='lines', 
                             name='Average time process 1', line_color='orangered', line=dict(dash='dot'))) #'darkblue'
    fig.add_trace(go.Scatter(x=[melted_df['ts_1'].iloc[0], melted_df['ts_1'].iloc[-1] + pd.to_timedelta("00:10:00")], y=[average_time_2_3, average_time_2_3], mode='lines', 
                             name='Average time process 2', line_color='orange', line=dict(dash='dot'))) #'dodgerblue'
    
    fig.add_trace(go.Scatter(x=[melted_df['ts_1'].iloc[0], melted_df['ts_1'].iloc[-1] + pd.to_timedelta("00:10:00")], y=[average_time_3_4, average_time_3_4], mode='lines', 
                             name='Average time process 3', line_color='#FFD68A', line=dict(dash='dot'))) #'lightskyblue'
     
    
    
    fig.write_image("images/time_in_stat_" + dataset + "_" + day + "_" + month + "_" + ".pdf")
    #display(melted_df.loc[:10])

    fig.show()


df_bet_all_ = all_time_between_station(df_bet_12, df_bet_23,df_bet_34)
df_bet_all_['model'] = df_bet_all_['model'].replace({'SPT130023W': 'Y', 'SPT120018W': 'X', 'SPT140034W': 'Z'})
#display(df_bet_all_.loc[:100])
mask = df_bet_all_['ts_4'] >= df_bet_all_['ts_4'].shift(-1)
df_bet_all = df_bet_all_[~mask]


perc_all = perc_of_gabarits_found(df_bet_all, df4, 4)
plot_time_horizontal(df_bet_all)
#plot_time_in_station(df_bet_all)
#display(df_bet_all.loc[:500])
    

Percentage of gabarits scanned in every sensor: 15.06%, which of those occured OK:47 and KO:0




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call 

In [20]:
def avg_time_fun(df):
    # Group by 'model' and calculate the median for each group in df_med
    df_avg = pd.DataFrame(columns = ['date','model','_1_2', '_2_3', '_3_4','total_time'])

    df['ts_1'] = pd.to_datetime(df['ts_1'])
    date = df['ts_1'].dt.date

    df['total_time'] = df[['_1_2', '_2_3', '_3_4']].sum(axis=1)
    df_plot = df
    #display(df.loc[:20])

    df = df.drop(['ts_1', 'ts_2', 'ts_3', 'ts_4', 'gabarit','state'], axis=1) #doesn't matter
  
    avg_total_time = df.groupby('model').mean()
    df_avg = avg_total_time.reset_index()
    df_avg['date'] = date

    return df_plot, df_avg


def plot_ok_ko(df):
        #display(melted_df)
    color_discrete_map = {"OK": 'limegreen',
                          "KO": 'red'}
    #display(df)
    df.loc[:,'total_time'] = round(df['total_time']/60,2)
    fig = px.bar(df, x="ts_4", y='total_time', 
                labels={"ts_4": "Timestamp", 'total_time': "Time (minutes)"},
                #
                color = 'state',
                #base = 'ts',
                color_discrete_map = color_discrete_map,
                #base = 'state',
                #width=10, # customize width here
                #argap = 0.5,
                title="Total time of sensorized production (morning)",)
    target_date = pd.to_datetime(f"2023-{month}-{day}")

    fig.update_xaxes(range=[target_date + pd.to_timedelta("09:00:00"), target_date + pd.to_timedelta("13:00:00")])
  # Set your desired date range
    fig.show()

    fig2 = px.bar(df, x="ts_4", y='total_time', 
                labels={"ts_4": "Timestamp", 'total_time': "Time (minutes)"},
                color = 'state',
                color_discrete_map = color_discrete_map,
                #width=10, # customize width here
                #argap = 0.5,
                title="Total time of sensorized production (afternoon)")
    target_date = pd.to_datetime(f"2023-{month}-{day}")

    fig2.update_xaxes(range=[target_date + pd.to_timedelta("14:00:00"), target_date + pd.to_timedelta("18:00:00")])
  # Set your desired date range
    fig2.show()

#display(df_bet_all)
#median_time,median_total_time = 
df_sum_time, df_med = avg_time_fun(df_bet_all)

#df_sum_time = remove_outlier_quart(df_sum_time, 'total_time')
#plot_ok_ko(df_sum_time)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



This section will now gather a range of days and calculate the time in and between stations and then present the average per day.


In [22]:
start_date = '25/09/2023'
end_date = '11/10/2023'

start_date = pd.to_datetime(start_date, format='%d/%m/%Y')
end_date = pd.to_datetime(end_date, format='%d/%m/%Y')

# Create a date range
date_range = pd.date_range(start=start_date, end=end_date)

df_all_med = pd.DataFrame()

def aggregate_dates(df_all_med,date_range):
    
    print_v = 0
    for date in date_range:

        day = str(date.day).zfill(2)
        month = str(date.month).zfill(2)
        date_ = day + "_" + month
        print(date_)

        df1 = process_file("1",date_)
        if df1.empty:
            print("No data in that date")
            continue            
        df2 = process_file("2",date_)
        df3 = process_file("3",date_)
        df4 = process_file("final",date_)
    
        if df4.empty:
            print("No testing data in that date")
            continue     
        if print_v:
            print(df1)
            print(df2)
            print(df3)
            print(df4)

        df4['Time_Diff'] = df4['Stamp'].diff().dt.total_seconds()
        df4 = remove_top_n_outliers(df4, 'Time_Diff', 4)
        df4.loc[:,'Time_Diff'] = round(df4['Time_Diff']/60,2)


        df_in_1 = time_in_station_fun(df1)
        df_in_2 = time_in_station_fun(df2)
        df_in_3 = time_in_station_fun(df3)
        if print_v:
            print(df_in_1)
            print(df_in_2)
            print(df_in_3)


        df_bet_12 = time_between_station(df_in_1, df_in_2, 1, 2, max_time = 5)
        df_bet_23 = time_between_station(df_in_2, df_in_3, 2, 3, max_time = 20)
        df_bet_34 = time_between_station(df_in_3, df4, 3, 4, max_time = 0)
        if print_v:
            print(df_bet_12)
            print(df_bet_23)
            print(df_bet_34)

        df_bet_34['_3_4'] = df_bet_34['_3_4'].astype(int)
        df_bet_34 = remove_top_n_outliers (df_bet_34,'_3_4',3)


        perc1 = perc_of_gabarits_found(df_in_1, df4, 1)
        perc2 = perc_of_gabarits_found(df_in_2, df4, 2)
        perc3 = perc_of_gabarits_found(df_in_3, df4, 3)

        df_bet_all_ = all_time_between_station(df_bet_12, df_bet_23,df_bet_34)
        df_bet_all_['model'] = df_bet_all_['model'].replace({'SPT130023W': 'Y', 'SPT120018W': 'X', 'SPT140034W': 'Z'})
        mask = df_bet_all_['ts_4'] >= df_bet_all_['ts_4'].shift(-1)
        df_bet_all = df_bet_all_[~mask]
        perc4 = perc_of_gabarits_found(df_bet_all, df4, 4)
        if print_v:
            print(df_bet_all)
            display(df_med)
        _,df_med = avg_time_fun(df_bet_all)
        

        df_all_med = pd.concat([df_all_med, df_med])


    return df_all_med

df_all_med = aggregate_dates(df_all_med,date_range)
df_all_med['total_time'] = df_all_med['total_time'] /60
df_all_med['_1_2'] = df_all_med['_1_2'] /60
df_all_med['_2_3'] = df_all_med['_2_3'] /60
df_all_med['_3_4'] = df_all_med['_3_4'] /60






25_09
Percentage of gabarits scanned in sensor 1: 31.44%
Percentage of gabarits scanned in sensor 2: 65.55%
Percentage of gabarits scanned in sensor 3: 46.15%
Percentage of gabarits scanned in every sensor: 10.37%, which of those occured OK:30 and KO:1
26_09
No data in that date
27_09




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Percentage of gabarits scanned in sensor 1: 82.01%
Percentage of gabarits scanned in sensor 2: 23.17%
Percentage of gabarits scanned in sensor 3: 48.78%
Percentage of gabarits scanned in every sensor: 8.84%, which of those occured OK:28 and KO:1
28_09




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Percentage of gabarits scanned in sensor 1: 23.15%
Percentage of gabarits scanned in sensor 2: 32.8%
Percentage of gabarits scanned in sensor 3: 45.34%
Percentage of gabarits scanned in every sensor: 7.07%, which of those occured OK:21 and KO:1
29_09
No data in that date
30_09
No data in that date
01_10
No data in that date
02_10




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Percentage of gabarits scanned in sensor 1: 51.19%
Percentage of gabarits scanned in sensor 2: 44.75%
Percentage of gabarits scanned in sensor 3: 40.0%
Percentage of gabarits scanned in every sensor: 16.27%, which of those occured OK:46 and KO:2
03_10




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Percentage of gabarits scanned in sensor 1: 75.0%
Percentage of gabarits scanned in sensor 2: 60.06%
Percentage of gabarits scanned in sensor 3: 48.05%
Percentage of gabarits scanned in every sensor: 24.35%, which of those occured OK:72 and KO:3
04_10
No data in that date
05_10
No data in that date
06_10




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Percentage of gabarits scanned in sensor 1: 73.31%
Percentage of gabarits scanned in sensor 2: 11.14%
Percentage of gabarits scanned in sensor 3: 48.97%
Percentage of gabarits scanned in every sensor: 3.52%, which of those occured OK:12 and KO:0
07_10
No data in that date
08_10
No data in that date
09_10




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Percentage of gabarits scanned in sensor 1: 75.52%
Percentage of gabarits scanned in sensor 2: 51.38%
Percentage of gabarits scanned in sensor 3: 61.03%
Percentage of gabarits scanned in every sensor: 20.69%, which of those occured OK:56 and KO:4
10_10




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Percentage of gabarits scanned in sensor 1: 86.08%
Percentage of gabarits scanned in sensor 2: 46.28%
Percentage of gabarits scanned in sensor 3: 63.43%
Percentage of gabarits scanned in every sensor: 19.42%, which of those occured OK:60 and KO:0
11_10




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Percentage of gabarits scanned in sensor 1: 79.62%
Percentage of gabarits scanned in sensor 2: 27.92%
Percentage of gabarits scanned in sensor 3: 64.53%
Percentage of gabarits scanned in every sensor: 9.06%, which of those occured OK:23 and KO:1




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [12]:
"""
def plot_time_in_station_days(df):
    display(df)
    
    color_discrete_map_ = {"Y": 'dodgerblue',
                          "X": 'red',
                          "Z": 'limegreen'
                          }


    # Create a bar plot for each model
    df['total_time'] = df['total_time']/60
    #df['med_total_time'] = df['med_total_time'].astype(int)
    fig = px.bar(df, x='date', y='total_time', 
                 color='model', 
                 color_discrete_map = color_discrete_map_,
                 barmode='group')

    # Update layout for better visualization

    fig.update_layout(
        title='Median Total Time by Model',
        xaxis_title='Date',
        yaxis_title='Median Total Time (minutes)',
        legend_title='Model',
        barmode='group'
    )

    # Show the plot
    fig.show()

    melted_df = pd.melt(df, id_vars=['date','model','total_time'], value_vars=['_1_2','_2_3','_3_4'],var_name='time_bet', value_name='time')    
    melted_df['color'] = melted_df['model'] + melted_df['time_bet']
    color_discrete_map = {"Y_1_2": 'darkblue',
                          "Y_2_3": 'dodgerblue', 
                          "Y_3_4": 'lightskyblue',

                          "X_1_2": 'maroon',
                          "X_2_3":'red',
                          "X_3_4": 'lightcoral',

                          "Z_1_2": 'forestgreen',
                          "Z_2_3": 'limegreen', 
                          "Z_3_4": 'lightgreen'}



    #model	time_bet_1_2, time_bet_2_3, time_bet_3_4, avg_total_time, date
    
    for model in melted_df['model'].unique():
        df_model = melted_df[melted_df['model'] == model]

        df_model.loc[:,'time_'] = df_model['time']/60
        print("yo")
        #df_model['time'] = df_model['time'].astype(int)
        # Create a bar plot for the current model
        fig4 = px.bar(df_model, x='date', y='time_', 
                    color='color',
                    labels={'time': 'Time'},
                    color_discrete_map = color_discrete_map,
                    title=f'Time per process for Model: {model}')
        


        #
        fig4.update_xaxes(range=[start_date - pd.Timedelta(days = 1), end_date + pd.Timedelta(days = 1)])  # Set your desired date range


        # Update layout for better visualization
        fig4.update_layout(
            xaxis_title='Date',
            yaxis_title='Time per station (minutes)',
            legend_title='Time Bet',
        )

        # Show the plot
        fig4.show()


plot_time_in_station_days(df_all_med)
"""

'\ndef plot_time_in_station_days(df):\n    display(df)\n    \n    color_discrete_map_ = {"Y": \'dodgerblue\',\n                          "X": \'red\',\n                          "Z": \'limegreen\'\n                          }\n\n\n    # Create a bar plot for each model\n    df[\'total_time\'] = df[\'total_time\']/60\n    #df[\'med_total_time\'] = df[\'med_total_time\'].astype(int)\n    fig = px.bar(df, x=\'date\', y=\'total_time\', \n                 color=\'model\', \n                 color_discrete_map = color_discrete_map_,\n                 barmode=\'group\')\n\n    # Update layout for better visualization\n\n    fig.update_layout(\n        title=\'Median Total Time by Model\',\n        xaxis_title=\'Date\',\n        yaxis_title=\'Median Total Time (minutes)\',\n        legend_title=\'Model\',\n        barmode=\'group\'\n    )\n\n    # Show the plot\n    fig.show()\n\n    melted_df = pd.melt(df, id_vars=[\'date\',\'model\',\'total_time\'], value_vars=[\'_1_2\',\'_2_3\',\'_3_4\'],va

In [73]:
def plot_time_in_station_days(df):
    #display(df)
    
    color_discrete_map_ = {"Y": 'orange',
                          "X": 'dodgerblue',
                          "Z": 'limegreen'
                          }


    # Create a bar plot for each model
    #df['med_total_time'] = df['med_total_time'].astype(int)
    fig1 = px.bar(df, x='date', y='total_time', 
                 color='model', 
                 color_discrete_map = color_discrete_map_,
                 barmode='group')

    # Update layout for better visualization

    fig1.update_layout(
        title='Average total time by todel',
        xaxis_title='Date',
        yaxis_title='Time (minutes)',
        legend_title='Model',
        barmode='group'
    )


    # Show the plot
    fig1.show()



    melted_df = pd.melt(df, id_vars=['date','model','total_time'], value_vars=['_1_2','_2_3','_3_4'],var_name='time_bet', value_name='time')    
    melted_df['color'] = melted_df['model'] + melted_df['time_bet']
    color_discrete_map = {"Y_1_2": 'orangered',
                          "Y_2_3": 'orange', 
                          "Y_3_4": '#FFD68A',
                          
                          "X_1_2": 'darkblue',
                          "X_2_3":'dodgerblue',
                          "X_3_4": 'lightskyblue',

                          "Z_1_2": 'forestgreen',
                          "Z_2_3": 'limegreen', 
                          "Z_3_4": 'lightgreen',}
    


    combined_df = pd.DataFrame()

    #
    for model in melted_df['model'].unique():
        df_model = melted_df[melted_df['model'] == model]
        combined_df = pd.concat([combined_df, df_model], ignore_index=True)

    # Create a bar plot for all models
    fig = px.bar(combined_df, x='date', y='time',
                color='color',
                labels={'time': 'Time (minutes)'},
                color_discrete_map=color_discrete_map,
                title='Average time per process')

    # Set your desired date range for x-axis


    # Update layout for better visualization
    fig.update_layout(
        xaxis_title='Date',
        yaxis_title='Time (minutes)',
        legend_title='Model_time between stations',
    )

    fig.update_xaxes(showgrid=True) 
    fig.update_layout(font=dict(size=14))
    fig.update_layout(width=1050, height = 500) 
    if save:
        fig.write_image("images/avg_time_" + dataset + "_all_days" + ".pdf")


    # Show the combined plot
    fig.show()
save = 1
plot_time_in_station_days(df_all_med)

In [25]:
#df_final differences df4


def plot_ok_ko_factory(df):
        #display(melted_df)
    color_discrete_map = {'Scanned OK': 'limegreen',
                          'Scanned KO': 'red',
                          'Not scanned': 'lightgray'
                          }
    #display(df)
    
    fig = px.bar(df, x="Stamp", y='Time_Diff', 
                labels={"Stamp": "Timestamp", 'Time_Diff': "Time differences (minutes)"},
                color = "State",
                color_discrete_map = color_discrete_map,
                title="Products scanned in all nodes",)
    target_date = pd.to_datetime(f"2023-{month}-{day}")

    average_time_diff = df['Time_Diff'].mean()
    fig.add_trace(go.Scatter(x=[df4['Stamp'].iloc[0], df4['Stamp'].iloc[-1] + pd.to_timedelta("00:05:00")], y=[average_time_diff, average_time_diff], mode='lines', 
                             name='Average time'))

    fig.update_xaxes(range=[target_date + pd.to_timedelta("09:00:00"), target_date + pd.to_timedelta("13:00:00")])
    fig.update_yaxes(range=[0,12.5])
    fig.update_xaxes(showgrid=True) 
    fig.update_layout(font=dict(size=14))
    fig.update_layout(width=1050, height = 500) 
  # Set your desired date range
    fig.show()
    fig2 = fig
    fig2.update_xaxes(range=[target_date + pd.to_timedelta("15:00:00"), target_date + pd.to_timedelta("16:20:00")])
    fig2.update_layout(legend=dict( yanchor="top", y=0.96,xanchor="left",x=0.02))

    #fig.update_traces(width=30000)
  # Set your desired date range
    if save:
        fig2.write_image("images/scanned_okko.pdf")
    fig2.show()

save = 1

df4['check'] = df4['Stamp'].isin(df_bet_all['ts_4']).astype(int)

df4['State'] = np.where(df4['check'] == 0, 'Not scanned', 'Scanned ' + df4['Status'])
df4 = df4.drop('check', axis=1)#print(df4)


#ok ko verde e vermelho
# ver quais demorou mais 

In [26]:
plot_ok_ko_factory(df4)


The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result

