In [68]:
import plotly.express as px
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go

#import csv
pd.set_option('display.max_rows', None)  # Display all rows


In [69]:
if not os.path.exists("images"):
    os.mkdir("images")

day = "25"
month = "09"
dataset = "rfid"

date = day + "_" + month #change date accordingly
date_year = "2023-" + month + "-" + day

folder_path = dataset + '_' + date + '/'


def process_file(file_number, date):
    file_path = os.path.join(folder_path, f'rfid_{date}_{file_number}.csv')

    if file_number == "final":
        df = pd.read_csv(file_path, parse_dates=['Stamp'])
        if df.shape[1] == 4:
            df = df.drop(df.columns[3], axis=1)
            df = df.drop(df.columns[1], axis=1)

    else:
        df = pd.read_csv(file_path, parse_dates=['ts'])

        # Remove first empty column if it exists
        if df.shape[1] == 3:
            df = df.drop(df.columns[0], axis=1)
        # Remove entries with 'gabarit' equal to 0
        df= df[df['gabarit'] != 0]

        # Write the updated DataFrame back to the CSV file
        df.to_csv(file_path, index = False)
        # Reset the index of the DataFrame
        df = df.reset_index(drop = True)

    return df

df1 = process_file("1",date)
df2 = process_file("2",date)
df3 = process_file("3",date)
df4 = process_file("final",date)



In [70]:

def time_in_station_fun(df):
    """
    Calculates and records the time spent in a station for each occurrence of a 'gabarit' in a DataFrame in seconds.

    Parameters:
        - df: DataFrame containing timestamps and 'gabarit' values.

    Returns:
        DataFrame with an added 'time_in_station' column representing the time
        spent in the station for each occurrence (in seconds).
    """
    
    curr_gabarit = 0
    index_i = 0  # Initialize index_i outside the loop
    index_f = 0  # Initialize index_f outside the loop
    cnt = 0
    df_in = pd.DataFrame(columns=['ts', 'gabarit', 'time_in_station'])
    #df_in['ts'] = pd.to_datetime(df['ts'])

    for i in range(df.shape[0]): #0 to 129
        if i != 0: # not first index
            if ((df.loc[i, 'gabarit'] != curr_gabarit)):
                # routine to present final result
                df_in, cnt = calc_time_in_station(df, df_in, index_i, index_f, cnt)
            
          
            # new gabarit is found, i = f
            if df.loc[i, 'gabarit'] != df.loc[i-1, 'gabarit']:
                curr_gabarit = df.loc[i, 'gabarit']
                index_i = i # starts the interval
                index_f = i
        else: # for first entry, i = f
            curr_gabarit = df.loc[i, 'gabarit']
            index_i = i
            index_f = i

        
        if i != df.shape[0] - 1: # not last index 
            if df.loc[i, 'gabarit'] == df.loc[i+1, 'gabarit']:
                index_f = i+1
        else:
            index_f = i           
            df_in, cnt = calc_time_in_station(df, df_in, index_i, index_f, cnt)

    return df_in


# calculates time and writes it in dataframe
def calc_time_in_station(df, df_in, index_i, index_f, cnt):
    if index_f - index_i == 0:
        time_in_station = 1
    else: 
        time_in_station = ((df.loc[index_f, 'ts'] - df.loc[index_i, 'ts']).total_seconds() + 1)

    #print(f"time_in_station is {time_in_station}")
    df_in.loc[cnt,'ts'] = df.loc[index_i, 'ts']
    df_in.loc[cnt,'gabarit'] = df.loc[index_i, 'gabarit']
    df_in.loc[cnt,'time_in_station'] = time_in_station

    #df.loc[index_f, 'time_in_station'] = round(time_in_station,2)
    cnt += 1

    return df_in, cnt


def plot_time_in_station(df, station):
    df = df[df['time_in_station'] != 0]
    # Create a scatter plot using Plotly Express
    if station == 1:
        title_plot_var = 'Begginning of Stage 3'
    elif station == 2:
        title_plot_var = 'End of Stage 3'
    elif station == 3:
        title_plot_var = 'Begginning of Stage 7'

    title_plot = 'Plot of Time in ' + title_plot_var + ' over Time'

    ts_values = np.array(df['ts'])
    df['time_in_station'] = df['time_in_station'] / 60

    # Create a scatter plot using go.Scatter
    scatter = go.Scatter(x=ts_values, y=df['time_in_station'], mode='markers', 
                        name='Scatter Plot', marker=dict(color='blue'))

    # Create a layout
    layout = go.Layout(
        title=title_plot,
        xaxis=dict(title='Timestamp', range=[date_year + ' 09:00:00', date_year + ' 18:00:00']),  # Set the range for the x-axis
        yaxis=dict(title='Time in Station', range=[-0.2, 3])  # Set the range for the y-axis
    )

    # Create a figure and add the scatter trace
    fig = go.Figure(data=[scatter], layout=layout)

    # Show the plot
    fig.show()
    fig.write_image("images/fig_" + dataset + "_" + day + "_" + month + "_" + str(station) + ".svg")



In [71]:
def remove_outliers(df, max_time):
    # Ensure the column 'time_in_station' exists
    if 'time_in_station' not in df.columns:
        print("Error: 'time_in_station' column not found.")
        return df
    
    

    # Remove rows where 'time_in_station' is greater than 5 minutes
    df = df[df['time_in_station'] <= max_time / 60]

    return df

In [72]:
plot = 0
display_df = 0

df_in_1 = time_in_station_fun(df1)
df_in_2 = time_in_station_fun(df2)
df_in_3 = time_in_station_fun(df3)

if display_df:
    display(df_in_1)
    display(df_in_2)
    display(df_in_3)

if plot:
    plot_time_in_station(df_in_1,1)
    plot_time_in_station(df_in_2,2)
    plot_time_in_station(df_in_3,3)



In [73]:


def time_between_station(df1, df2, station1, station2, max_time):
    """
    Finds and records the time difference between occurrences of the same 'gabarit' in two DataFrames,
    df1 and df2, within a specified time window.

    Parameters:
        - df1: DataFrame containing timestamps and 'gabarit' values for station1.
        - df2: DataFrame containing timestamps and 'gabarit' values for station2.
        - station1: Identifier for the first station.
        - station2: Identifier for the second station.
        - max_time: Maximum time window to search for matching occurrences (in minutes).
    
    Returns:
        DataFrame with columns 'gabarit', 'ts' (timestamp), and 'time_bet_station1_station2'
        representing the time difference (in minutes) between corresponding occurrences in
        station1 and station2 within the specified time window.
    """
    
    df_bet = pd.DataFrame(columns=['gabarit', f'time_bet_{station1}_{station2}'])
    df_bet['ts'] = pd.NaT  # initialize as datetime type

 
    if (station1 != 3 and station2 != 4): ##1 2 or 2 3 
        column_order = ['ts', 'gabarit', f'time_bet_{station1}_{station2}']
        df_bet = df_bet[column_order]
        df_bet = find_matching_gabarit(df1, df2, station1, station2, max_time, df_bet)
    else:
        df_bet['model'] = np.nan
        column_order = ['ts', 'gabarit', 'model', f'time_bet_{station1}_{station2}']
        df_bet = df_bet[column_order]
        df_bet = find_matching_gabarit_final(df1, df2, df_bet)

    return df_bet


def find_matching_gabarit(df1,df2,station1, station2, max_time, df_bet):

    cnt = 0
            
    for index in range(df1.shape[0]):
        start_time = df1.loc[index, 'ts']
        end_time = start_time + pd.Timedelta(minutes=max_time)
        matching_rows = df2[(df2['ts'] >= start_time) & 
                            (df2['ts'] <= end_time) 
                                & (df2['gabarit'] == df1.loc[index,'gabarit'])]
        if not matching_rows.empty:
            first_occurrence = matching_rows.iloc[0]
            
            # df1: initial time stamp, time_in_station_1
            # df2: initial time stamp, time_in_station_2
            if (station1 == 1 and station2 == 2):
                time_bet = (first_occurrence['time_in_station'] +
                        (first_occurrence['ts'] - df1.loc[index, 'ts']).total_seconds()) # in seconds
            else:
                time_bet = ((first_occurrence['ts'] - df1.loc[index, 'ts'] # in seconds
                            - pd.Timedelta(seconds = df1.loc[index, 'time_in_station'])).total_seconds())

                #df1: 10:06:00 17
                #     10:07:30 17
                #df2: 10:07:10 17
            #    time_bet = (df1.loc[index, 'time_in_station'] + first_occurrence['time_in_station'])

            df_bet.loc[cnt, 'ts'] = df1.loc[index, 'ts']  # Copy the timestamp for reference
            df_bet.loc[cnt, 'gabarit'] = first_occurrence['gabarit']
            df_bet.loc[cnt, f'time_bet_{station1}_{station2}'] = time_bet #round,2
            cnt += 1
        
    return df_bet

def find_matching_gabarit_final(df1, df2, df_bet):

    cnt = 0
            
    for index in range(df1.shape[0]):
        start_time = (df1.loc[index, 'ts'] + pd.Timedelta(seconds = df1.loc[index, 'time_in_station'])
                      + pd.Timedelta(seconds=20)) # not possible to have a test that takes less than 20 seconds
        matching_rows = df2[(df2['Stamp'] >= start_time)]


        if not matching_rows.empty:
            first_occurrence = matching_rows.iloc[0]
            
            # df1: initial time stamp, time_in_station_1
            # df2: initial time stamp, time_in_station_2
            time_bet = ((first_occurrence['Stamp'] - df1.loc[index, 'ts']).total_seconds()) # in minutes

            df_bet.loc[cnt, 'ts'] = df1.loc[index, 'ts']  # Copy the timestamp for reference
            df_bet.loc[cnt, 'gabarit'] = df1.loc[index,'gabarit']
            #print(cnt)
            #print(type(first_occurrence['Model']))
            #print(first_occurrence['Model'])
            df_bet.loc[cnt, 'model'] = str(first_occurrence['Model'])
            df_bet.loc[cnt, f'time_bet_3_4'] = time_bet
            cnt += 1
        
    return df_bet



display_df_ = 0

df_bet_12 = time_between_station(df_in_1, df_in_2, 1, 2, max_time = 10)
df_bet_23 = time_between_station(df_in_2, df_in_3, 2, 3, max_time = 20)
df_bet_34 = time_between_station(df_in_3, df4, 3, 4, max_time = 0)

if display_df_:
    display(df_bet_12)
    display(df_bet_23)
    display(df_bet_34)






Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value 'HAMLET-49' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.



In [79]:

def all_time_between_station(df12, df23, df34):
    #df_bet_all = pd.DataFrame(columns = ['ts1','time_bet_12','ts2', 'ts2','time_bet_23','ts3','time_bet_34','gabarit','model'])
    df_bet_all = pd.DataFrame(columns = ['ts','gabarit', 'model','time_bet_1_2','time_bet_2_3','time_bet_3_4']) 
    
    cnt = 0
    cnt2 = 0
    for index in range(df12.shape[0]):
        start_time = (df12.loc[index, 'ts'] + pd.Timedelta(seconds = df12.loc[index, 'time_bet_1_2'])
                       - pd.Timedelta(seconds=2))
        end_time = start_time + pd.Timedelta(seconds=3)

        matching_rows = df23[(df23['ts'] >= start_time) & (df23['ts'] <= end_time) 
                                & (df23['gabarit'] == df12.loc[index,'gabarit'])]
        if not matching_rows.empty:
            first_occurrence = matching_rows.iloc[0]

            df_bet_all.loc[cnt, 'ts'] = df12.loc[index, 'ts']  # Copy the timestamp for reference
            df_bet_all.loc[cnt, 'gabarit'] = first_occurrence['gabarit']
            df_bet_all.loc[cnt, 'time_bet_1_2'] = df12.loc[index, 'time_bet_1_2'] 
            df_bet_all.loc[cnt, 'time_bet_2_3'] = first_occurrence['time_bet_2_3'] 
            cnt += 1

            start_time2 = (first_occurrence['ts'] + pd.Timedelta(seconds = first_occurrence['time_bet_2_3']) 
                          - pd.Timedelta(seconds=1))
            end_time2 = start_time2 +   pd.Timedelta(seconds=3)
            matching_rows2 = df34[(df34['ts'] >= start_time2) & (df34['ts'] <= end_time2) 
                                & (df34['gabarit'] == first_occurrence['gabarit'])]
            
            if not matching_rows2.empty:
                first_occurrence2 = matching_rows2.iloc[0]
           
                df_bet_all.loc[cnt2, 'time_bet_3_4'] = first_occurrence2['time_bet_3_4']
                #print(first_occurrence2)
                df_bet_all.loc[cnt2, 'model'] = str(first_occurrence2['model'])
  
                cnt2 += 1

    return df_bet_all


def plot_time_in_station(df):
    #display(px.data.medals_wide())

    df['time_bet_1_2'] = df['time_bet_1_2'].astype(int)
    df['time_bet_2_3'] = df['time_bet_2_3'].astype(int)
    df['time_bet_3_4'] = df['time_bet_3_4'].astype(int)
    melted_df = pd.melt(df, id_vars=['ts','model'], value_vars=['time_bet_1_2','time_bet_2_3','time_bet_3_4'],var_name='time_bet', value_name='time')
    

    melted_df['color'] = melted_df['model'] + '_' + melted_df['time_bet']
    color_discrete_map = {"HAMLET-49_time_bet_1_2": 'darkblue',
                          "HAMLET-49_time_bet_2_3": 'cornflowerblue', 
                          "HAMLET-49_time_bet_3_4": 'lightblue',
                          "SPT120018W_time_bet_1_2": 'maroon',
                          "SPT120018W_time_bet_2_3":'red',
                          "SPT120018W_time_bet_3_4": 'lightcoral'}

    #display(melted_df)
    melted_df["time"] = round(melted_df["time"]/60,2)
    fig = px.bar(melted_df, x="ts", y="time", 
                labels={"ts": "Timestamp", "time": "Time (minutes)"},
                color = 'color',
                color_discrete_map = color_discrete_map,
                title="Time Between Stations")
    #fig.update_xaxes(range=['2023-09-25 11:00:00', '2023-09-25 15:00:00'])
    #fig.update_traces(width=10)


    fig.show()




df_bet_all = all_time_between_station(df_bet_12, df_bet_23,df_bet_34)
perc_all = perc_of_gabarits_found(df_bet_all,df4, sensor = 4)

#display(df_bet_all)
plot_time_in_station(df_bet_all)



#plot_time_
    

Percentage of gabarits scanned in all sensors: 7.52%



The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result

