# Imports

In [27]:
import os
import pandas as pd

# !!! CHANGE !!!
# Original code:
#  N/A
# Reason for change:
#  Kirsten never imported warnings but uses it in the code,
#  and I need re for later pattern-matching.
import warnings
import re

### Functions

In [78]:
def reorder_dataframe_by_timestamp(df):
  """
  This function reorders a dataframe based on timestamps in the "file" column.

  Args:
      df: A pandas dataframe.

  Returns:
      A new dataframe reordered by timestamps (or the original dataframe with warnings if parsing fails).
  """
  def extract_timestamp(filename):
    try:
      # Extract timestamp part (assuming format YYYY-MM-DD_Xhhmm.ss.fff.log)
      
      # !!! CHANGE !!!
      # Original code:
      #   return filename.split("_")[-1].split(".")[0]
      # Reason for change:
      #   This doesn't actually return the format that Kirsten specified.
      #   The underscore between day and hour means it will only return the
      #   hour and minute.
      return re.search(r'\d+-\d+-\d+_[\dh\.]+(?=\.\d+\.log$)', filename).group()
    
    except (IndexError, ValueError):
      warnings.warn(f"Failed to parse timestamp from filename {filename}.")
      return None  # Indicate parsing failure

  # Extract timestamp with informative error handling
  df['timestamp'] = df['file'].apply(extract_timestamp)

  # Try converting timestamps to datetime format (handle potential errors)
  try:
    from datetime import datetime
    df['timestamp'] = pd.to_datetime(df['timestamp'], format="%Y-%m-%d_%Hh%M.%S")
  except ValueError:
    # If conversion fails for all timestamps, issue a general warning
    warnings.warn("Failed to convert all timestamps to datetime format. Check filename format.")
    print("Note: not all timestamp column vals be converted to datetime format.")
    pass
    
  # Sort dataframe by timestamp (dropna removes rows with missing timestamps)
  return df.sort_values(by='timestamp').dropna(subset=['timestamp'])


def is_list_in_order(number_list):
  """
  This function checks if a list of numbers is in order (1 to n).

  Args:
      number_list: A list of integers.

  Returns:
      True if the list is in order, False otherwise.
  """
  # Check if the list is empty or has only one element
  if len(number_list) <= 1:
    return True

  # Convert the list to integers
  number_list = [int(num) for num in number_list]

  # Check if the list is in ascending order
  return all(a <= b for a, b in zip(number_list, number_list[1:]))


def has_duplicates(number_list):
  """
  This function checks if a list contains duplicate items.

  Args:
      number_list: A list of any data type.

  Returns:
      True if there are duplicates, False otherwise.
  """
  return len(set(number_list)) != len(number_list)

### Load and check log files

In [79]:
path     = 'behavioral_data'
sub_dirs = os.listdir(path)

Load the log files and times when they were collected

Report any subjects with unusual number or order of runs


In [80]:
ordered_subs = []; other_subs = {'subject':[], 'behavioral_runs':[]}
disregard = {'subject':[], 'file':[]}

for sub in sub_dirs:
    
    add_to_dict  = False
    
    log_files    = [ path + '/' + sub + '/' + x for x in os.listdir(path + '/' + sub) if '.log' in x ]
    log_data     = [ pd.read_table(x,header=None) for x in log_files ] 
    new_log_data = []
    
    for d,f in zip(log_data,log_files):
        # !!! CHANGE !!!
        # Original code:
        #  if d[d[2].str.contains('.csv')].shape[0] > 0:
        # Reason for change:
        #  Clunky & inefficient, and the code below does the same thing.
        if d[2].str.contains('.csv').any():
            
            # !!! CHANGE !!!
            # Original code:
            #  run       = list(d[d[2].str.contains('.csv')][2].str.split(' '))[0][1][-5]
            # Reason for change:
            #  Clunky.
            import_statement = list(d[d[2].str.contains('.csv')][2])[0]
            run = re.search(r'(?<=movie_stim_)\d(?=\.csv)', import_statement).group()
            
            # !!! CHANGE !!!
            # Original code:
            #  d['file'] = f[27:]
            # Reason for change:
            #  Sometimes cuts off parts of files, leading to 'ubject...' filenames
            d['file'] = f.split('/')[-1]
            
            d['run' ] = run
            new_log_data.append(d)
            
        else:
            print(sub + ' has an early-stop log file'); 
            print('disregarding: '+ f); print()
            disregard['file'].append(f)
            disregard['subject'].append(sub)
    
    
    df = pd.concat(new_log_data)
    ordered_df = reorder_dataframe_by_timestamp(df)
    grouped_df = ordered_df.groupby(['timestamp','run'],as_index=False).count()
    ordered_runs = list(grouped_df['run'])
    
    if has_duplicates(ordered_runs):
        print(); print(sub+ ' has unusual order')
        print(ordered_runs); print()
        add_to_dict = True
    
    elif not is_list_in_order(ordered_runs):
        print(); print(sub+ ' has unusual order')
        print(ordered_runs); print()
        add_to_dict = True
        
    else: 
        ordered_subs.append(sub)
        
    if add_to_dict:
        
        other_subs['subject'].append(sub)
        other_subs['behavioral_runs'].append(ordered_runs)
        

subject_4 has an early-stop log file
disregarding: behavioral_data/subject_4/subject_4c_run_1_2023-08-24_13h14.00.218.log


subject_1 has unusual order
['1', '2', '3', '4', '4', '5', '6']


subject_9 has unusual order
['1', '2', '3', '4', '5', '6', '1', '2', '3', '4', '5', '6']

subject_12 has an early-stop log file
disregarding: behavioral_data/subject_12/subject_12_run_1_2023-09-01_14h48.28.289.log


subject_12 has unusual order
['1', '2', '3', '3', '5', '6', '4']

subject_20 has an early-stop log file
disregarding: behavioral_data/subject_20/subjec_20b_run_1_2023-11-02_17h48.21.836.log



In [55]:
odf = pd.DataFrame(other_subs)
odf

Unnamed: 0,subject,behavioral_runs


In [8]:
for idx,r in odf.iterrows():
    r.to_csv(r['subject']+'_unusual_run_order.csv')

In [9]:
ddf = pd.DataFrame(disregard)
ddf

Unnamed: 0,subject,file
0,subject_4,behavioral_data/subject_4/subject_4c_run_1_202...
1,subject_12,behavioral_data/subject_12/subject_12_run_1_20...
2,subject_20,behavioral_data/subject_20/subjec_20b_run_1_20...


In [10]:
for idx,r in ddf.iterrows():
    r.to_csv(r['subject']+'_disregard_files.csv')