In [None]:
%load_ext autoreload
%autoreload 2
#required packages
import os
import glob
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
from scipy import signal, stats
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from datetime import datetime
from tqdm import tqdm
#my funcs
from ecog_functions import *

In [18]:
#global params
# Sampling rate handed to signal.periodogram (fs) and filter_channel (sr).
# The processing loop builds its time index with a 1 ms step, i.e. 1000 samples/s.
SAMPLE_RATE = 1000
# When True, the processing loop prints per-file progress and skip reasons.
VERBOSE = False
# When False, a recording is skipped if one of its result pickles already exists.
OVERWRITE = False
# Suffix appended to the stem of every result file name.
DATA_SUFFIX = 'spectral_final'
# Directory that receives the result pickles.
# NOTE(review): hard-coded absolute, machine-specific path - consider making it configurable.
SAVING_PATH = r'C:\Users\marty\Projects\sleep_new\res_temp'

In [19]:
#analysis function 
# remember to apply to a single channel and:
# only if subset len > 5 s for sleep, == 5 min for time
def get_spectral_params(subset, sr = SAMPLE_RATE):
    """Summarise the periodogram of a single-channel signal.

    Parameters
    ----------
    subset : array-like
        Samples of one channel; passed straight to ``signal.periodogram``.
    sr : number
        Sampling frequency forwarded as ``fs``. Defaults to ``SAMPLE_RATE``.

    Returns
    -------
    tuple
        ``(maxpow, meanpow, maxpowf, meanpowf)``: the largest periodogram
        value, the mean periodogram value, and the frequencies of the bins
        whose power is closest to the maximum and to the mean respectively
        (the first such bin when several are equally close).
    """
    freqs, power = signal.periodogram(subset, fs = sr)
    peak_power = np.max(power)
    average_power = np.mean(power)

    def _freq_nearest(level):
        #frequency of the bin whose power differs least from `level`
        return freqs[np.argmin(np.abs(power - level))]

    return peak_power, average_power, _freq_nearest(peak_power), _freq_nearest(average_power)

In [20]:
#load sorted sleep states
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
# NOTE: the name `sorted` shadows the Python builtin; it is kept because the
# processing loop further down refers to the table by this name.
with open('./res_temp/sleep_sorted_data4.pkl', 'rb') as sleep_file:
    sorted = pickle.load(sleep_file)
#keep the ID and sleep columns (the index is left untouched); take a copy so the
#column assignment below writes to an independent frame rather than a slice
sorted = sorted[['id','sleep']].copy()
#fix IDs: drop the trailing underscore from the affected labels, then cast to int
rename_id = {'5_':'5','6_':'6','7_':'7','8_':'8'}
sorted['id'] = sorted['id'].replace(rename_id).astype(int)

In [21]:
#paths - single hour
# Raw recordings are collected from two directory trees; each pattern expects
# <root>\<animal folder>\timestamped\<file>.txt
EXTRA_PATHS = r"C:\Users\marty\Documents\DATA\water_soluble_cbn_sleep\*\timestamped\*.txt"
PATHS = glob.glob(r"G:\DATA\tween_cbn_sleep\*\timestamped\*.txt") + glob.glob(EXTRA_PATHS)

In [22]:
# IDs (parsed from the recording path) whose files the processing loop skips.
to_exclude = [41, 51]

In [23]:
#set days so not all data is read
# Each entry is compared with the first 8 characters of a recording's file name.
# Fixed: a missing comma used to fuse '20232905' and '20233005' into a single
# 16-character string, so recordings from those two days were silently skipped.
valid_days = ['20241703','20241803','20241903','20242003','20242103','20242203','20242303','20242403','20242503','20242603','20242703','20242803','20242903','20243003','20240104','20240204','20240304','20240404','20240504','20240604',
              '20242202','20242302','20242402','20242502','20242602','20242702','20242802','20242902','20240103','20240203','20240303','20240403','20240503', '20241103','20241203','20241303','20241403','20241503',
              '20230711','20230811','20230911','20231011','20231111','20231211','20231311','20231411','20231511','20231611','20231711','20231811','20231911', '20230112','20230212','20230312','20230412',
              '20232912','20233012','20233112','20240101','20240201','20240301','20240401','20240501','20240601','20240801','20240901','20241001','20241101','20241201','20241301','20241401','20241501','20242001','20242101','20242201','20242301','20242401','20242501','20242601',
              '20230504','20230604','20230704','20230804','20230904','20231004','20231104','20232705','20232805','20232905','20233005','20233105','20230106','20230206','20230306','20230406','20230506','20230606','20230706','20230806','20230906','20231006','20232206', '20232306', '20232406', '20232506', '20232606']


In [None]:
# Main processing loop: one iteration per raw recording file in PATHS.
# A file that passes the checks below is filtered channel by channel, joined
# with the scored sleep states and summarised twice with get_spectral_params:
#   * per 5-minute bin            -> <savepath>_time.pkl
#   * per continuous sleep state  -> <savepath>_state.pkl
channels = ['l_ecog','r_ecog','lfp','emg']
for path in tqdm(PATHS):
    #the ID is the third-from-last component of the backslash-separated path
    id = int(path.split('\\')[-3])
    #check if ID is included
    if id in to_exclude:
        continue

    #check if path is in days (first 8 characters of the file name)
    if path.split('\\')[-1][:8] not in valid_days:
        continue

    #read file header to get the recording start
    try:
        info = pd.read_csv(path, nrows=8, header = None)
        #NOTE: str.strip removes any of the listed characters from both ends,
        #it does not remove the literal prefix
        day = info.loc[0].str.strip('date:').astype(str)[0]
        hour = info.loc[1].str.strip('time:').astype(str)[0]
        date = pd.to_datetime(day + hour)
    except Exception:
        #best effort: an unreadable header skips the file (Ctrl-C still interrupts)
        if VERBOSE:
            print(f'file {path} read failed!')
        continue

    #now check if savepath exists
    date_save = str(date).replace(' ','_').replace(':','_')
    #explicit '\\' instead of the invalid escape sequence '\{'; same resulting string
    savepath = SAVING_PATH+f'\\{id}_{date_save}_'+DATA_SUFFIX
    if not OVERWRITE:
        if os.path.isfile(savepath+'_state.pkl') or os.path.isfile(savepath+'_time.pkl'):
            if VERBOSE:
                print(f'{savepath} exists, skipping')
            continue

    #read the samples: one space-separated string per row, split into columns
    try:
        data = pd.read_csv(path, skiprows=9, header=None)
        data = data.loc[:,0].str.split(' ', expand = True)
    except Exception:
        if VERBOSE:
            print(f'file {path} read failed!')
        continue

    #now check len of data - keep only if half-hour available from the hour
    #NOTE(review): with a 1 ms step 180000 rows is 3 minutes, not half an hour
    #(that would be 1800000) - confirm which of the two is intended
    if len(data) < 180000:
        if VERBOSE:
            print(f'incomplete rec for {id} {date}')
        continue

    if VERBOSE:
        print(f'CHECKS PASSED, DATA READ: rat {id}, rec {date}')

    #now create time array (one timestamp per row, 1 ms apart)
    time = pd.date_range(start=date, periods=len(data),freq='1ms')
    #subset sleep states by ID and time
    sleep_subset = sorted.loc[(sorted['id'] == id)&sorted.index.isin(time)]
    #remove duplicated indices
    sleep_subset = sleep_subset[~sleep_subset.index.duplicated(keep='first')]

    #now check if there is enough data in sleep states - keep only if 30 mins available
    #(presumably one sleep-state row per second - TODO confirm)
    if len(sleep_subset) < 1800:
        if VERBOSE:
            print(f'incomplete sleep states for {id} {date}')
        continue

    #set index of data
    data.index = time
    #convert to numeric
    try:
        data = convert_to_mv(data)
    except Exception:
        if VERBOSE:
            print(f'CONVERSION ERROR, rat {id}, rec {date}')
        continue
    #filter data: same band for every channel, 50 Hz notch only for the EMG
    filtered_data = pd.DataFrame()
    for channel in channels:
        filtered_data[channel] = filter_channel(data[channel], fstart=0.5, fstop=45, sr=SAMPLE_RATE, center=True, notch_50=(channel == 'emg'))
    filtered_data.index = data.index
    data = filtered_data
    #now join sleep states to data (aligned on the time index)
    data['sleep'] = sleep_subset['sleep']
    #carry each sleep label forward, then drop rows that are still missing
    data = data.ffill().dropna()
    #get 5 min subsets (each timestamp is rounded to the nearest 5-minute mark)
    data['time_adj'] = data.index.round('5min')
    #get sleep state ids: the counter increases whenever the sleep label changes
    data['state_id'] = np.cumsum(np.abs(data['sleep'].replace({'W':0,'N':1,'R':2}).diff().fillna(0)))

    #do analysis by time
    if VERBOSE:
        print(f'ANALYSIS BY TIME STARTED: rat {id}, rec {date}')
    res_by_time = []
    times = data['time_adj'].unique()
    #loop variable renamed so it no longer shadows the `time` index built above
    for time_bin in times:
        #subset data
        data_sub = data.loc[data['time_adj'] == time_bin]
        #only bins with at least 300000 rows are analysed
        if len(data_sub) < 300000:
            if VERBOSE:
                print('not enough data, time subset')
            continue
        for channel in channels:
            maxpow, meanpow, maxpowf, meanpowf = get_spectral_params(data_sub[channel], sr = SAMPLE_RATE)
            #now save to res list
            res_row = pd.DataFrame([id, time_bin, channel, maxpow, meanpow, maxpowf, meanpowf]).T
            res_row.columns = ['id','time','channel','maxpow','meanpow','maxpowf','meanpowf']
            res_by_time.append(res_row)
    if len(res_by_time)>0:
        #context manager so the output file is flushed and closed
        with open(savepath+'_time.pkl', 'wb') as time_file:
            pickle.dump(pd.concat(res_by_time), time_file)
    else:
        print('results not generated')
    print(f'ANALYSIS BY TIME DONE: rat {id}, rec {date}')

    #do analysis by state
    if VERBOSE:
        print(f'ANALYSIS BY STATE STARTED: rat {id}, rec {date}')
    res_by_state = []
    state_ids = data['state_id'].unique()
    for state_id in state_ids:
        #get subset
        data_sub = data.loc[data['state_id']==state_id]
        #check if more than 5 seconds of the state are available
        if len(data_sub) < 5000:
            if VERBOSE:
                print('not enough data, state subset')
            continue
        #if more is available, get results
        for channel in channels:
            maxpow, meanpow, maxpowf, meanpowf = get_spectral_params(data_sub[channel], sr = SAMPLE_RATE)
            #now save to res list
            res_row = pd.DataFrame([id, state_id, data_sub.index[0], data_sub['sleep'].iloc[0], channel, maxpow, meanpow, maxpowf, meanpowf]).T
            res_row.columns = ['id','state_id','time','sleep','channel','maxpow','meanpow','maxpowf','meanpowf']
            res_by_state.append(res_row)
    if len(res_by_state)>0:
        with open(savepath+'_state.pkl', 'wb') as state_file:
            pickle.dump(pd.concat(res_by_state), state_file)
    else:
        print('results not generated')
    print(f'ANALYSIS BY STATE DONE: rat {id}, rec {date}')