### Track analysis of viral particles (influenza), incubated on CHO cells expressing human transferrin receptor

collaboration with Beryl Mazel-Sanchez from Mirko Schmolke's lab

V03, 06/07/2022, Michael Bachmann

In [33]:
# preparation: 

# files from branch tracing (= summary of tracks) and
# files "tracks" (= details about tracks including intensity in 
# different channels) 

# files named with date of experiment in format "yyyymmdd" and split from other information in file name
# by "_"

# identifier of analyzed cell in the file name is a number with maximally 2 digits, separated
# from anything else by a space, and there is no other number in the file name with one or two digits
# (relies on automatic annotation of ImageJ for different scenes of a video; "series 1", "series 2", etc)

# Videos have to be 30 min long (31 time points) with one frame every min

# Here, data are assumed to be in pixel units, not microns. Values will be converted into microns
# based on 1 micron = 7.8604 px (i.e., 1 px = 0.1272 micron)

# Root folder of the analysis. It must contain a subfolder called "all"
# that holds every input file.
directory = '/Users/michaelbachmann/Python/viral particles/final analysis/BFP_HA VLP/'

# Folder with all input files (the "all" subfolder of the root folder).
path_all = f"{directory}all/"

# Bleach-correction factor; it was determined in a separate analysis.
bleach_cor = 0.76

In [34]:
## bunch of functions that are called later

def ident_replicate(file_name):
    """
    Extract the date of the experiment from a file name.

    The file name is split at every '_'. Each part that can be read as an
    integer and is larger than 20000000 is treated as a date in the format
    yyyymmdd. If several parts qualify, the last one is returned, so any
    other large integer in the file name will create problems.

    Input is the name of a file, return value is the date of the experiment
    as integer, or 0 if no part of the file name qualifies as a date.
    """

    exp_date = 0  # stays 0 when no date is found

    for part in file_name.split('_'):
        try:
            candidate = int(part)
        except ValueError:
            # this part of the file name is not a number
            continue

        if candidate > 20000000:
            exp_date = candidate

    return exp_date

def ident_cell (file_name):
    """
    Identify the cell from the "series ..." information in the file name.

    The file name is split at every space. A part counts as the cell number
    if it has at most two characters and can be read as a non-negative
    integer. If several parts qualify, the last one is returned.

    Input is the name of a file, return is the cell number as written in the
    file name (as integer), or the string 'NA' if no cell number was found.
    """

    # 'NA' indicates a problem: no cell number could be found
    cell_number = 'NA'

    for part in file_name.split(' '):  # parts are separated by spaces
        if len(part) >= 3:
            continue

        try:
            candidate = int(part)
        except ValueError:
            # short part of the file name that is not a number
            continue

        if candidate >= 0:
            cell_number = candidate

    return cell_number

def is_file_empty_3(file_name):
    """ Tell whether a file has no content at all.

    The file is opened for reading and a single character is requested.
    Return is True if nothing could be read (empty file) and False if the
    file already contains at least one character.
    """
    with open(file_name, 'r') as handle:
        first_char = handle.read(1)

    # an empty string means that the end of the file was reached immediately
    return first_char == ''
    

def create_sum (file_sum_path, file_sum_name, counter, replicate_id, cell_tracker, min_distance=0, min_veloc=0,
                px_per_micron=7.8604):
    """
    Read data from a summary (branch) file and create a new file with a reduced amount of information.

    The output file is named like file_sum_name with the last four characters (the extension) replaced
    by '_output.tsv'. It is tab separated and contains the columns replicate, cell, ID, track duration,
    track distance and track velocity. Because the output name is derived from file_sum_name, the file
    is created relative to the current working directory unless file_sum_name contains a directory.

    Inputs:
    file_sum_path -- path of the comma-separated input file; the first 4 lines are skipped as header
    file_sum_name -- name of the input file, used to build the name of the output file
    counter       -- not used anymore, only kept so that existing calls still work
    replicate_id  -- date of experiment as ID for independent experiments
    cell_tracker  -- cell number, obtained with function 'ident_cell'
    min_distance  -- optional, tracks with a smaller distance are discarded (condition is '>=')
    min_veloc     -- optional, tracks with a smaller velocity are discarded (condition is '>=')
    px_per_micron -- optional scaling factor; distance and velocity are divided by it before saving

    min_distance and min_veloc are compared to the values as they are in the input file, i.e.,
    before the division by px_per_micron.

    Empty lines in the input file are skipped.
    """

    #output file will be named as input file with "output" added and format will be tsv instead of csv
    file_sum_out = file_sum_name[0:-4] + '_output.tsv'

    with open (file_sum_path, 'r') as f_in, open (file_sum_out, 'w') as f_out:

        #printing header for output file
        print ('replicate', 'cell', 'ID', 'track duration (min)',
               'track distance (micron)', 'track velocity (micron/min)',
               sep='\t', file=f_out)

        #skipping the 4 header lines in the input file
        for _ in range(4):
            f_in.readline()

        #read through lines and gather needed data for output file
        for line in f_in:
            stripped = line.strip()
            if not stripped:
                #an empty line (e.g., at the end of the file) has no data
                continue

            data = stripped.split(',')
            track_id = int(data[1])
            track_duration = float(data[4])
            track_distance = float(data[5])
            track_veloc = float(data[6])

            if (track_distance >= min_distance) and (track_veloc >= min_veloc):
                #division of distance and velocity because of scaling factor between px and micron
                print (replicate_id, cell_tracker, track_id, track_duration,
                       (track_distance/px_per_micron), (track_veloc/px_per_micron),
                       sep='\t', file=f_out)
            
def create_detail (file_detail_path, file_detail_name, counter, replicate_id,
                   bleach_correction, cell_tracker, track_length=1, last_frame=30):
    """
    Read tracking data from TrackMate file. Problem: data isn't sorted according to time frame for given
    track

    This function reads through the file and collects all lines with the same track ID (column 2) in a
    dictionary with the time frame (column 8) as key and a list of the whole lines as value. Lines of one
    track have to follow each other directly in the file. As soon as a line with a different track ID is
    reached (or the end of the file), the dictionary is sorted according to the frame and handed over
    to 'sorting_lines' and 'transposed_lines', which write the data into the output files.

    A track that is still present in frame last_frame is saved as "no entry", all other tracks are saved
    as "entry".

    Five files are written. Four of them are named like file_detail_name with the last four characters
    replaced by '_output_entry_bleach-cor.tsv', '_output_NOentry_bleach-cor.tsv',
    '_output_entry_trans_median2.tsv' and '_output_NOentry_trans_median2.tsv'; these are overwritten.
    In addition, one line per saved track is appended to 'entry_vs_no-entry_file.tsv' in the current
    working directory (header is only written if this file is still empty).

    Inputs:
    file_detail_path  -- path of the comma-separated input file; the first 4 lines are skipped as header
    file_detail_name  -- name of the input file, used to build the names of the output files
    counter           -- not used anymore, only kept so that existing calls still work
    replicate_id      -- date of experiment as ID for independent experiments
    bleach_correction -- value for bleach correction that was identified independently
    cell_tracker      -- cell number, obtained with function 'ident_cell'
    track_length      -- optional, minimum number of different frames of a track (condition is '>=')
    last_frame        -- optional, frame that decides about "entry" vs "no entry"

    Empty lines in the input file are skipped.
    """

    # create file names for the general file and the one with transposed values for one feature
    # an additional file in append mode is created to list all tracks according to "no entry" (= track
    # still present in last frame) or "entry" (not present in last frame)
    name_stem = file_detail_name[0:-4]
    f_d_o_e = name_stem + '_output_entry_bleach-cor.tsv'
    f_d_o_ne = name_stem + '_output_NOentry_bleach-cor.tsv'
    f_d_o_e_t = name_stem + '_output_entry_trans_median2.tsv'
    f_d_o_ne_t = name_stem + '_output_NOentry_trans_median2.tsv'
    e_vs_ne_output = 'entry_vs_no-entry_file.tsv'

    detail_header = ('replicate', 'cell', 'ID', 'frame', 'median int ch 1', 'median int ch 1 corrected',
                     'median int ch 2', 'median int ch 2 corrected', 'median int ch 3',
                     'median int ch 3 corrected', 'contrast ch 1', 'contrast ch 2', 'contrast ch 3')
    trans_header = ('ID', 'cell', 'track', 'median int 2 ch 2 corrected')

    with open (file_detail_path, 'r') as f_in, \
         open (f_d_o_e, 'w') as f_out_e, \
         open (f_d_o_ne, 'w') as f_out_ne, \
         open (f_d_o_e_t, 'w') as f_out_e_t, \
         open (f_d_o_ne_t, 'w') as f_out_ne_t, \
         open (e_vs_ne_output, 'a') as f_out_entry:

        # print header in new files with columns we need
        print (*detail_header, sep='\t', file=f_out_e)
        print (*detail_header, sep='\t', file=f_out_ne)
        print (*trans_header, sep='\t', file=f_out_e_t)
        print (*trans_header, sep='\t', file=f_out_ne_t)

        # for entry/no entry file which is in append mode: write header only in empty file
        if is_file_empty_3(e_vs_ne_output):
            print ('replicate', 'cell', 'ID', 'entry vs no entry', sep='\t', file=f_out_entry)

        def save_track (track_id, frames):
            """Sort the collected lines of one track and write them into the matching output files."""
            #nothing collected yet, or track is too short
            if not frames or len(frames) < track_length:
                return

            #actual sorting according to keys which is the number of the frame in this dic
            frames_sorted = dict(sorted(frames.items()))

            #decide if track counts as entering track or not based on existence of the last frame
            if last_frame in frames_sorted:
                detail_out, trans_out, label = f_out_ne, f_out_ne_t, 'no entry'
            else:
                detail_out, trans_out, label = f_out_e, f_out_e_t, 'entry'

            sorting_lines(detail_out, frames_sorted, replicate_id, cell_tracker, bleach_correction)
            transposed_lines(trans_out, frames_sorted, replicate_id, cell_tracker, bleach_correction)
            print(replicate_id, cell_tracker, track_id, label, sep='\t', file=f_out_entry)

        #skip the 4 header lines in input file
        for _ in range(4):
            f_in.readline()

        #ID of the track that is collected at the moment (None = no track started yet)
        current_id = None

        #dictionary that holds all lines from the same track ID; frame is used as key, a list of
        #whole lines is the value
        dic_tracks = {}

        for line in f_in:
            stripped = line.strip()
            if not stripped:
                #an empty line (e.g., at the end of the file) has no data
                continue

            data = stripped.split(',')
            tr_id = int(data[2])
            tr_frame = int(data[8])

            #a different track ID means that the track before is complete: save it and start a new
            #dictionary. This also works if the new track starts with the last line of the file.
            if tr_id != current_id:
                save_track(current_id, dic_tracks)
                dic_tracks = {}
                current_id = tr_id

            dic_tracks.setdefault(tr_frame, []).append(line)

        #the last track has no following line with a new ID, so it is saved here
        save_track(current_id, dic_tracks)
                
            
    
    
def sorting_lines (file_name, dictionary, experiment, counter_cell, bleach):
    """
    This function reads the data from the dictionary that was sorted before according to frame (=key of dic).
    Data are then saved in a file, one row per key in the order of the dictionary.

    Input values are:
    file_name    -- the handle of the output file (from "with open" function)
    dictionary   -- the sorted dictionary; every value is a comma-separated line or a list of such
                    lines. If a list has more than one line, only the first line is used.
    experiment   -- the replicate id
    counter_cell -- the cell number
    bleach       -- the value for bleach correction

    Columns used from each line: 2 (track ID), 8 (frame), 13/19/25 (median intensity channel 1/2/3)
    and 32/34/36 (contrast channel 1/2/3).

    Every median is saved twice: as it is and after bleach correction, calculated as
    median + median * (frame - 1) * (1 - bleach) / 29
    NOTE(review): frame - 1 and 29 look like they expect frames from 1 to 30 - confirm for videos with
    frames 0 to 30.
    """

    #increase of the correction per frame, the same for all lines
    correction_per_frame = (1 - bleach) / 29

    for key in dictionary:
        entry = dictionary[key]
        #values are normally lists of lines; the line itself is parsed, not the text form of the list
        line = entry[0] if isinstance(entry, (list, tuple)) else entry
        data = line.strip().split(',')

        tr_id = int(data[2])
        tr_frame = int(data[8])
        tr_median_1 = float(data[13])
        tr_median_2 = float(data[19])
        tr_median_3 = float(data[25])
        tr_contrast_1 = float(data[32])
        tr_contrast_2 = float(data[34])
        tr_contrast_3 = float(data[36])

        correction = (tr_frame - 1) * correction_per_frame

        print(experiment, counter_cell, tr_id, tr_frame,
              tr_median_1, (tr_median_1 + tr_median_1 * correction),
              tr_median_2, (tr_median_2 + tr_median_2 * correction),
              tr_median_3, (tr_median_3 + tr_median_3 * correction),
              tr_contrast_1, tr_contrast_2, tr_contrast_3,
              sep = '\t', file = file_name)


def transposed_lines (file_name, dictionary, experiment, counter_cell, bleach):
    """
    This function writes one row per track that contains all bleach-corrected values of the median
    intensity of channel 2 (column 19). Values are in the order of the dictionary, i.e., sorted according
    to ascending frame number from left to right if the dictionary was sorted before. Intended to help
    with analysis in excel etc.

    Input values are:
    file_name    -- the handle of the output file (from "with open" function)
    dictionary   -- the sorted dictionary; every value is a comma-separated line or a list of such
                    lines. If a list has more than one line, only the first line is used.
    experiment   -- the replicate id
    counter_cell -- the cell number
    bleach       -- the value for bleach correction

    The row starts with replicate id, cell number and track ID (column 2, taken from the last entry of
    the dictionary), followed by the corrected values. Every field, including the last one, is followed
    by a tab. Bleach correction is calculated as
    median + median * (frame - 1) * (1 - bleach) / 29
    with the frame from column 8.

    Nothing is written if the dictionary is empty.
    """

    #without any line there is no track ID and no value to write
    if not dictionary:
        return

    #increase of the correction per frame, the same for all lines
    correction_per_frame = (1 - bleach) / 29

    tr_id = None
    corrected_values = []

    for key in dictionary:
        entry = dictionary[key]
        #values are normally lists of lines; the line itself is parsed, not the text form of the list
        line = entry[0] if isinstance(entry, (list, tuple)) else entry
        data = line.strip().split(',')

        tr_id = int(data[2])
        tr_frame = int(data[8])
        tr_median_2 = float(data[19])

        #correct median 2 value for bleaching
        corrected_values.append(tr_median_2 + tr_median_2 * ((tr_frame - 1) * correction_per_frame))

    #everything in one line; the tab before the line break keeps the format of earlier output files
    print (experiment, counter_cell, tr_id, *corrected_values, sep='\t', end='\t\n', file=file_name)


In [35]:
def write_overview (file_path_overview, file_path_out_overview):
    """
    Append the content of one "branching" output file to an overview file.

    The first line of the input file is skipped. Every other line is split at tabs; the first three
    fields are read as integers and the next three as numbers with decimal point before the six values
    are written as one tab-separated row. The header is only written if the overview file is still empty.

    Input is file path of input (file_path_overview) and output file (file_path_out_overview).
    """

    column_titles = ('replicate', 'cell', 'ID', 'track duration (min)',
                     'track distance (micron)', 'track velocity (micron/min)')

    # one converter per column, applied from left to right
    converters = (int, int, int, float, float, float)

    with open (file_path_overview, 'r') as source, open (file_path_out_overview, 'a') as sink:

        # header only for a new (still empty) overview file
        if is_file_empty_3(file_path_out_overview):
            sink.write('\t'.join(column_titles) + '\n')

        # skip header of input file
        source.readline()

        for row in source:
            fields = row.strip().split('\t')
            values = [convert(fields[position]) for position, convert in enumerate(converters)]
            print (*values, sep='\t', file=sink)
            
def write_overview_tracks_trans (file_path_overview, file_path_out_overview):
    """
    Append the content of one transposed "tracking" output file to an overview file.

    The first line of the input file is skipped. Every other line is stripped, split at tabs and
    written again with a tab after every field (also after the last one). The header is only written
    if the overview file is still empty.

    Input is file path of input (file_path_overview) and output file (file_path_out_overview).
    """

    with open (file_path_overview, 'r') as source, open (file_path_out_overview, 'a') as sink:

        # header only for a new (still empty) overview file
        if is_file_empty_3(file_path_out_overview):
            sink.write('\t'.join(('ID', 'cell', 'track', 'median int 2 ch 2 corrected')) + '\n')

        # skip header of input file
        source.readline()

        for row in source:
            fields = row.strip().split('\t')
            sink.write(''.join(f"{field}\t" for field in fields))
            sink.write('\n')
            
def write_overview_tracks_trans_firstlast (file_path_overview, file_path_out_overview):
    """
    Append a shortened version of one transposed "tracking" output file to an overview file.

    The first line of the input file is skipped. From every other line only the first six and the
    last three tab-separated fields are written, each followed by a tab. The header is the same as for
    the complete transposed overview and is only written if the overview file is still empty.

    Input is file path of input (file_path_overview) and output file (file_path_out_overview).
    """

    # positions of the fields that are kept: first six and last three
    kept_positions = (0, 1, 2, 3, 4, 5, -3, -2, -1)

    with open (file_path_overview, 'r') as source, open (file_path_out_overview, 'a') as sink:

        # header only for a new (still empty) overview file
        if is_file_empty_3(file_path_out_overview):
            sink.write('\t'.join(('ID', 'cell', 'track', 'median int 2 ch 2 corrected')) + '\n')

        # skip header of input file
        source.readline()

        for row in source:
            fields = row.strip().split('\t')
            kept = [fields[position] for position in kept_positions]
            sink.write('\t'.join(kept) + '\t')
            sink.write('\n')
            

            
def write_overview_tracks (file_path_overview, file_path_out_overview):
    """
    Append two columns of one "tracking" output file to an overview file.

    The first line of the input file is skipped. From every other line the first field (read as
    integer) and the eighth field (read as number with decimal point) are written as one
    tab-separated row. The header is only written if the overview file is still empty.

    Input is file path of input (file_path_overview) and output file (file_path_out_overview).
    """

    with open (file_path_overview, 'r') as source, open (file_path_out_overview, 'a') as sink:

        # header only for a new (still empty) overview file
        if is_file_empty_3(file_path_out_overview):
            sink.write('\t'.join(('replicate', 'median int 2 ch 2 corrected')) + '\n')

        # skip header of input file
        source.readline()

        for row in source:
            fields = row.strip().split('\t')
            replicate, median_2 = int(fields[0]), float(fields[7])
            sink.write(f"{replicate}\t{median_2}\n")

          

        
 

In [36]:
# For every csv file in "path_all": create the output file(s) with function "create_sum" (branch files)
# or "create_detail" (track files). "create_detail" also adds every track to the overview file
# entry/no entry.
# NOTE: the entry/no entry file is opened in append mode by "create_detail"; running this cell a second
# time adds all tracks again.

# Import Module
import os

# Work inside the directory given in the first code cell with variable "path_all"; output files are
# named after the input file without directory and therefore end up in this directory
os.chdir(path_all)

# Number of the next csv file. It is still handed over to the functions but they do not use it anymore:
# the cell nr comes from the file name and function "is_file_empty_3" decides about writing the header
index = 1

# iterate through all files in path defined by variable "path_all"
for file in os.listdir():

    # everything that is not a csv file is ignored
    if not file.endswith(".csv"):
        continue

    series_tracker = ident_cell(file)
    replicate = ident_replicate(file)

    file_path = path_all + file

    # summary file (branch) or file with data of single points per track (track)?
    if 'branch' in file:
        create_sum(file_path, file, index, replicate, series_tracker)
    elif 'track' in file:
        create_detail(file_path, file, index, replicate, bleach_cor, series_tracker)

    index += 1
            
    
        
        

In [37]:
# Collect the tsv files created before into overview files: one for the averaged data (branch files)
# and, separately for entry and no entry, the bleach-corrected median 2 int values in row format
# (transposed files) and in column format.
# NOTE: all overview files are opened in append mode; running this cell a second time adds all data
# again.

# Import Module
import os

# Work inside the directory given in the first code cell with variable "path_all"
os.chdir(path_all)

# overview files are saved in the general directory, not in the subfolder "all"
file_branch_overview = directory + 'branching_overview_all.tsv'

file_track_trans_overview_entry = directory + 'tracks-trans_med2c_overview_entry.tsv'
file_track_trans_overview_no_entry = directory + 'tracks-trans_med2c_overview_no-entry.tsv'
file_track_trans_overview_entry_firstlast = directory + 'tracks-trans_med2c_overview_entry_firstlast.tsv'
file_track_trans_overview_no_entry_firstlast = directory + 'tracks-trans_med2c_overview_no-entry_firstlast.tsv'
file_track_overview_entry = directory + 'tracks_med2c_overview_entry.tsv'
file_track_overview_no_entry = directory + 'tracks_med2c_overview_no-entry.tsv'

for file in os.listdir():

    # everything that is not a tsv file is ignored
    if not file.endswith(".tsv"):
        continue

    series_tracker = ident_cell(file)
    file_path = path_all + file

    if 'branch' in file:
        write_overview(file_path, file_branch_overview)
        continue

    if 'track' not in file:
        continue

    # the type of track file is encoded in the file name
    is_transposed = 'median2' in file
    is_bleach_corrected = 'bleach' in file
    is_entry = '_entry_' in file
    is_no_entry = '_NOentry_' in file

    # the "firstlast" overview files are not written at the moment
    if is_transposed and is_entry:
        write_overview_tracks_trans(file_path, file_track_trans_overview_entry)
    elif is_transposed and is_no_entry:
        write_overview_tracks_trans(file_path, file_track_trans_overview_no_entry)
    elif is_bleach_corrected and is_entry:
        write_overview_tracks(file_path, file_track_overview_entry)
    elif is_bleach_corrected and is_no_entry:
        write_overview_tracks(file_path, file_track_overview_no_entry)
        

In [38]:
# create new overview files for the averaged data depending on whether a track was identified before
# to enter or not

# A track is identified by replicate, cell and track ID together. The three values are kept as separate
# parts of the identifier: joining them without separator would make, e.g., cell 1 / track 12 and
# cell 11 / track 2 look the same.

#file to decide if track entered or not
file_enter = path_all + 'entry_vs_no-entry_file.tsv'

#branching file that has to be split into entry vs no entry
file_branching = directory + 'branching_overview_all.tsv'

file_branch_enter = directory + 'branches_enter.tsv'
file_branch_no_enter = directory + 'branches_noenter.tsv'

with open (file_branching, 'r') as f_in_branch, \
     open (file_enter, 'r') as f_in_enter, \
     open (file_branch_enter, 'w') as f_out_enter, \
     open (file_branch_no_enter, 'w') as f_out_noenter:

    list_entry = []
    list_noentry = []

    #skip header of both input files
    f_in_enter.readline()
    f_in_branch.readline()

    print ('replicate', 'cell', 'ID', 'track duration (min)',
                   'track distance (micron)', 'track velocity (micron/min)',
                   sep='\t', file=f_out_enter)
    print ('replicate', 'cell', 'ID', 'track duration (min)',
                   'track distance (micron)', 'track velocity (micron/min)',
                   sep='\t', file= f_out_noenter)

    for line in f_in_enter:
        data = line.strip().split('\t')
        if len(data) < 4:
            #empty or incomplete line, no track to identify
            continue
        identifier = (data[0], data[1], data[2])
        if data[-1] == 'entry':
            list_entry.append(identifier)
        elif data[-1] == 'no entry':
            list_noentry.append(identifier)

    #sets for fast look-up of the identifiers
    entry_ids = set(list_entry)
    noentry_ids = set(list_noentry)

    for line in f_in_branch:
        data = line.strip().split('\t')
        if len(data) < 3:
            #empty or incomplete line, no track to identify
            continue
        identifier = (data[0], data[1], data[2])

        #every field is followed by a tab, also the last one (same format as before)
        row = ''.join(field + '\t' for field in data) + '\n'

        if identifier in entry_ids:
            f_out_enter.write(row)
        elif identifier in noentry_ids:
            f_out_noenter.write(row)
        else:
            #track from the branching file that is missing in the entry/no entry file;
            #replicate, cell and track ID are printed separately
            print('problem', *identifier)



problem 2022113021
problem 2022113023
problem 2022113024
problem 2022113025
