# QC Transfer Notebook

This notebook exemplifies the usage of AlphaPept in a continuous QC setting.

The script compares the processed files in `FINISHED_PATH` with the files in the maintenance folders. In this script, there is one folder for Bruker (`BRUKER_MAINTENANCE_FOLDER`) and one for Thermo (`THERMO_MAINTENANCE_FOLDER`). If a new file is found, it is copied locally, the `REFERENCE_SETTINGS` are adjusted, and the file is added to the processing queue. Once the file is finished, the next file is processed. Additionally, a Slack notification is sent after a file is processed.

In order for the script to work, the AlphaPept GUI needs to run in the background.

Deprecation Note: Waiting for file sizes to change has proven not to be ideal, especially for Bruker files. Here it makes more sense to check whether temporary files (`*.tdf-shm`, `*.tdf-wal`) are still present.

In [None]:
import os
from shutil import copyfile, rmtree 
import time
from distutils.dir_util import copy_tree
import sys
import shutil
from operator import attrgetter
from collections import namedtuple
import stat
from slack_sdk.webhook import WebhookClient
from datetime import datetime
from alphapept.paths import FINISHED_PATH, QUEUE_PATH
from alphapept.settings import load_settings_as_template, save_settings, load_settings


# Folder that is scanned for Thermo QC acquisitions (files ending in .raw).
THERMO_MAINTENANCE_FOLDER = 'Z:/maintenance/2019'
# Substring an entry name must contain to be picked up as a Bruker QC run.
tag = 'MA_HeLa'
# Root folder for Bruker data; the SUBFOLDERS listed below are searched inside it.
BRUKER_MAINTENANCE_FOLDER = 'Y:/'
# Local folder that new files are copied into before they are queued.
TARGET_FOLDER = 'D:/temp_folder'

# Settings file that is loaded as the template for every queued file.
REFERENCE_SETTINGS = 'D:/reference_qc_2/results.yaml'
PROCESS_BATCH = 2  # Processing rounds (one Thermo and one TIMS file each) before the folders are rescanned
UPDATE = 5  # Minutes to sleep between two scans
# Subfolders of BRUKER_MAINTENANCE_FOLDER that are searched for QC runs.
SUBFOLDERS = ['tims01',
 'tims02',
 'tims03',
 'tims04',
 'tims05']

# Slack webhook URL; must be filled in before the notebook is run.
webhook = ''

In [None]:
# Keep the raw URL, then rebind `webhook` to the client object used for sending messages.
url = webhook
webhook = WebhookClient(url)
# Record describing one QC file: full remote path, file name, and file name without extension.
qcfile = namedtuple('QcFile',('fullpath','filepath','filepath_no_end'))
# Reference settings, loaded once and reused as the template for every queued file.
settings_ = load_settings_as_template(REFERENCE_SETTINGS)

In [None]:
def delete_folder(base_path):
    """Recursively empty a folder without removing the folder itself.

    Args:
        base_path (str): Folder whose files and subfolders are deleted.

    Raises:
        PermissionError: If an entry still cannot be removed after its write
            permission has been set.
    """
    for entry in os.listdir(base_path):
        full_path = os.path.join(base_path, entry)
        if os.path.isdir(full_path):
            delete_folder(full_path)
            # Make the folder writable first so rmdir does not fail on read-only folders.
            os.chmod(full_path, stat.S_IWUSR)
            os.rmdir(full_path)
        else:
            try:
                os.remove(full_path)
            except PermissionError:
                # A read-only file cannot be removed on Windows. Set the write
                # permission and retry once; a second failure propagates to the caller.
                os.chmod(full_path, stat.S_IWUSR)
                os.remove(full_path)
            
def get_folder_size(start_path: str = ".") -> float:
    """Add up the sizes of all files below a folder.

    The folder is walked recursively; symbolic links are not counted.

    Args:
        start_path (str): Folder to measure. Defaults to '.'.

    Returns:
        float: Combined file size in Mb.
    """
    bytes_per_mb = 1024 ** 2
    file_paths = (
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(start_path)
        for name in names
    )
    size_in_bytes = sum(
        os.path.getsize(path) for path in file_paths if not os.path.islink(path)
    )
    return size_in_bytes / bytes_per_mb


def copytree(src, dst, symlinks=False, ignore=None):
    """Copy the contents of `src` into the already existing folder `dst`.

    Args:
        src (str): Folder whose entries are copied.
        dst (str): Existing folder that receives the copies.
        symlinks (bool): Forwarded to shutil.copytree for subfolders.
        ignore (callable): Forwarded to shutil.copytree for subfolders.
    """
    for name in os.listdir(src):
        source_item = os.path.join(src, name)
        target_item = os.path.join(dst, name)
        if not os.path.isdir(source_item):
            # Plain file: copy content together with its metadata.
            shutil.copy2(source_item, target_item)
            continue
        shutil.copytree(source_item, target_item, symlinks=symlinks, ignore=ignore)

def get_tims_qc(SUBFOLDERS, tag):
    """Collect the tagged entries two levels below the Bruker maintenance folder.

    Each name in `SUBFOLDERS` is looked up inside BRUKER_MAINTENANCE_FOLDER.
    Every directory found there is listed, and entries whose name contains
    `tag` are collected.

    Args:
        SUBFOLDERS (list): Folder names inside BRUKER_MAINTENANCE_FOLDER.
        tag (str): Substring an entry name must contain to be collected.

    Returns:
        list: Full paths of all matching entries.
    """
    matches = []
    for subfolder in SUBFOLDERS:
        subfolder_path = os.path.join(BRUKER_MAINTENANCE_FOLDER, subfolder)
        for name in os.listdir(subfolder_path):
            candidate = os.path.join(subfolder_path, name)
            if not os.path.isdir(candidate):
                continue
            matches.extend(
                os.path.join(candidate, entry)
                for entry in os.listdir(candidate)
                if tag in entry
            )
    return matches

def get_thermo_qc():
    """List the raw files in the Thermo maintenance folder.

    Returns:
        list: Full paths of all entries in THERMO_MAINTENANCE_FOLDER whose
            name ends with '.raw' (checked case-insensitively).
    """
    raw_paths = []
    for name in os.listdir(THERMO_MAINTENANCE_FOLDER):
        if name.lower().endswith('.raw'):
            raw_paths.append(os.path.join(THERMO_MAINTENANCE_FOLDER, name))
    return raw_paths

def get_known_files(finished_path=None, failed_path=None):
    """Return the names of files that do not need to be processed again.

    A file counts as known when a '<name>.yaml' file exists in the finished
    folder or, if one is given, in the failed folder.

    The previous implementation listed the finished folder a second time and
    filtered the already stripped names, so its "failed files" list was always
    empty. That dead scan is replaced by the optional `failed_path`.

    Args:
        finished_path (str): Folder with the results of finished runs.
            Defaults to FINISHED_PATH.
        failed_path (str): Optional folder with the settings of failed runs.
            Ignored when None or when the folder does not exist.

    Returns:
        list: File names without the '.yaml' ending.
    """
    if finished_path is None:
        finished_path = FINISHED_PATH

    def _yaml_names(folder):
        # Strip the 5-character '.yaml' ending from every settings file.
        return [name[:-5] for name in os.listdir(folder) if name.endswith('.yaml')]

    known_files = _yaml_names(finished_path)

    if failed_path is not None and os.path.isdir(failed_path):
        known_files += _yaml_names(failed_path)

    return known_files

def compare_thermo(thermo_qc, known_files):
    """Return the Thermo entries that are not known yet.

    Args:
        thermo_qc (list): File names; the last four characters (the '.raw'
            ending) are cut off before the comparison.
        known_files (list): Names that are already known.

    Returns:
        list: Unknown names without ending, sorted in descending order.
    """
    stems = {name[:-4] for name in thermo_qc}
    return sorted(stems.difference(known_files), reverse=True)

def compare_bruker(bruker_qc, known_files):
    """Return the Bruker entries that are not known yet.

    Args:
        bruker_qc (list): Paths; the last two characters (the '.d' ending)
            and any leading folders are removed before the comparison.
        known_files (list): Names that are already known.

    Returns:
        list: Unknown names without ending, sorted in descending order.
    """
    names = {os.path.basename(path[:-2]) for path in bruker_qc}
    return sorted(names - set(known_files), reverse=True)

def compare(qc_files, known_files):
    """Select the QC files whose name is not among the known files.

    Args:
        qc_files (list): Records with a `filepath_no_end` attribute.
        known_files (list): Names that are already known.

    Returns:
        list: Unknown records, ordered by `filepath_no_end` descending.
    """
    known = set(known_files)
    pending = [entry for entry in qc_files if entry.filepath_no_end not in known]
    # Sort ascending and then reverse (rather than reverse=True) so that
    # records with the same name end up in reversed input order.
    pending.sort(key=attrgetter('filepath_no_end'))
    pending.reverse()
    return pending
    
def copy_thermo(file):
    """Copy a Thermo raw file into TARGET_FOLDER.

    Args:
        file: Record with the attributes `fullpath` (source file) and
            `filepath` (file name used inside TARGET_FOLDER).

    Raises:
        PermissionError: If the source cannot be read or the target cannot
            be written.
    """
    # TARGET_FOLDER is otherwise only created by reset_temp_folder after a
    # run, so the very first copy would fail with FileNotFoundError.
    os.makedirs(TARGET_FOLDER, exist_ok=True)

    src = file.fullpath
    dst = os.path.join(TARGET_FOLDER, file.filepath)
    copyfile(src, dst)

def copy_bruker(file):
    """Copy a Bruker acquisition folder into TARGET_FOLDER.

    Args:
        file: Record with the attributes `fullpath` (source folder) and
            `filepath` (folder name used inside TARGET_FOLDER).
    """
    src = file.fullpath
    dst = os.path.join(TARGET_FOLDER, file.filepath)

    # Create exactly the folder that is copied into. The previous code created
    # a folder named after `fullpath` while copying into one named after
    # `filepath`, and failed when TARGET_FOLDER itself did not exist yet.
    os.makedirs(dst, exist_ok=True)

    copytree(src, dst)

    # NOTE(review): fixed pause kept from the original; presumably it gives the
    # file system time to settle after the copy -- confirm.
    time.sleep(10)
    
def prepare_settings(settings_, file_):
    """Create the settings file for one copied QC file.

    If TARGET_FOLDER holds more than 100 Mb, the settings are written to
    QUEUE_PATH. Otherwise the settings are written to FINISHED_PATH, so the
    file is not picked up again, and a notification is sent through the
    webhook.

    Args:
        settings_ (dict): Template settings. They are not modified.
        file_ (str): Path of the copied file; joined onto TARGET_FOLDER.

    Returns:
        tuple: (exp_file, skip), where exp_file is the settings path inside
            QUEUE_PATH and skip is True when the file was too small.
    """
    import copy  # local import; only this function needs it

    file_ = os.path.join(TARGET_FOLDER, file_)

    # Deep copy: a shallow copy would share the nested "experiment" dict with
    # the template, so the assignments below would leak into `settings_`.
    settings = copy.deepcopy(settings_)
    settings["experiment"]["file_paths"] = [file_]
    new_file = os.path.splitext(os.path.split(file_)[1])[0] + ".yaml"
    settings["experiment"]["results_path"] = os.path.splitext(file_)[0] + ".yaml"

    exp_file = os.path.join(QUEUE_PATH, new_file)

    # The size of the whole TARGET_FOLDER is measured, not just this file.
    size_ = get_folder_size(TARGET_FOLDER)

    if size_ > 100:
        save_settings(settings, exp_file)
        return exp_file, False

    print('File is too small. Skipping file.')
    exp_file_ = os.path.join(FINISHED_PATH, new_file)
    save_settings(settings, exp_file_)

    webhook.send(text=f"File {os.path.split(exp_file)[1][:-5]} with size {size_:.2f} Mb skipped.")
    return exp_file, True

def send_file_summary(settings):
    """Send a short summary of a processed file through the webhook.

    The message holds the processing time, the acquisition date and a fixed
    set of summary fields. Missing fields are reported as 'NaN'.

    Args:
        settings (dict): Loaded results settings containing a 'summary' entry.
    """
    summary = settings['summary'].copy()
    name = os.path.splitext(summary['processed_files'][0])[0]
    stats = summary[name]

    lines = [f"*{name}* in {summary['timing']['total (min)']:.2f} minutes \n"]

    raw_timestamp = stats['acquisition_date_time']
    if raw_timestamp:
        # Drop fractional seconds before parsing the timestamp.
        parsed = datetime.strptime(raw_timestamp.split('.')[0], "%Y-%m-%dT%H:%M:%S")
        acquired = parsed.strftime("%Y-%m-%d %H:%M:%S")
    else:
        acquired = 'NaN'
    lines.append(f"{'acquisition_date_time':<38} {acquired}\n")

    reported_fields = (
        'features (n in table)',
        'rt_tail (feature_table, median)',
        'id_rate (peptide_fdr)',
        'protein_group (protein_fdr, n unique)',
    )
    for field in reported_fields:
        if field not in stats:
            lines.append(f"{field:<38} NaN\n")
            continue

        value = stats[field]
        # Exact type checks on purpose: only plain int and float get number formatting.
        if type(value) is int:
            shown = f"{value:,}"
        elif type(value) is float:
            shown = f"{value:,.3f}"
        else:
            shown = f"{value}"
        lines.append(f"{field:<38} {shown}\n")

    webhook.send(text="".join(lines))

def wait_for_file(exp_file):
    """Block until `exp_file` no longer exists.

    The elapsed time is written to stdout once per second while waiting.

    Args:
        exp_file (str): Path of the file to wait for.
    """
    started = time.time()
    print(f"Processing {exp_file}")
    while True:
        if not os.path.isfile(exp_file):
            break
        minutes = (time.time() - started) / 60
        sys.stdout.write(f"\rTime elapsed {minutes:,.2f} minutes.")
        time.sleep(1)
    print('\nDone')
        
def reset_temp_folder():
    """Empty TARGET_FOLDER and make sure it exists afterwards.

    Deleting is retried every 60 seconds for as long as it raises a
    PermissionError.
    """
    print('Starting folder cleanup.')
    time.sleep(5)

    while os.path.isdir(TARGET_FOLDER):
        try:
            delete_folder(TARGET_FOLDER)
        except PermissionError:
            print('Deleting not successfull. Waiting for 60s and restarting.')
            time.sleep(60)
        else:
            break

    if not os.path.isdir(TARGET_FOLDER):
        os.mkdir(TARGET_FOLDER)

def process_new(files, settings_):
    """Copy, queue and report on the first file of `files`.

    Does nothing when `files` is empty. Otherwise the first file is copied to
    TARGET_FOLDER and its settings are prepared. Unless the file is skipped
    as too small, the function waits until the queued settings file
    disappears and then reports the outcome through the webhook. The temp
    folder is reset afterwards.

    Args:
        files (list): Records with `fullpath` and `filepath` attributes.
        settings_ (dict): Template settings passed on to prepare_settings.

    Raises:
        NotImplementedError: If the file is neither a '.raw' nor a '.d' entry.
    """
    if not files:
        return

    file = files[0]
    file_ = file.fullpath

    skip = False

    # Case-insensitive like get_thermo_qc, so a file named 'X.RAW' found there
    # does not end up in the NotImplementedError branch.
    if file_.lower().endswith('.raw'):
        try:
            copy_thermo(file)
        except PermissionError:
            # The copy was refused; report the file as not ready and move on.
            skip = True
    elif file_.endswith('.d'):
        copy_bruker(file)
    else:
        raise NotImplementedError(f"Unsupported file type: {file_}")

    if skip:
        webhook.send(text=f"Tried to process {file_} but seems not ready yet.")
        return

    exp_file, skip_ = prepare_settings(settings_, os.path.join(TARGET_FOLDER, file.filepath))
    if not skip_:
        # The queued settings file disappears once processing is over.
        wait_for_file(exp_file)

        new_file = os.path.splitext(os.path.split(file_)[1])[0] + ".yaml"
        processed_file = os.path.join(FINISHED_PATH, new_file)
        queued_name = os.path.split(exp_file)[1][:-5]

        if os.path.isfile(processed_file):
            webhook.send(text=f"File {queued_name} finished.")
            send_file_summary(load_settings(processed_file))
        else:
            webhook.send(text=f"File {queued_name} failed.")

    reset_temp_folder()
        
def wait_for_unprocessed():
    """Block until QUEUE_PATH contains no '.yaml' files anymore.

    The elapsed time is written to stdout once per second while waiting.
    """
    def queued_settings():
        # Settings files currently waiting in the queue folder.
        return [name for name in os.listdir(QUEUE_PATH) if name.endswith('.yaml')]

    pending = queued_settings()
    began = time.time()
    print(f"Waiting for {pending}")
    while pending:
        minutes = (time.time() - began) / 60
        sys.stdout.write(f"\rTime elapsed {minutes:,.2f} minutes.")
        time.sleep(1)
        pending = queued_settings()
    print('\nDone')
    

In [None]:
# Let everything that is already queued finish before looking for new files.
wait_for_unprocessed()

while True:

    tims_qc = get_tims_qc(SUBFOLDERS, tag)
    thermo_qc = get_thermo_qc()

    # Wrap every path in a qcfile record: full path, file name, file name without extension.
    tims_qc = [qcfile(fullpath=_, filepath = os.path.split(_)[1], filepath_no_end=os.path.splitext(os.path.split(_)[1])[0]) for _ in tims_qc]
    thermo_qc = [qcfile(fullpath=_, filepath = os.path.split(_)[1], filepath_no_end=os.path.splitext(os.path.split(_)[1])[0]) for _ in thermo_qc]

    print(f'Found {len(tims_qc):,} TIMS QC and {len(thermo_qc):,} Thermo QC on remote repository.')

    known_files = get_known_files()

    new_tims = compare(tims_qc, known_files)
    new_thermo = compare(thermo_qc, known_files)

    print(f'A total of {len(new_tims):,} TIMS QC and {len(new_thermo):,} Thermo QC are unprocessed.') 

    # Guard against an empty folder: dividing by zero would stop the whole loop.
    ratio_tims = (1-(len(new_tims) / len(tims_qc)))*100 if tims_qc else float('nan')
    ratio_thermo = (1-(len(new_thermo) / len(thermo_qc)))*100 if thermo_qc else float('nan')

    print(f'A total of {ratio_tims:.2f} % TIMS QC and {ratio_thermo:.2f} % Thermo QC are processed.') 

    # Each round handles the i-th unprocessed Thermo file and the i-th unprocessed TIMS file.
    for i in range(PROCESS_BATCH):
        process_new(new_thermo[i:], settings_)
        process_new(new_tims[i:], settings_)    

    print('Waiting for the next update.')
    time.sleep(UPDATE*60)