In [None]:
#!pip install swig
#!pip install Boost
#!pip install xylib-py
# !git clone https://github.com/wojdyr/xylib.git
# !pip install git+https://github.com/wojdyr/xylib.git
# !python setup.py install

In [None]:
import glob
import os
import warnings

#data management
import numpy as np
import pandas as pd

#plotting
import matplotlib.pyplot as plt

import xylib as xy

xylib is a library for reading files that contain x-y data from powder diffraction, spectroscopy or other experimental methods.
It is recommended to set LC_NUMERIC="C" (or another locale with the same numeric format) before reading files.  Usually, we first call load_file() to read a file from disk. It stores all data from the file in an instance of class DataSet.
  DataSet contains a list of Blocks, each Block contains a list of Columns,
  and each Column contains a list of values.
 
  It may sound complex, but IMO it can't be made simpler.
  It's analogous to a spreadsheet. One OOCalc or Excel file (which
  corresponds to xylib::DataSet) contains a number of sheets (Blocks),
  but usually only one is used. Each sheet can be viewed as a list of columns.
 
  In xylib all columns in one block must have equal length.
  Several filetypes always contain only one Block with two Columns.
  In this case we can take coordinates of the 15th point as:
     double x = get_block(0)->get_column(1)->get_value(14);
     double y = get_block(0)->get_column(2)->get_value(14);
  Note that blocks and points are numbered from 0, but columns are numbered
  from 1, because the column 0 returns index of point.
  All values are stored as floating-point numbers, even if they are integers
  in the file.
  DataSet and Block also contain MetaData, which is a string-to-string map.
 
  Note that C++ API uses std::string and exceptions, so it is recommended
  to compile the library and programs that use it with the same compiler.
 
  The C++ API is defined in the xylib namespace; the C API uses the prefix xylib.

In [None]:

def __searchFile(mainDir: str, subDirs: list = None, selected_file: list = None, extensions: list = None, recursive: bool = False, debug: bool = False):
    """
    Look inside the folder specified to generate the sorted list of file paths.

    Two modes are supported:
    - selected_file is empty: every entry of subDirs (relative to mainDir) is
      globbed for files ending with one of the given extensions. With
      recursive=True the nested folders are searched as well.
    - selected_file is not empty: each entry is resolved relative to mainDir.
      If that path exists it is used directly, otherwise it is tried again
      with each of the extensions appended.

    - Arguments:
    mainDir: str, folder the search starts from. A '/' is inserted between
        mainDir and the following path component when neither provides one.
    subDirs: list = None, subfolders of mainDir to inspect. None or an empty
        list means mainDir itself (['']).
    selected_file: list = None, specific files (relative to mainDir) to look
        for. None means no user selection ([]).
    extensions: list = None, file extensions to match. None means ['.raw'].
    recursive: bool = False, forwarded to glob.glob; also enables the '**/'
        pattern in the folder search.
    debug: bool = False, print progress information.

    Non-list values given for subDirs, selected_file or extensions are wrapped
    into a one-element list after a warning.

    - Return:
    selected: sorted list of found path strings
    """
    # None stands for the documented default, so no list object is shared between calls
    if subDirs is None:
        subDirs = ['']
    if selected_file is None:
        selected_file = []
    if extensions is None:
        extensions = ['.raw']

    if debug:
        print(f"""
        mainDir: {mainDir},
        subDirs: {subDirs},
        selected_file: {selected_file},
        extensions: {extensions},
        recursive: {recursive}, 
        debug: {debug}
        """)

    #troubleshoot arguments
    if not isinstance(subDirs, list):
        warnings.warn('subDirs should be a list. Now is converted to a list')
        subDirs = [subDirs]
    elif len(subDirs) == 0:
        warnings.warn('detected empy subDirs list, set it to empty string \'\'')
        subDirs = ['']
    if not isinstance(selected_file, list):
        warnings.warn('selected should be a list. Now is converted to a list')
        selected_file = [selected_file]
    if not isinstance(extensions, list):
        warnings.warn('ext should be a list. Now is converted to a list')
        extensions = [extensions]

    separators = ('/', os.sep)

    def _underMainDir(tail):
        # Plain concatenation as long as one side already carries a separator;
        # otherwise insert '/' so that 'data' + 'a.raw' does not become 'dataa.raw'.
        if mainDir and not mainDir.endswith(separators) and not tail.startswith(separators):
            return mainDir + '/' + tail
        return mainDir + tail

    selected = []  #container for loaded filenames

    if len(selected_file) == 0:
        #No user-defined request: scan the requested subfolders
        if debug:
            print(f" - __searchFile: looking inside specified subfolders of mainDir and their sobfolders")
        # '**/' matches zero or more nested folders when glob runs with recursive=True
        wildcard = '**/*' if recursive else '*'
        for subDir in subDirs:
            if subDir != '' and not subDir.endswith('/'):
                subDir = subDir + '/'
            if debug:
                print(f" - __searchFile: looking in {subDir} subfolder")
                if not recursive:
                    print(f" - __searchFile: looking only inside the spcified subfolders of mainDir.")
            root = _underMainDir(subDir)
            for extension in extensions:
                selected.extend(glob.glob(root + wildcard + extension, recursive=recursive))
    else:
        if debug:
            print(" - __searchFile: loading user selected files")
        for file in selected_file:
            file = _underMainDir(file)
            if debug:
                print(f" - __searchFile: looking for {file}")
            if os.path.exists(file):
                selected.extend(glob.glob(file, recursive=recursive))
            else:
                # the name was given without extension: collect the matches of EVERY extension
                for extension in extensions:
                    if debug:
                        print(f" - __searchFile: looking for {file+extension}")
                    # if file+extension doesn't exist, glob returns an empty list
                    selected.extend(glob.glob(file + extension, recursive=recursive))

    #sort selected files
    selected.sort()
    if debug:
        print(f" - __searchFile: found {len(selected)} files")
    return selected
        
def _writeRaw2xyNLog(logPath, convertedFilenames, formatErrorsFilename, discardedFilenames):
    """Write the conversion summary followed by the three filename lists to logPath."""
    sections = (
        (" converted files: \n", convertedFilenames),
        (" files in a non-supported format: \n", formatErrorsFilename),
        (" files discarded: \n", discardedFilenames),
    )
    # mode "w" creates the file when it is missing and overwrites it otherwise
    with open(logPath, "w") as f:
        f.write(str(len(convertedFilenames)) +" files succesfully converted \n "+ str(len(formatErrorsFilename)) +"  files are in a non-supported format \n" + str(len(discardedFilenames)) + " have been discarded")
        for title, names in sections:
            f.write("\n\n##########################################\n" + title)
            # write each item on a new line
            f.writelines("%s\n" % name for name in names)


def raw2xyN(mainDir: str, subDirs: list = None, selected_file: list = None, ext: str = '.raw', recursive: bool = False, optimize: bool = True, convertraw: bool = True, expInSubDir: bool = False, debug: bool = False):
    """
    Convert given .raw files into .xy and .xyn files.

    For every file found by __searchFile the y values of column 2 of the first
    block are read with xylib, and the x axis is rebuilt from the
    'START_2THETA' and 'STEP_SIZE' metadata entries. The .xy file holds the
    columns (2theta, Counts); the .xyn file additionally holds the counts
    divided by the 'TIME_PER_STEP' metadata value. Output files are written
    next to the input file, or into an 'export' subfolder of its folder.
    A summary is written to 'raw2xyN.log' in the current working directory
    and printed at the end.

    - Arguments
    mainDir: str, folder the search starts from.
    subDirs: list = None, subfolders of mainDir to inspect. None or an empty
        list means mainDir itself.
    selected_file: list = None, specific files to convert; None or empty means
        every file with the requested extension.
    ext: str = '.raw', extension of the input files (a list of extensions is
        accepted too).
    recursive: bool = False, also search nested folders.
    optimize: bool = True, skip (and report as discarded) every input whose
        .xy output file already exists.
    convertraw: bool = True, currently not used by the function.
    expInSubDir: bool = False, write the output into '<input folder>/export'.
    debug: bool = False, print progress information.

    - Return
    None
    """
    import xylib as xy

    convertedFilenames = []    # track filenames which have been converted
    formatErrorsFilename = []  # track filenames which are in an unsupported format
    discardedFilenames = []    # track filenames which were already converted

    # __searchFile expects lists; normalizing here avoids its conversion warnings
    searchDirs = subDirs if subDirs else ['']
    requested = selected_file if selected_file is not None else []
    extensions = ext if isinstance(ext, list) else [ext]

    selected = __searchFile(mainDir = mainDir, subDirs = searchDirs, selected_file = requested,
                            extensions = extensions, recursive = recursive, debug = debug)
    if debug:
        print(f'sorting the {len(selected)} selected files by the name:', selected)

    # pair every input file with its output paths, dropping already converted ones
    jobs = []
    for pathFile in selected:
        outDir = os.path.dirname(pathFile)
        if expInSubDir:
            outDir = os.path.join(outDir, 'export')
        stem = os.path.splitext(os.path.basename(pathFile))[0]
        fnameXY = os.path.join(outDir, stem + '.xy')
        fnameXYN = os.path.join(outDir, stem + '.xyn')
        if optimize and os.path.exists(fnameXY):
            discardedFilenames.append(os.path.basename(pathFile))
            continue
        jobs.append((pathFile, outDir, fnameXY, fnameXYN))
    if debug and discardedFilenames:
        print(f"found {len(discardedFilenames)} already converted files")
        print(f'sorting the {len(jobs)} updated selected files by the name:', [job[0] for job in jobs])

    #start the conversion
    for pathFile, outDir, fnameXY, fnameXYN in jobs:
        if debug:
            print(f'converting {os.path.basename(pathFile)}\n')
        try:
            file = xy.load_file(pathFile)
        except RuntimeError:
            warnings.warn('File conversion error.')
            print(f"!!! raw format not supported for {os.path.basename(pathFile)}")
            formatErrorsFilename.append(os.path.basename(pathFile))
            continue

        #usually raws are made of 1 block
        block = file.get_block(0)
        # column 2 holds the y values; the x axis is rebuilt from the metadata below
        col2 = block.get_column(2)
        if debug:
            print(f"file: {file}, n blocks: {file.get_block_count()}, n cols in block: {block.get_column_count()}")

        #extract meta information from which we can build-up 2theta axis.
        meta = block.meta
        if debug:
            # dump every metadata entry the block actually has
            for i in range(meta.size()):
                key = meta.get_key(i)
                print(key ,meta.get(key))
        startth = float(meta.get('START_2THETA'))
        stepsize = float(meta.get('STEP_SIZE'))
        tps = float(meta.get('TIME_PER_STEP'))
        lam = float(meta.get('USED_LAMBDA'))

        # extract y values
        npy = np.array([col2.get_value(index) for index in range(col2.get_point_count())])
        if debug:
            print(f"{np.shape(npy)} points on y axis" )

        # One x value per y point. Deriving the length from the data (instead of
        # a float-stepped np.arange up to an end value) guarantees matching shapes.
        npx = startth + stepsize*np.arange(npy.size)
        if debug:
            print(f"{np.shape(npx)} points on x axis" )
            if npx.size > 1:
                print(f"x values loaded from {npx[0]} to {npx[-1]} with stepsize {npx[1]-npx[0]}")

        npnorm = npy/tps   #normalize for time per step

        #columns: 2theta, counts, normalized counts
        npdataset = np.array([npx, npy, npnorm]).T

        #build the head of the final .xy files
        head = f"time per step: {tps}\nlambda: {lam}\n2theta Counts NormalizedCounts"

        if expInSubDir:
            os.makedirs(outDir, exist_ok=True)

        # save array to .xy (first two columns) and .xyn (all columns) files
        np.savetxt(fname=fnameXY, X=npdataset[:,:2], header=head)
        np.savetxt(fname=fnameXYN, X=npdataset, header=head)
        convertedFilenames.append(os.path.basename(pathFile))

    nConverted = len(convertedFilenames)
    formatErrors = len(formatErrorsFilename)
    _writeRaw2xyNLog("raw2xyN.log", convertedFilenames, formatErrorsFilename, discardedFilenames)

    print('###########################################################################\n')
    print(f'Procedure terminated with {formatErrors} errors due to non-supported files. \n{nConverted} files have been converted.\n{len(discardedFilenames)} have been discarded.\nFor additional details inspect the .log file')
         

In [None]:
# Example run: replace the mainDir placeholder with the folder holding the .raw files.
# convertraw is given as a literal because no variable named convertraw exists at this level.
raw2xyN(mainDir = 'path to main Dir', ext = '.raw', recursive = True, 
        optimize = True, convertraw = True, expInSubDir = False, debug = False)