In [1]:
import os
import numpy as np
import warnings

In [2]:
_end_tags = dict(grid = ':HEADER_END:', scan = ':SCANIT_END:', spec = '[DATA]')

class NanonisFile(object):
    '''
    Base class for Nanonis data files (grid, scan & point spectroscopy).
    Handles methods and parsing tasks common to all Nanonis files.

    Parameters
    ----------
    fname : str
        Name of Nanonis file.

    Attributes
    ----------
    datadir : str
        Directory part of ``fname``.
    basename : str
        Filename without path.
    fname : str
        Path of the Nanonis file, exactly as passed in.
    filetype : str
        Filetype ('grid', 'scan' or 'spec') corresponding to the filename
        extension.
    byte_offset : int
        Size of header in bytes.
    header_raw : str
        Unprocessed header information.
    '''

    def __init__(self, fname):
        self.datadir, self.basename = os.path.split(fname)
        self.fname = fname
        self.filetype = self._determine_filetype()
        # The header size must be known before the raw header can be read.
        self.byte_offset = self.start_byte()
        self.header_raw = self.read_raw_header(self.byte_offset)

    def _determine_filetype(self):
        '''
        Map the filename extension to a filetype name.

        Returns
        -------
        str
            'grid' for '.3ds', 'scan' for '.sxm', 'spec' for '.dat'.

        Raises
        ------
        UnhandledFileError
            If the extension is not one of '.3ds', '.sxm' or '.dat'.
        '''
        _, fname_ext = os.path.splitext(self.fname)
        if fname_ext == '.3ds':
            return 'grid'
        elif fname_ext == '.sxm':
            return 'scan'
        elif fname_ext == '.dat':
            return 'spec'
        else:
            raise UnhandledFileError(
                '{} is not a supported filetype or does not exist.'
                .format(self.basename)
            )

    def read_raw_header(self, byte_offset):
        '''
        Return header as a raw string.

        Everything before the end tag is considered to be part of the
        header. The parsing will be done later by subclass methods.

        Parameters
        ----------
        byte_offset : int
            Size of header in bytes. Read up to this point in file.

        Returns
        -------
        str
            First ``byte_offset`` bytes of the file decoded as UTF-8;
            undecodable bytes are replaced rather than raising.
        '''
        with open(self.fname, 'rb') as f:
            return f.read(byte_offset).decode('utf-8', errors='replace')

    def start_byte(self):
        '''
        Find first byte after end tag signalling end of header info.

        Caveat: this is the first byte after the end of the *line* on which
        the end tag is found, not strictly the first byte directly after
        the tag itself. For example in Scan __init__, byte_offset is
        incremented by 4 to account for a 'start' byte that is not actual
        data.

        Returns
        -------
        int
            Size of header in bytes.

        Raises
        ------
        FileHeaderNotFoundError
            If the end tag for this filetype does not occur in the file.
        '''
        tag = _end_tags[self.filetype]

        with open(self.fname, 'rb') as f:
            for line in f:
                # Convert from bytes to str
                try:
                    entry = line.strip().decode()
                except UnicodeDecodeError:
                    warnings.warn(
                        '{} has non-utf-8 characters, replacing them.'
                        .format(f.name)
                    )
                    entry = line.strip().decode('utf-8', errors='replace')
                if tag in entry:
                    # The whole tag line has been consumed, so tell() is
                    # the offset of the first byte after that line.
                    return f.tell()

        # Reaching this point means the loop never saw the end tag.
        raise FileHeaderNotFoundError(
            'Could not find the {} end tag in {}'
            .format(tag, self.basename)
        )

# Custom exception classes
class UnhandledFileError(Exception):
    '''
    Raised when a file with an unrecognised extension is passed.
    '''
class FileHeaderNotFoundError(Exception):
    '''
    Raised when the end-of-header tag cannot be found in a file.
    '''

class Scan(NanonisFile):
    '''
    Nanonis scan file; currently an empty subclass of NanonisFile.

    NOTE(review): no scan-specific parsing is implemented here yet --
    presumably intended for '.sxm' files; confirm.
    '''

# .sxm header parsing
def _parse_sxm_header(header_raw):
    '''
    parse raw header string.

    Parameters
    ----------
    header_raw : str
        Raw header string from read_raw_header() method.

    Returns
    -------
    dict
        Channel name keyed dict of each channel array.
    '''
    header_entries = header_raw.split('\n')
    header_entries = header_entries[:-3]

    header_dict = dict()
    entries_to_be_split = ['SCAN_PIXELS',
                           'SCAN_TIME',
                           'SCAN_RANGE',
                           'SCAN_OFFSET']
    entries_to_be_floated = ['ACQ_TIME',
                             'SCAN_TIME',
                             'SCAN_RANGE',
                             'SCAN_OFFSET',
                             'BIAS']
    entries_to_be_inted = ['SCAN_PIXELS']

# table header
    for i, entry in enumerate(header_entries):
        if entry == ':DATA_INFO:' or entry == ':Z-CONTROLLER:':
            count = 1
            for j in range(i + 1, len(header_entries)):
                if header_entries[j].startwith(':'):
                    break
                if header_entries[j][0] == '\t':
                    count += 1
            header_dict[entry.strip(':').lower()] = _parse_scan_header_table(
                header_entries[i + 1: i + count])
            continue
        if entry.startwith(':'):
            header_dict[entry.strip(':').lower()] = (header_entries[i + 1]
                                                    .strip())
# header need to be splited
    for key in entries_to_be_split:
        header_dict[key] = header_dict[key].split()
        

In [3]:
# Scratch cell: locate the end of the header of one .sxm file by hand and
# print the byte offset of the first byte after the end-tag line.
fname = r'/Users/hunfen/OneDrive/General Files/STM1500_Nanonis_data/2020/2020-10-12/Topography010.sxm'
_, basename = os.path.split(fname)
with open(fname, 'rb') as f:
    tag = _end_tags['scan']
    # Sentinel: stays -1 if the end tag is never seen.
    byte_offset = -1
    for line in f:
        # Convert from bytes to str
        try:
            entry = line.strip().decode()
        except UnicodeDecodeError:
            warnings.warn(
                '{} has non-utf-8 characters, replacing them.'
                .format(f.name)
            )
            entry = line.strip().decode('utf-8', errors='replace')
        if tag in entry:
            # The tag line has been fully read, so tell() points just past it.
            byte_offset = f.tell()
            break
    if byte_offset == -1:
        raise FileHeaderNotFoundError(
            'Could not find the {} end tag in {}.'
            .format(tag, basename)
        )
print(byte_offset)

1050
