In [None]:
import datetime
import os

import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib notebook

from util import DataFrameBase, xlim_expand
from tbtools.dev import IProgressBar, ProgressBar, subreload
from tbtools.filewrangling import find_beginning_of_end, header_matches
from tbtools.iter import impatient_search

In [None]:
# Re-bind IProgressBar to whatever subreload returns for it.
# NOTE(review): presumably this reloads the defining tbtools module so edits
# are picked up without restarting the kernel -- confirm in tbtools.dev.
IProgressBar = subreload(IProgressBar)

In [None]:
# Data directory below the user's home directory. The trailing slash is
# required: read_csv/read_excel build file paths with `path + name`.
path = os.environ['HOME'] + "/Speciale/data/NN/"

In [None]:
def time_to_timedelta_pass_nans(val):
    """Convert one cell value to a timedelta, letting float NaN pass through.

    Parameters
    ----------
    val : object
        The value to convert. A float NaN is returned as ``np.nan``.
        A ``datetime.time`` is rendered with ``isoformat()`` before being
        handed to ``pd.to_timedelta``. Anything else is passed to
        ``pd.to_timedelta`` unchanged.

    Returns
    -------
    float or the result of ``pd.to_timedelta``
        ``np.nan`` for a float NaN, otherwise whatever ``pd.to_timedelta``
        returns for the (possibly stringified) value.
    """
    # isinstance rather than an exact type check, so float subclasses such as
    # np.float64 are recognised as NaN too instead of falling through.
    if isinstance(val, float) and np.isnan(val):
        return np.nan
    if isinstance(val, datetime.time):
        val = val.isoformat()
    return pd.to_timedelta(val)

def is_datetime_col(x):
    """Return True when the column name ``x`` looks like a date/time column.

    The check is a case-insensitive substring test for 'time', 'dato'
    or 'date'.
    """
    lowered = x.lower()
    return any(marker in lowered for marker in ('time', 'dato', 'date'))
        
def convert_timedelta(df):
    """Convert every column whose name contains 'diff' to timedeltas.

    The match on the column name is case-insensitive. The conversion is
    applied cell by cell with ``time_to_timedelta_pass_nans`` and written
    back into ``df``, which is also returned.
    """
    diff_columns = []
    for column in df:
        if 'diff' in column.lower():
            diff_columns.append(column)
    converted = df[diff_columns].applymap(time_to_timedelta_pass_nans)
    df[diff_columns] = converted
    return df

def read_csv(name):
    """Read one semicolon-separated data file from ``path`` into a DataFrame.

    Parameters
    ----------
    name : str
        File name, appended to the module-level ``path``.

    Returns
    -------
    pandas.DataFrame
        The parsed file after ``convert_timedelta`` has been applied.
    """
    p = path + name
    # NOTE(review): header_matches presumably returns the header fields for
    # which is_datetime_col is true -- confirm in tbtools.filewrangling.
    parse_dates = header_matches(p, ';', is_datetime_col)
    # NOTE(review): find_beginning_of_end presumably returns how many trailing
    # lines, starting at the 'EOF' marker, have to be dropped -- confirm.
    skipfooter = find_beginning_of_end(p, lambda x: x.startswith('EOF'))
    # pandas spells this option `skipfooter` (the old `skip_footer` alias was
    # removed). The C parser does not support it, so request the python
    # engine explicitly instead of relying on a warning-and-fallback.
    df = pd.read_csv(p, sep=';', parse_dates=parse_dates,
                     skipfooter=skipfooter, engine='python')
    return convert_timedelta(df)

def read_excel(name):
    """Read one Excel data file from ``path`` into a DataFrame.

    The footer that starts at the 'EOF' marker in the first column is
    removed, the string 'nan' is replaced by a real NaN, the columns selected
    by ``is_datetime_col`` are converted with ``pd.to_datetime`` and the
    result is passed through ``convert_timedelta``.

    Parameters
    ----------
    name : str
        File name, appended to the module-level ``path``.

    Returns
    -------
    pandas.DataFrame
    """
    p = path + name
    df = pd.read_excel(p)
    # Remove extraneous footer: look for the 'EOF' marker in the first
    # column, searching from the bottom up.
    # NOTE(review): impatient_search presumably returns the 0-based position
    # of the first match in the reversed column, or None when nothing
    # matches -- confirm in tbtools.iter.
    rem = impatient_search(df.iloc[::-1, 0],
                           lambda x: isinstance(x, str) and x.startswith('EOF'))
    if rem is not None:
        # Drop the marker row itself plus everything below it.
        df = df.iloc[:-(rem + 1)]
    # Without a marker the frame is kept whole; slicing with `[:-0]` would
    # have returned an empty frame.
    # replace() returns a new frame, so the result has to be kept.
    df = df.replace('nan', np.nan)
    # Convert datecols
    parse_dates = list(filter(is_datetime_col, df.columns))
    df[parse_dates] = df[parse_dates].applymap(pd.to_datetime)
    df = convert_timedelta(df)
    return df

In [None]:
# Load one sample day from the semicolon-separated text export.
df = read_csv('NN.20. september.txt')

In [None]:
# Load the Excel file with the same base name for comparison.
df2 = read_excel('NN.20. september.xlsx')

In [None]:
# True only if every cell matches. NaN is filled with 0 first because
# NaN != NaN would otherwise make identical frames compare unequal.
(df.fillna(0) == df2.fillna(0)).all().all()

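## Are all the Excel files completely equivalent to the CSV files?

The comparison below loads every Excel file together with the `.txt` (CSV) file of the same name and checks that the two are cell-for-cell identical. It shows that they are, so we can ignore the Excel files: they are a subset of the data in the CSV files.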

In [None]:
# Pair every Excel file with the text export of the same base name.
ls = os.listdir(path)
exc = [f for f in ls if f.endswith('.xlsx')]
print('{} of {} files are excel format.'.format(len(exc), len(ls)))
# 'name.xlsx'[:-4] keeps the trailing dot, so appending 'txt' gives 'name.txt'.
csvpartners = [f[:-4]+'txt' for f in exc]
missing = [c for c in csvpartners if c not in ls]
assert not missing, 'No csv partner found for: {}'.format(missing)
pairs = list(zip(exc, csvpartners))

equi = 0
for x, c in IProgressBar(pairs):
    dfx = read_excel(x)
    dfc = read_csv(c)
    # Compare the pair that was just loaded -- not the sample frames
    # `df`/`df2` from the earlier cells, which would match on every iteration.
    try:
        same = (dfx.fillna(0) == dfc.fillna(0)).all().all()
    except ValueError:
        # pandas refuses to compare frames whose index/columns differ;
        # such a pair is by definition not equivalent.
        same = False
    if same:
        equi += 1
    else:
        print('!! {} and {} are NOT equivalent.'.format(x, c))

print('{} of {} possible pairs are equivalent.'.format(equi, len(pairs)))

Output:

    14 of 44 files are excel format.
     [                  0%                  ]  0 of 14 complete['Timestamp', 'OrganTimestamp']
     [###               7%                  ]  1 of 14 complete['Timestamp', 'OrganTimestamp']
     [#####            14%                  ]  2 of 14 complete['Timestamp', 'OrganTimestamp']
     [########         21%                  ]  3 of 14 complete['Timestamp', 'OrganTimestamp']
     [###########      29%                  ]  4 of 14 complete['Timestamp', 'OrganTimestamp']
     [##############   36%                  ]  5 of 14 complete['Timestamp', 'OrganTimestamp']
     [################ 43%                  ]  6 of 14 complete['Timestamp', 'OrganTimestamp']
     [#################50%                  ]  7 of 14 complete['Timestamp', 'OrganTimestamp']
     [#################57%##                ]  8 of 14 complete['Timestamp', 'OrganTimestamp']
     [#################64%####              ]  9 of 14 complete['Timestamp', 'OrganTimestamp']
     [#################71%#######           ]  10 of 14 complete['Timestamp', 'OrganTimestamp']
     [#################79%##########        ]  11 of 14 complete['Timestamp', 'OrganTimestamp']
     [#################86%#############     ]  12 of 14 complete['Timestamp', 'OrganTimestamp']
     [#################93%###############   ]  13 of 14 complete['Timestamp', 'OrganTimestamp']
     [################100%##################]  14 of 14 complete
    Elapsed time: 0:04:43.405428
    14 of 14 possible pairs are equivalent.

## Rename the files so that they sort correctly by date

In [None]:
import re
import shutil

    # Snapshot of the directory listing, taken before anything is created or
    # moved; the loops further down iterate over this snapshot.
    ls = os.listdir(path)
    # Only the .txt exports get a date-based copy.
    cs = [f for f in ls if f.endswith('.txt')]

    def extract_date(f, year=2012):
        """Derive the date encoded in an original file name.

        Three name styles are recognised, checked in this order:

        * names containing 'oktober'   -> month 10, day = first number in ``f``
        * names containing 'september' -> month 9,  day = first number in ``f``
        * names containing '.9-'       -> month 9,  day = the number that
          sits between two dashes (month-first notation)

        Parameters
        ----------
        f : str
            The file name.
        year : int, optional
            The file names carry no usable year, so it is supplied here.
            Defaults to 2012.

        Returns
        -------
        datetime.date

        Raises
        ------
        ValueError
            If no month marker or no day number can be found in ``f``, or if
            the resulting day/month combination is not a valid date.
        """
        if 'oktober' in f:
            month, month_first = 10, False
        elif 'september' in f:
            month, month_first = 9, False
        elif '.9-' in f:
            month, month_first = 9, True
        else:
            # Fail with a clear message rather than an unbound `month`.
            raise ValueError(
                'Cannot determine month from file name: {!r}'.format(f))

        # Raw strings keep the regex escapes from being read as (invalid)
        # string escapes.
        pattern = r'(?<=-)\d+(?=-)' if month_first else r'\d+'
        found = re.search(pattern, f)
        if found is None:
            raise ValueError(
                'Cannot determine day from file name: {!r}'.format(f))
        return datetime.date(year=int(year), month=month,
                             day=int(found.group()))

    # Directory that will hold the untouched originals. os.makedirs raises if
    # it already exists, which also stops these steps from running twice.
    originalsdir = os.path.join(path, 'original')
    os.makedirs(originalsdir)

    # Copy every .txt export to a name built from its date ('B' + ISO date),
    # so that the names sort chronologically.
    # NOTE(review): two source files that resolve to the same date would
    # silently overwrite each other here -- confirm this cannot happen.
    for orig in cs:
        n = extract_date(orig)
        newname = 'B{}'.format(n)
        shutil.copyfile(path+orig, path+newname)

    #     print('{}: {}'.format(f, extract_date(f)))

    # Move everything from the initial listing into originalsdir. The listing
    # was taken before 'original' and the 'B...' copies existed, so neither
    # is moved.
    for old in ls:
        shutil.move(path+old, os.path.join(originalsdir, old))

    # Append .csv to all files
    ls = os.listdir(path)
    for f in ls:
        if f.startswith('B') and not f.endswith('.csv'):
            os.rename(path+f, path+f+'.csv')


In [None]:
# Load one day through its date-based file name.
df = read_csv('B2012-09-20.csv')

In [None]:
# Show the column labels as a plain list.
print(list(df.columns))