In [1]:
import pandas as pd
import numpy as np

### generate data

In [2]:
generate_data = False

dates = pd.date_range(start='2024-01-01', end='2024-12-31')
date_formats = {
    'yyyymmdd': '%Y/%m/%d',
    'yyyyddmm': '%Y/%d/%m',
    'mmddyyyy': '%m/%d/%Y',
    'ddmmyyyy': '%d/%m/%Y',
}

if generate_data:
    random_numbers = np.round(np.random.rand(len(dates)),5)

    for name, datetype in date_formats.items():
        df = pd.DataFrame({'dates': dates, 'otherstuff': random_numbers})
        df['dates'] = df['dates'].dt.strftime(datetype)
        df.to_csv(f'data/{name}.csv', index=False)

        with pd.ExcelWriter(f'data/{name}.xlsx', date_format=datetype, datetime_format=datetype) as writer:
            df.to_excel(writer, index=False)

### parse generated data

In [3]:
dfs_csv = dict()
for datetype in date_formats:
    dfs_csv[datetype] = pd.read_csv(f'data/{datetype}.csv')
    # display(dfs_csv[datetype].dtypes)
    # print(datetype)

### dateutil pre pandas date parse

In [4]:
import csv
import contextlib
import openpyxl
import os
import warnings
import xlrd
from dateutil import parser
from datetime import datetime
from openpyxl.utils.datetime import from_excel as datetime_from_excel


def how_many_csv_rows_to_skip(filepath, date_column, row_of_date_column=None):
    num_rows_before_header = 0

    with open(filepath, 'r') as file:
        for row in csv.reader(file):
            if row_of_date_column:
                if row[0] == date_column:
                    break
            else:
                if date_column in row:
                    break
            num_rows_before_header += 1

    return num_rows_before_header


def csv_to_pandas(filepath, date_column, row_of_date_column=None):
    num_rows_to_skip = how_many_csv_rows_to_skip(filepath, date_column, row_of_date_column)
    parsed_data = []

    with open(filepath, 'r') as file:
        reader = csv.reader(file)

        for _ in range(num_rows_to_skip):
            next(reader)
        
        headers = next(reader)
        date_column_index = headers.index(date_column)
        
        for row in reader:
            row[0] = parser.parse(row[date_column_index])
            parsed_data.append(row)

    return pd.DataFrame(parsed_data, columns=headers)


def how_many_xlsx_rows_to_skip(filepath, date_column):
    num_rows_before_header = 0
    wb = openpyxl.load_workbook(filepath, read_only=True)

    sheet = None
    for name in wb.sheetnames:
        if 'voc' in name.lower():
            sheet = wb[name]
            break
        if 'data' in name.lower() and 'metadata' not in name.lower():
            sheet = wb[name]
            break

    for row in sheet.iter_rows(values_only=True):
        if date_column in row:
            break
        num_rows_before_header += 1

    wb.close()

    return num_rows_before_header


def xlsx_to_pandas(filepath, date_column):
    num_rows_to_skip = how_many_xlsx_rows_to_skip(filepath, date_column)
    wb = openpyxl.load_workbook(filepath)

    sheet = None
    for name in wb.sheetnames:
        if 'voc' in name.lower():
            sheet = wb[name]
            break
        if 'data' in name.lower() and 'metadata' not in name.lower():
            sheet = wb[name]
            break

    for _ in range(num_rows_to_skip):
        next(sheet.iter_rows())

    header_row = next(sheet.iter_rows(min_row=sheet.min_row + num_rows_to_skip, max_row=sheet.min_row + num_rows_to_skip, values_only=True))
    date_column_index = header_row.index(date_column) if date_column in header_row else None

    parsed_data = []
    for row in sheet.iter_rows(min_row=sheet.min_row + num_rows_to_skip + 1, values_only=True):
        row = list(row)  # Convert the tuple to a list for modification

        date_cell = row[date_column_index]
        if isinstance(date_cell, datetime):
            pass
        elif isinstance(date_cell, float):
            row[date_column_index] = datetime_from_excel(date_cell)
        elif date_cell:
            row[date_column_index] = parser.parse(date_cell)
            
        parsed_data.append(row)

    return pd.DataFrame(parsed_data, columns=header_row)


def how_many_xls_rows_to_skip(filepath, date_column):
    num_rows_before_header = 0

    wb = xlrd.open_workbook(filepath)
    sheet = None
    for name in wb.sheet_names():
        if 'voc' in name.lower():
            sheet = wb[name]
            break
        if 'data' in name.lower():
            sheet = wb[name]
            break

    for row_idx in range(sheet.nrows):
        row = sheet.row_values(row_idx)
        if date_column in row:
            break
        num_rows_before_header += 1

    return num_rows_before_header


def xls_to_pandas(filepath, date_column):
    num_rows_to_skip = how_many_xls_rows_to_skip(filepath, date_column)

    with open(os.devnull, 'w') as fnull:
        wb = xlrd.open_workbook(filepath)
        wb.logfile = fnull  # Redirect log messages to os.devnull

    sheet = None
    for name in wb.sheet_names():
        if 'voc' in name.lower():
            sheet = wb[name]
            break
        if 'data' in name.lower():
            sheet = wb[name]
            break

    for _ in range(num_rows_to_skip):
        next(sheet.iter_rows())

    header_row = None
    for row_idx in range(num_rows_to_skip, sheet.nrows):
        row = sheet.row_values(row_idx)
        if date_column in row:
            header_row = row
            num_rows_to_skip = row_idx
            break

    date_column_index = header_row.index(date_column)

    parsed_data = []
    for row_idx in range(num_rows_to_skip + 1, sheet.nrows):
        row = sheet.row_values(row_idx)
        row[date_column_index] = xlrd.xldate_as_datetime(row[date_column_index], wb.datemode)
        parsed_data.append(row)

    return pd.DataFrame(parsed_data, columns=header_row)


In [5]:
def check_all_imports(skip_xls=False, skip_xlsx=False, skip_csv=False, ignore_list=None):
    n_errors, n_files, n_xls, n_xlsx, n_csv, n_xls_complete, n_xlsx_complete, n_csv_complete  = 0, 0, 0, 0, 0, 0, 0, 0
    error_list = list()

    for root, dirs, files in os.walk('data'):
        for filename in files:
            filepath = os.path.join(root, filename)
            if filepath in ignore_list:
                continue

            extension = filepath.split('.')[-1].lower()
            if skip_xls and extension == 'xls':
                continue
            elif skip_xlsx and extension == 'xlsx':
                continue
            elif skip_csv and extension == 'csv':
                continue
            elif extension not in ['xls', 'xlsx', 'csv']:
                continue

            n_files += 1
            name = filepath.split('.')[0].lower()
            if name[-2:].lower() == 'fr':
                continue
            else:
                try:
                    if extension == 'xls':
                        n_xls += 1
                        xls_to_pandas(filepath, 'Sampling Date')
                        n_xls_complete += 1
                    elif extension == 'xlsx':
                        n_xlsx += 1
                        xlsx_to_pandas(filepath, 'Sampling Date')
                        n_xlsx_complete += 1
                    elif extension == 'csv':
                        n_csv += 1
                        csv_to_pandas(filepath, 'Compounds', 0)
                        n_csv_complete += 1
                except:
                    n_errors += 1
                    error_list.append(filepath)

    print(f'{n_errors} errors / {n_files} total files')
    print(f'xls: {n_xls_complete} out of {n_xls}')
    print(f'csv: {n_csv_complete} out of {n_csv}')
    print(f'xlsx: {n_xlsx_complete} out of {n_xlsx}')

    return error_list

In [6]:
ignore_list = [
    'data\\ddmmyyyy.xlsx', 'data\\mmddyyyy.xlsx', 'data\\yyyyddmm.xlsx', 'data\\yyyymmdd.xlsx',
    'data\\ddmmyyyy.csv', 'data\\mmddyyyy.csv', 'data\\yyyyddmm.csv', 'data\\yyyymmdd.csv',
    'data\\2006\\S62601_VOCS.csv', 'data\\2007\\S62601_VOCS.csv',    # sideways csv for some reason
    'data\\2008\\S90227_VOC.csv', 'data\\2009\\S90227_VOC.csv', 'data\\2010\\S90227_VOC.csv',  # sampling data relocated
]

##### XLSX works (or seems to)

In [7]:
check_all_imports(skip_xls=True, skip_xlsx=False, skip_csv=True, ignore_list=ignore_list)

0 errors / 236 total files
xls: 0 out of 0
csv: 0 out of 0
xlsx: 118 out of 118


[]

##### CSV

In [8]:
csv_errors = check_all_imports(skip_xls=True, skip_xlsx=True, skip_csv=False, ignore_list=ignore_list)

176 errors / 786 total files
xls: 0 out of 0
csv: 452 out of 628
xlsx: 0 out of 0


In [9]:
# check errors to see what's up
first_n = 20
csv_errors[:first_n]

['data\\2014\\S060512_VOC_2014.csv',
 'data\\2014\\S061004_VOC_2014.csv',
 'data\\2014\\S061502_VOC_2014.csv',
 'data\\2014\\S062601_VOC_2014.csv',
 'data\\2014\\S063601_VOC_2014.csv',
 'data\\2014\\S60104_VOC.csv',
 'data\\2014\\S60211_VOC.csv',
 'data\\2014\\S60413_VOC.csv',
 'data\\2014\\S60427_VOC.csv',
 'data\\2014\\S60428_VOC.csv',
 'data\\2014\\S60435_VOC.csv',
 'data\\2014\\S65101_VOC.csv',
 'data\\2015\\S60413_VOC.csv',
 'data\\2015\\S60428_VOC.csv',
 'data\\2015\\S60435_VOC.csv',
 'data\\2015\\S65101_VOC.csv',
 'data\\2016\\S60435_VOC.csv',
 'data\\2016\\S65101_VOC.csv',
 'data\\2017\\S010102_VOC_2017_EN.csv',
 'data\\2017\\S030118_VOC_2017_EN.csv']

In [10]:
pd.read_csv(csv_errors[0])

Unnamed: 0,Sampling Date,NAPS ID,Ethane (ug/m3),Ethylene (ug/m3),Acetylene (ug/m3),Propylene (ug/m3),Propane (ug/m3),1-Propyne (ug/m3),Isobutane (ug/m3),1-Butene/Isobutene (ug/m3),...,Unnamed: 175,Unnamed: 176,Unnamed: 177,Unnamed: 178,Unnamed: 179,Unnamed: 180,Unnamed: 181,Unnamed: 182,Unnamed: 183,Unnamed: 184
0,2014/01/05,60512,7.215,2.093,1.056,0.320,3.645,,1.679,0.259,...,,,,,,,,,,
1,2014/01/11,60512,11.176,2.463,1.313,0.574,6.727,,3.488,0.522,...,,,,,,,,,,
2,2014/01/17,60512,6.651,1.295,0.813,0.309,4.343,,1.190,0.214,...,,,,,,,,,,
3,2014/01/23,60512,4.172,1.444,0.794,0.347,2.496,,1.268,0.224,...,,,,,,,,,,
4,2014/01/29,60512,4.822,1.155,0.749,0.273,3.750,,0.840,0.167,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,2014/12/07,60512,4.265,0.786,0.464,0.134,2.530,,0.862,0.129,...,,,,,,,,,,
57,2014/12/13,60512,10.427,1.451,0.640,0.317,8.170,,1.738,0.164,...,,,,,,,,,,
58,2014/12/19,60512,4.341,0.853,0.466,0.207,2.488,,0.720,0.183,...,,,,,,,,,,
59,2014/12/25,60512,5.959,0.666,0.473,0.161,4.690,,1.037,0.104,...,,,,,,,,,,


#### EDGE CASE: some csv have Sampling Date and/or need cropping

In [11]:
# this file needs to use 'Sampling Date' and to crop useless columns
print(csv_errors[0])
print()
csv_to_pandas(csv_errors[0], 'Sampling Date').replace('', np.nan).dropna(axis=1)

data\2014\S060512_VOC_2014.csv



Unnamed: 0,Sampling Date,NAPS ID,Ethane (ug/m3),Ethylene (ug/m3),Acetylene (ug/m3),Propylene (ug/m3),Propane (ug/m3),1-Propyne (ug/m3),Isobutane (ug/m3),1-Butene/Isobutene (ug/m3),...,Chlorobenzene (ug/m3),Benzylchloride (ug/m3),Bromoform (ug/m3),"1,4-Dichlorobutane (ug/m3)","1,1,2,2-Tetrachloroethane (ug/m3)","1,3-Dichlorobenzene (ug/m3)","1,4-Dichlorobenzene (ug/m3)","1,2-Dichlorobenzene (ug/m3)","1,2,4-Trichlorobenzene (ug/m3)",Hexachlorobutadiene (ug/m3)
0,2014-01-05,060512,7.215,2.093,1.056,0.320,3.645,,1.679,0.259,...,0.000,0.000,0.017,,0.000,0.001,0.037,0.002,0.003,0.002
1,2014-01-11,060512,11.176,2.463,1.313,0.574,6.727,,3.488,0.522,...,0.014,0.000,0.019,,0.000,0.001,0.162,0.002,0.004,0.002
2,2014-01-17,060512,6.651,1.295,0.813,0.309,4.343,,1.190,0.214,...,0.009,0.000,0.017,,0.000,0.001,0.041,0.002,0.003,0.002
3,2014-01-23,060512,4.172,1.444,0.794,0.347,2.496,,1.268,0.224,...,0.008,0.000,0.024,,0.000,0.002,0.024,0.003,0.006,0.000
4,2014-01-29,060512,4.822,1.155,0.749,0.273,3.750,,0.840,0.167,...,0.007,0.000,0.020,,0.000,0.001,0.024,0.002,0.004,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,2014-12-07,060512,4.265,0.786,0.464,0.134,2.530,,0.862,0.129,...,0.008,0.000,0.027,,-999.000,0.002,0.050,0.002,0.004,0.004
57,2014-12-13,060512,10.427,1.451,0.640,0.317,8.170,,1.738,0.164,...,0.008,0.000,0.016,,0.000,0.003,0.045,0.005,0.009,0.006
58,2014-12-19,060512,4.341,0.853,0.466,0.207,2.488,,0.720,0.183,...,0.008,0.000,0.018,,0.000,0.002,0.034,0.004,0.006,0.006
59,2014-12-25,060512,5.959,0.666,0.473,0.161,4.690,,1.037,0.104,...,0.011,0.000,0.018,,0.000,0.002,0.041,0.003,0.005,0.006


In [12]:
# how many errors are this exact bullcrap?

csv_samp_crop, nope = 0, 0
list_of_csv_samp_crop = list()
for file in csv_errors:
    try:
        csv_to_pandas(file, 'Sampling Date').replace('', np.nan).dropna(axis=1)
        csv_samp_crop += 1
        list_of_csv_samp_crop.append(file)
    except:
        nope += 1

csv_samp_crop, nope, len(list_of_csv_samp_crop)

(57, 119, 57)

In [13]:
# TODO: 
#  write edge case
#  fix dis
#  more edge cases

### ugly solution

In [12]:
# relevant imports
import warnings
warnings.filterwarnings("error")

def parse_datetime_format(csv_path, date_column):
    dataframe = pd.read_csv(csv_path)
    dayfirst, yearfirst = False, False
    try:
        pd.to_datetime(dataframe[date_column])
        dayfirst, yearfirst = False, False
    except:
        try:
            pd.to_datetime(dataframe[date_column], dayfirst=True)
            dayfirst, yearfirst = True, False
        except:
            try:
                pd.to_datetime(dataframe[date_column], yearfirst=True)
                dayfirst, yearfirst = False, True
            except:
                pd.to_datetime(dataframe[date_column], dayfirst=True, yearfirst=True)
                dayfirst, yearfirst = True, True
                
    return dayfirst, yearfirst

In [13]:
for csv in ["data/yyyymmdd.csv", "data/yyyyddmm.csv", "data/mmddyyyy.csv", "data/ddmmyyyy.csv"]:
    print(parse_datetime_format(csv, 'dates'))

(False, False)
(True, False)
(False, False)
(True, False)


### dateutil helper function
- doesn't work, but probably could after reading the documentation and a bunch of bullcrap

In [14]:
# relevant imports
from dateutil.parser import parse

def determine_date_format(date_str):
    tokens = date_str.split()
    date_parts = tokens[0].split('-')
    format_str = []

    for part in date_parts:
        if len(part) == 4:
            format_str.append('%Y')
        elif len(part) == 2:
            format_str.append('%m' if int(part) <= 12 else '%d')

    if len(tokens) > 1:
        format_str.append('%H:%M:%S')

    return '-'.join(format_str)

def find_date_format_pattern(file_path):
    df = pd.read_csv(file_path)

    for col in df.columns:
        try:
            # Try to parse the first date in the column
            sample_date = parse(str(df[col].dropna().iloc[0]), fuzzy=True)
            # If successful, assume this is the date column and proceed
            break
        except (ValueError, IndexError):
            continue
    else:
        return "No date column found"

    # Check if all dates in the column match the format
    for date in df[col].dropna():
        try:
            parse(str(date), fuzzy=True)
        except ValueError:
            return f"Dates in column '{col}' do not follow a consistent pattern"

    return determine_date_format(str(sample_date))


In [15]:
file_path = "data/yyyymmdd.csv"
print(find_date_format_pattern(file_path))

%Y-%m-%m-%H:%M:%S


In [16]:
# relevant imports
from dateutil.parser import parse

file_path = "data/yyyymmdd.csv"
df = pd.read_csv(file_path)
for date in df['dates']:
    # find the parse dates format that works for all dates in the entire column
    parse(str(date), fuzzy=True)

In [17]:
def get_date_format(file_path, column_name):
    df = pd.read_csv(file_path)

    def infer_format(date_obj):
        format_parts = []
        if date_obj.year:
            format_parts.append('%Y')
        if date_obj.month:
            format_parts.append('%m')
        if date_obj.day:
            format_parts.append('%d')
        if any([date_obj.hour, date_obj.minute, date_obj.second]):
            format_parts.extend(['%H', '%M', '%S'])
        return '-'.join(format_parts[:3]) + ' ' + ':'.join(format_parts[3:]).strip()

    formats = set()
    for date_str in df[column_name]:
        try:
            date_obj = parse(str(date_str), fuzzy=True)
            format_str = infer_format(date_obj)
            formats.add(format_str)
            if len(formats) > 1:
                return "Inconsistent date formats"
        except ValueError:
            return "Invalid date found"

    return formats.pop() if formats else "No valid dates found"

In [18]:
for file in ["data/yyyymmdd.csv", "data/yyyyddmm.csv", "data/mmddyyyy.csv", "data/ddmmyyyy.csv"]:
    print(file, get_date_format(file, 'dates'))

# no

data/yyyymmdd.csv %Y-%m-%d 
data/yyyyddmm.csv Invalid date found
data/mmddyyyy.csv %Y-%m-%d 
data/ddmmyyyy.csv %Y-%m-%d 


### regex

In [19]:
import re

def date_to_regex(date):
    regex_pattern = date
    regex_pattern = re.sub(r'\b\d{4}\b', r'\\d{4}', regex_pattern)  # Year
    regex_pattern = re.sub(r'\b\d{1,2}\b', r'\\d{1,2}', regex_pattern)  # Month and Day
    return regex_pattern

def find_common_regex_pattern(file_path, column_name):
    df = pd.read_csv(file_path)
    patterns = set(df[column_name].apply(date_to_regex))

    # Simple approach: Find the longest common substring among all patterns
    # This part can be complex based on the diversity of date formats
    common_pattern = min(patterns, key=len)  # Starting with the shortest pattern
    for pattern in patterns:
        if common_pattern not in pattern:
            return "No common pattern found"
    
    return common_pattern


print(find_common_regex_pattern("data/ddmmyyyy.csv", 'dates'))
print(find_common_regex_pattern("data/mmddyyyy.csv", 'dates'))
print(find_common_regex_pattern("data/yyyyddmm.csv", 'dates'))
print(find_common_regex_pattern("data/yyyymmdd.csv", 'dates'))

\d{1,2}/\d{1,2}/\d{\d{1,2}}
\d{1,2}/\d{1,2}/\d{\d{1,2}}
\d{\d{1,2}}/\d{1,2}/\d{1,2}
\d{\d{1,2}}/\d{1,2}/\d{1,2}
