In [1]:
import pandas as pd
import numpy as np

### generate data

In [2]:
dates = pd.date_range(start='2024-01-01', end='2024-12-31')
random_numbers = np.round(np.random.rand(len(dates)),5)
date_formats = {
    'yyyymmdd': '%Y/%m/%d',
    'yyyyddmm': '%Y/%d/%m',
    'mmddyyyy': '%m/%d/%Y',
    'ddmmyyyy': '%d/%m/%Y',
}

for name, datetype in date_formats.items():
    df = pd.DataFrame({'dates': dates, 'otherstuff': random_numbers})
    df['dates'] = df['dates'].dt.strftime(datetype)
    df.to_csv(f'data/{name}.csv', index=False)

    with pd.ExcelWriter(f'data/{name}.xlsx', date_format=datetype, datetime_format=datetype) as writer:
        df.to_excel(writer, index=False)

### parse that data

In [3]:
dfs_csv = dict()
for datetype in date_formats:
    dfs_csv[datetype] = pd.read_csv(f'data/{datetype}.csv')
    # display(dfs_csv[datetype].dtypes)
    # print(datetype)

### dateutil pre pandas date parse

In [47]:
import csv
import openpyxl
import warnings
import xlrd
from dateutil import parser
from datetime import datetime
from openpyxl.utils.datetime import from_excel as datetime_from_excel


def how_many_csv_rows_to_skip(filepath, date_column, row_of_date_column=None):
    num_rows_before_header = 0

    with open(filepath, 'r') as file:
        for row in csv.reader(file):
            if row_of_date_column:
                if row[0] == date_column:
                    break
            else:
                if date_column in row:
                    break
            num_rows_before_header += 1

    return num_rows_before_header


def csv_to_pandas(filepath, date_column, row_of_date_column=None):
    num_rows_to_skip = how_many_csv_rows_to_skip(filepath, date_column, row_of_date_column)
    parsed_data = []

    with open(filepath, 'r') as file:
        reader = csv.reader(file)

        for _ in range(num_rows_to_skip):
            next(reader)
        
        headers = next(reader)
        date_column_index = headers.index(date_column)
        
        for row in reader:
            row[0] = parser.parse(row[date_column_index])
            parsed_data.append(row)

    return pd.DataFrame(parsed_data, columns=headers)


def how_many_xlsx_rows_to_skip(filepath, date_column):
    num_rows_before_header = 0
    wb = openpyxl.load_workbook(filepath, read_only=True)

    sheet = None
    for name in wb.sheetnames:
        if 'voc' in name.lower():
            sheet = wb[name]
            break
        if 'data' in name.lower() and 'metadata' not in name.lower():
            sheet = wb[name]
            break

    for row in sheet.iter_rows(values_only=True):
        if date_column in row:
            break
        num_rows_before_header += 1

    wb.close()

    return num_rows_before_header


def xlsx_to_pandas(filepath, date_column):
    num_rows_to_skip = how_many_xlsx_rows_to_skip(filepath, date_column)
    wb = openpyxl.load_workbook(filepath)

    sheet = None
    for name in wb.sheetnames:
        if 'voc' in name.lower():
            sheet = wb[name]
            break
        if 'data' in name.lower() and 'metadata' not in name.lower():
            sheet = wb[name]
            break

    for _ in range(num_rows_to_skip):
        next(sheet.iter_rows())

    header_row = next(sheet.iter_rows(min_row=sheet.min_row + num_rows_to_skip, max_row=sheet.min_row + num_rows_to_skip, values_only=True))
    date_column_index = header_row.index(date_column) if date_column in header_row else None

    parsed_data = []
    for row in sheet.iter_rows(min_row=sheet.min_row + num_rows_to_skip + 1, values_only=True):
        row = list(row)  # Convert the tuple to a list for modification

        date_cell = row[date_column_index]
        if isinstance(date_cell, datetime):
            pass
        elif isinstance(date_cell, float):
            row[date_column_index] = datetime_from_excel(date_cell)
        elif date_cell:
            row[date_column_index] = parser.parse(date_cell)
            
        parsed_data.append(row)

    return pd.DataFrame(parsed_data, columns=header_row)


def how_many_xls_rows_to_skip(filepath, date_column):
    num_rows_before_header = 0

    wb = xlrd.open_workbook(filepath)
    sheet = None
    for name in wb.sheet_names():
        if 'voc' in name.lower():
            sheet = wb[name]
            break
        if 'data' in name.lower():
            sheet = wb[name]
            break

    for row_idx in range(sheet.nrows):
        row = sheet.row_values(row_idx)
        if date_column in row:
            break
        num_rows_before_header += 1

    return num_rows_before_header



def xls_to_pandas(filepath, date_column, encoding_override='iso-8859-1'):
    num_rows_to_skip = how_many_xls_rows_to_skip(filepath, date_column)
    wb = xlrd.open_workbook(filepath, encoding_override=encoding_override)

    sheet = None
    for name in wb.sheet_names():
        if 'voc' in name.lower():
            sheet = wb[name]
            break
        if 'data' in name.lower():
            sheet = wb[name]
            break

    for _ in range(num_rows_to_skip):
        next(sheet.iter_rows())

    header_row = None
    for row_idx in range(num_rows_to_skip, sheet.nrows):
        row = sheet.row_values(row_idx)
        if date_column in row:
            header_row = row
            num_rows_to_skip = row_idx
            break

    date_column_index = header_row.index(date_column)

    parsed_data = []
    for row_idx in range(num_rows_to_skip + 1, sheet.nrows):
        row = sheet.row_values(row_idx)
        row[date_column_index] = xlrd.xldate_as_datetime(row[date_column_index], wb.datemode)
        parsed_data.append(row)

    return pd.DataFrame(parsed_data, columns=header_row)


In [40]:
how_many_xlsx_rows_to_skip('data/2020/S040203_VOC_2020_EN.xlsx', 'Sampling Date')

8

In [43]:
xlsx_to_pandas('data/2020/S040203_VOC_2020_EN.xlsx', 'Sampling Date').head()

Unnamed: 0,NAPS ID,Sampling Date,Sample Type,Ethylene,Ethylene-MDL,Ethylene-VFlag,Acetylene,Acetylene-MDL,Acetylene-VFlag,Ethane,...,"1,2,4-Trichlorobenzene-VFlag",Naphthalene,Naphthalene-MDL,Naphthalene-VFlag,Dodecane,Dodecane-MDL,Dodecane-VFlag,Hexachlorobutadiene,Hexachlorobutadiene-MDL,Hexachlorobutadiene-VFlag
0,40203,2020-01-04,R,1.865868,0.03,,0.897244,0.01,,6.249422,...,,0.114885,0.07,,0.030088,0.1,,0.004959,0.06,
1,40203,2020-01-10,R,0.974177,0.03,,0.524514,0.01,,4.983609,...,,0.030256,0.07,,0.024327,0.1,,0.003523,0.06,
2,40203,2020-01-16,R,-999.0,-999.0,M1,-999.0,-999.0,M1,-999.0,...,,0.006,0.07,,0.006,0.1,,0.002,0.06,
3,40203,2020-01-22,R,3.704954,0.03,,1.157006,0.01,,4.090592,...,,0.141301,0.07,,0.0345,0.1,,0.004811,0.06,
4,40203,2020-01-28,R,-999.0,-999.0,M1,-999.0,-999.0,M1,-999.0,...,,0.006,0.07,,0.004,0.1,,0.004,0.06,


In [49]:
import os

n_errors, n_files, n_xls, n_xlsx, n_csv = 0, 0, 0, 0, 0
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for root, dirs, files in os.walk('data'):

        list_of_bullshits = list()
        for filename in files:
            n_files += 1
            filepath = os.path.join(root, filename)
            extension = filepath.split('.')[-1].lower()
            name = filepath.split('.')[0].lower()
            if name[-2:].lower() == 'fr':
                continue
            else:
                try:
                    if extension == 'xls':
                        # assert Poo
                        xls_to_pandas(filepath, 'Sampling Date')
                        n_xls += 1
                    elif extension == 'xlsx':
                        # assert Poo
                        xlsx_to_pandas(filepath, 'Sampling Date')
                        n_xlsx += 1
                    else:
                        csv_to_pandas(filepath, 'Compounds', 0)
                        n_csv += 1
                except:
                    n_errors += 1
                    list_of_bullshits.append(filepath)

print(n_errors, n_files, n_xls, n_xlsx, n_csv)

*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
*** No CODEPAGE record, no encoding_override: will use 'iso-8859-1'
*** No CODEPAGE record, no encoding_override: wi

In [48]:
xls_to_pandas('data/2017/S040208_VOC_2017_EN.XLS', 'Sampling Date').head()

Unnamed: 0,Sampling Date,NAPS ID,Ethane (ug/m3),Ethylene (ug/m3),Acetylene (ug/m3),Propylene (ug/m3),Propane (ug/m3),1-Propyne (ug/m3),Isobutane (ug/m3),1-Butene/Isobutene (ug/m3),...,Chlorobenzene (ug/m3),Benzylchloride (ug/m3),Bromoform (ug/m3),"1,4-Dichlorobutane (ug/m3)","1,1,2,2-Tetrachloroethane (ug/m3)","1,3-Dichlorobenzene (ug/m3)","1,4-Dichlorobenzene (ug/m3)","1,2-Dichlorobenzene (ug/m3)","1,2,4-Trichlorobenzene (ug/m3)",Hexachlorobutadiene (ug/m3)
0,2017-01-01,40208.0,2.95043,0.606088,0.420629,0.565073,3.741214,,4.044793,0.284199,...,0.009577,0.0,0.023816,,0.0,0.001195,0.005601,0.001774,0.00212,0.00109
1,2017-01-04,40208.0,2.893582,0.410445,0.333625,1.308298,4.864386,,6.613508,0.275564,...,0.009855,0.0,0.028697,,0.0,0.005166,0.006942,0.006777,0.019944,0.006841
2,2017-01-07,40208.0,2.167273,0.677312,0.495787,0.180252,1.817414,,0.575579,0.096385,...,0.009039,0.00159,0.02471,,0.001594,0.002518,0.005408,0.003526,0.008632,0.003423
3,2017-01-10,40208.0,3.24373,1.859604,0.839982,0.742873,4.153393,,8.230397,3.333283,...,0.007848,0.0,0.028303,,0.0,0.002747,0.007298,0.00307,0.006603,0.001875
4,2017-01-13,40208.0,4.198657,0.892718,0.373474,0.936348,5.91678,,11.589669,2.272418,...,0.008777,0.0,0.021763,,0.0,0.002045,0.006647,0.002395,0.004134,0.001317


In [15]:
csv_to_pandas(r'data\2005\S100111_CSV.csv', 'Compounds', 0).head()

Unnamed: 0,Compounds,Ethane,Ethylene,Acetylene,Propylene,Propane,1-Propyne,Isobutane,1-Butene/Isobutene,"1,3-Butadiene",...,Hexanal,"2,5-Dimethylbenzaldehyde",Unnamed: 14,Sample ID#,Sample Date,Canister ID#,Sample Volume,NAPS ID,START TIME,DURATION
0,2005-01-04,5.995573093,8.586041239,5.415266001,3.5576,16.2875,0.3977,14.7718,2.5781,0.6067,...,,,,va37y.d,1/4/05,EPS 019,500,100111,00:00,24
1,2005-01-16,4.748603311,5.727944894,3.326608925,2.1433,7.7279,0.2335,8.2115,1.5817,0.3761,...,,,,va50y.d,1/16/05,EPS 332,500,100111,00:00,24
2,2005-01-22,1.675851128,1.618460354,1.476540408,0.6944,2.897,0.0703,2.4787,0.4901,0.1079,...,,,,va51y.d,1/22/05,EPS 193,500,100111,00:00,24
3,2005-01-28,3.740149222,4.561265906,3.343004628,1.6539,8.8572,0.1614,8.3509,1.123,0.2444,...,,,,va69y.d,1/28/05,EPS 385,500,100111,00:00,24
4,2005-02-09,4.208300873,5.750098427,4.190533058,2.3232,11.664,0.2408,13.0853,1.9264,0.3362,...,,,,va70y.d,2/9/05,EPS 092,500,100111,00:00,24


In [16]:
csv_to_pandas('data\ddmmyyyy.csv', 'dates', 0).head()

Unnamed: 0,dates,otherstuff
0,2024-01-01,0.7296
1,2024-02-01,0.52499
2,2024-03-01,0.41871
3,2024-04-01,0.28152
4,2024-05-01,0.5455


In [10]:
filepath = os.path.join(root, filename)
filepath.split('.')[-1]

'xlsx'

### ugly solution

In [32]:
# relevant imports
import warnings
warnings.filterwarnings("error")

def parse_datetime_format(csv_path, date_column):
    dataframe = pd.read_csv(csv_path)
    dayfirst, yearfirst = False, False
    try:
        pd.to_datetime(dataframe[date_column])
        dayfirst, yearfirst = False, False
    except:
        try:
            pd.to_datetime(dataframe[date_column], dayfirst=True)
            dayfirst, yearfirst = True, False
        except:
            try:
                pd.to_datetime(dataframe[date_column], yearfirst=True)
                dayfirst, yearfirst = False, True
            except:
                pd.to_datetime(dataframe[date_column], dayfirst=True, yearfirst=True)
                dayfirst, yearfirst = True, True
                
    return dayfirst, yearfirst

In [35]:
for csv in ["data/yyyymmdd.csv", "data/yyyyddmm.csv", "data/mmddyyyy.csv", "data/ddmmyyyy.csv"]:
    print(parse_datetime_format(csv, 'dates'))

(False, False)
(True, False)
(False, False)
(True, False)


### dateutil helper function
- doesn't work, but probably could after reading the documentation and a bunch of bullcrap

In [53]:
# relevant imports
from dateutil.parser import parse

def determine_date_format(date_str):
    tokens = date_str.split()
    date_parts = tokens[0].split('-')
    format_str = []

    for part in date_parts:
        if len(part) == 4:
            format_str.append('%Y')
        elif len(part) == 2:
            format_str.append('%m' if int(part) <= 12 else '%d')

    if len(tokens) > 1:
        format_str.append('%H:%M:%S')

    return '-'.join(format_str)

def find_date_format_pattern(file_path):
    df = pd.read_csv(file_path)

    for col in df.columns:
        try:
            # Try to parse the first date in the column
            sample_date = parse(str(df[col].dropna().iloc[0]), fuzzy=True)
            # If successful, assume this is the date column and proceed
            break
        except (ValueError, IndexError):
            continue
    else:
        return "No date column found"

    # Check if all dates in the column match the format
    for date in df[col].dropna():
        try:
            parse(str(date), fuzzy=True)
        except ValueError:
            return f"Dates in column '{col}' do not follow a consistent pattern"

    return determine_date_format(str(sample_date))


In [54]:
file_path = "data/yyyymmdd.csv"
print(find_date_format_pattern(file_path))

%Y-%m-%m-%H:%M:%S


In [56]:
# relevant imports
from dateutil.parser import parse

file_path = "data/yyyymmdd.csv"
df = pd.read_csv(file_path)
for date in df['dates']:
    # find the parse dates format that works for all dates in the entire column
    parse(str(date), fuzzy=True)

In [61]:
def get_date_format(file_path, column_name):
    df = pd.read_csv(file_path)

    def infer_format(date_obj):
        format_parts = []
        if date_obj.year:
            format_parts.append('%Y')
        if date_obj.month:
            format_parts.append('%m')
        if date_obj.day:
            format_parts.append('%d')
        if any([date_obj.hour, date_obj.minute, date_obj.second]):
            format_parts.extend(['%H', '%M', '%S'])
        return '-'.join(format_parts[:3]) + ' ' + ':'.join(format_parts[3:]).strip()

    formats = set()
    for date_str in df[column_name]:
        try:
            date_obj = parse(str(date_str), fuzzy=True)
            format_str = infer_format(date_obj)
            formats.add(format_str)
            if len(formats) > 1:
                return "Inconsistent date formats"
        except ValueError:
            return "Invalid date found"

    return formats.pop() if formats else "No valid dates found"

In [62]:
for file in ["data/yyyymmdd.csv", "data/yyyyddmm.csv", "data/mmddyyyy.csv", "data/ddmmyyyy.csv"]:
    print(file, get_date_format(file, 'dates'))

# no

data/yyyymmdd.csv %Y-%m-%d 
data/yyyyddmm.csv Invalid date found
data/mmddyyyy.csv %Y-%m-%d 
data/ddmmyyyy.csv %Y-%m-%d 


### regex

In [64]:
import re

def date_to_regex(date):
    regex_pattern = date
    regex_pattern = re.sub(r'\b\d{4}\b', r'\\d{4}', regex_pattern)  # Year
    regex_pattern = re.sub(r'\b\d{1,2}\b', r'\\d{1,2}', regex_pattern)  # Month and Day
    return regex_pattern

def find_common_regex_pattern(file_path, column_name):
    df = pd.read_csv(file_path)
    patterns = set(df[column_name].apply(date_to_regex))

    # Simple approach: Find the longest common substring among all patterns
    # This part can be complex based on the diversity of date formats
    common_pattern = min(patterns, key=len)  # Starting with the shortest pattern
    for pattern in patterns:
        if common_pattern not in pattern:
            return "No common pattern found"
    
    return common_pattern


print(find_common_regex_pattern("data/ddmmyyyy.csv", 'dates'))
print(find_common_regex_pattern("data/mmddyyyy.csv", 'dates'))
print(find_common_regex_pattern("data/yyyyddmm.csv", 'dates'))
print(find_common_regex_pattern("data/yyyymmdd.csv", 'dates'))

\d{1,2}/\d{1,2}/\d{\d{1,2}}
\d{1,2}/\d{1,2}/\d{\d{1,2}}
\d{\d{1,2}}/\d{1,2}/\d{1,2}
\d{\d{1,2}}/\d{1,2}/\d{1,2}
