In [23]:
import pandas as pd
import numpy as np

### generate data

In [9]:
dates = pd.date_range(start='2024-01-01', end='2024-12-31')
random_numbers = np.round(np.random.rand(len(dates)),5)
date_formats = {
    'yyyymmdd': '%Y/%m/%d',
    'yyyyddmm': '%Y/%d/%m',
    'mmddyyyy': '%m/%d/%Y',
    'ddmmyyyy': '%d/%m/%Y',
}

for name, datetype in date_formats.items():
    df = pd.DataFrame({'dates': dates, 'otherstuff': random_numbers})
    df['dates'] = df['dates'].dt.strftime(datetype)
    df.to_csv(f'data/{name}.csv', index=False)

    with pd.ExcelWriter(f'data/{name}.xlsx', date_format=datetype, datetime_format=datetype) as writer:
        df.to_excel(writer, index=False)

### parse that data

In [10]:
list(date_formats.keys())[0]

'yyyymmdd'

In [41]:
dfs_csv = dict()
for datetype in date_formats:
    dfs_csv[datetype] = pd.read_csv(f'data/{datetype}.csv')
    display(dfs_csv[datetype].dtypes)
    print(datetype)

dates          object
otherstuff    float64
dtype: object

yyyymmdd


dates          object
otherstuff    float64
dtype: object

yyyyddmm


dates          object
otherstuff    float64
dtype: object

mmddyyyy


dates          object
otherstuff    float64
dtype: object

ddmmyyyy


### ugly solution

In [47]:
# relevant imports
import warnings
warnings.filterwarnings("error")

datetype = list(date_formats.keys())[1]
try:
    print('default')
    pd.to_datetime(dfs_csv[datetype]['dates'])
except:
    try:
        print('dayfirst')
        pd.to_datetime(dfs_csv[datetype]['dates'], dayfirst=True)
    except:
        try:
            print('yearfirst')
            pd.to_datetime(dfs_csv[datetype]['dates'], yearfirst=True)
        except:
            print('dayfirst and yearfirst')
            pd.to_datetime(dfs_csv[datetype]['dates'], dayfirst=True, yearfirst=True)

default
dayfirst


### dateutil helper function
- doesn't work, but probably could after reading the documentation and a bunch of bullcrap

In [53]:
# relevant imports
from dateutil.parser import parse

def determine_date_format(date_str):
    tokens = date_str.split()
    date_parts = tokens[0].split('-')
    format_str = []

    for part in date_parts:
        if len(part) == 4:
            format_str.append('%Y')
        elif len(part) == 2:
            format_str.append('%m' if int(part) <= 12 else '%d')

    if len(tokens) > 1:
        format_str.append('%H:%M:%S')

    return '-'.join(format_str)

def find_date_format_pattern(file_path):
    df = pd.read_csv(file_path)

    for col in df.columns:
        try:
            # Try to parse the first date in the column
            sample_date = parse(str(df[col].dropna().iloc[0]), fuzzy=True)
            # If successful, assume this is the date column and proceed
            break
        except (ValueError, IndexError):
            continue
    else:
        return "No date column found"

    # Check if all dates in the column match the format
    for date in df[col].dropna():
        try:
            parse(str(date), fuzzy=True)
        except ValueError:
            return f"Dates in column '{col}' do not follow a consistent pattern"

    return determine_date_format(str(sample_date))


In [54]:
file_path = "data/yyyymmdd.csv"
print(find_date_format_pattern(file_path))

%Y-%m-%m-%H:%M:%S


In [56]:
# relevant imports
from dateutil.parser import parse

file_path = "data/yyyymmdd.csv"
df = pd.read_csv(file_path)
for date in df['dates']:
    # find the parse dates format that works for all dates in the entire column
    parse(str(date), fuzzy=True)

In [61]:
def get_date_format(file_path, column_name):
    df = pd.read_csv(file_path)

    def infer_format(date_obj):
        format_parts = []
        if date_obj.year:
            format_parts.append('%Y')
        if date_obj.month:
            format_parts.append('%m')
        if date_obj.day:
            format_parts.append('%d')
        if any([date_obj.hour, date_obj.minute, date_obj.second]):
            format_parts.extend(['%H', '%M', '%S'])
        return '-'.join(format_parts[:3]) + ' ' + ':'.join(format_parts[3:]).strip()

    formats = set()
    for date_str in df[column_name]:
        try:
            date_obj = parse(str(date_str), fuzzy=True)
            format_str = infer_format(date_obj)
            formats.add(format_str)
            if len(formats) > 1:
                return "Inconsistent date formats"
        except ValueError:
            return "Invalid date found"

    return formats.pop() if formats else "No valid dates found"

In [62]:
for file in ["data/yyyymmdd.csv", "data/yyyyddmm.csv", "data/mmddyyyy.csv", "data/ddmmyyyy.csv"]:
    print(file, get_date_format(file, 'dates'))

# no

data/yyyymmdd.csv %Y-%m-%d 
data/yyyyddmm.csv Invalid date found
data/mmddyyyy.csv %Y-%m-%d 
data/ddmmyyyy.csv %Y-%m-%d 


### regex

In [64]:
import re

def date_to_regex(date):
    regex_pattern = date
    regex_pattern = re.sub(r'\b\d{4}\b', r'\\d{4}', regex_pattern)  # Year
    regex_pattern = re.sub(r'\b\d{1,2}\b', r'\\d{1,2}', regex_pattern)  # Month and Day
    return regex_pattern

def find_common_regex_pattern(file_path, column_name):
    df = pd.read_csv(file_path)
    patterns = set(df[column_name].apply(date_to_regex))

    # Simple approach: Find the longest common substring among all patterns
    # This part can be complex based on the diversity of date formats
    common_pattern = min(patterns, key=len)  # Starting with the shortest pattern
    for pattern in patterns:
        if common_pattern not in pattern:
            return "No common pattern found"
    
    return common_pattern


print(find_common_regex_pattern("data/ddmmyyyy.csv", 'dates'))
print(find_common_regex_pattern("data/mmddyyyy.csv", 'dates'))
print(find_common_regex_pattern("data/yyyyddmm.csv", 'dates'))
print(find_common_regex_pattern("data/yyyymmdd.csv", 'dates'))

\d{1,2}/\d{1,2}/\d{\d{1,2}}
\d{1,2}/\d{1,2}/\d{\d{1,2}}
\d{\d{1,2}}/\d{1,2}/\d{1,2}
\d{\d{1,2}}/\d{1,2}/\d{1,2}
