In [1]:
import re
import unittest
import os
from typing import Union, Iterable, Dict, List
from datetime import datetime
import dateutil.parser as dparser

In [75]:
def convert_timeformat(time_format : str) -> str:
    # Pattern to find and replace date components
    patterns = {
        r'%Y{2,4}': '%Y',  # Year
        r'%m{1,2}': '%m',  # Month
        r'%d{1,2}': '%d',  # Day
        r'%H{1,2}': '%H',  # Hour
        r'%M{1,2}': '%M',  # Minute
        r'%S{1,2}': '%S'   # Second
    }
    
    # Perform the replacements
    for pattern, replacement in patterns.items():
        time_format = re.sub(pattern, replacement, time_format)
    
    return time_format

def create_regex_from_format(format_string):
    # Define the mapping from format directive to regex pattern
    format_mappings = {
        "%Y": r"(?P<Y>\d{4})",
        "%m": r"(?P<m>\d{2})",
        "%d": r"(?P<d>\d{2})",
        "%H": r"(?P<H>\d{2})",
        "%M": r"(?P<M>\d{2})",
        "%S": r"(?P<S>\d{2})"
    }
    
    # Escape characters that are not format directives
    escaped_format = re.escape(format_string)
    
    # Replace format directives with corresponding regex patterns
    for directive, pattern in format_mappings.items():
        escaped_format = escaped_format.replace(re.escape(directive), pattern)
    
    return escaped_format

def parse_time_string(time_string, format_string):
    # Create regex pattern from format string
    format_string = convert_timeformat(format_string)
    regex_pattern = create_regex_from_format(format_string)
    
    # Match the time string with the regex pattern
    match = re.search(regex_pattern, time_string)
    if match:
        time_elements = match.groupdict()
        print(time_elements, match.group(0))
        try:
            return datetime.strptime(match.group(0), format_string)
        except ValueError as e:
            raise ValueError(f"Time string '{time_string}' does not match format '{format_string}'")
    
    raise ValueError(f"Time string '{time_string}' does not match the pattern derived from format '{format_string}'")

def search_and_convert_timestamp(regex_pattern, image_path, from_format, to_format):
    match = re.search(regex_pattern, image_path)
    if match:
        original_datetime = datetime.strptime(match.group(0), from_format)
        target_datetime_str = original_datetime.strftime(to_format)
        return target_datetime_str
    else:
        raise ValueError(f"Time string '{image_path}' does not match the pattern derived from format '{from_format}'")

# Time stamp extraction
def get_timestamp(image_path : Union[str, Iterable[str]], time_format : str, default_time_format : str = "%Y-%m-%d %H:%M:%S") -> str:
    if not isinstance(image_path, str) and not all(map(lambda p : isinstance(p, str), image_path)):
        raise TypeError(f"`image_path` must be a string, got {type(image_path)}")
    if not isinstance(time_format, str):
        raise TypeError(f"`time_format` must be a string, got {type(time_format)}")

    time_format =convert_timeformat(time_format)
    regex_pattern = create_regex_from_format(time_format)

    if isinstance(image_path, str):
        return search_and_convert_timestamp(regex_pattern, image_path, time_format, default_time_format)
    elif isinstance(image_path, (list, tuple)):
        return [search_and_convert_timestamp(regex_pattern, p, time_format, default_time_format) for p in image_path]
    else:
        raise TypeError(f"`image_path` must be a string, list or tuple, got {type(image_path)}")


def format_timestamp(time_parts : str, time_format : str = "{}-{}-{} {}:{}:{}") -> str:
    return time_format.format(*time_parts)

# Example usage
time_format = "-%m-%d:%H-%M-%S-%YY"
time_strings = [
    "i2023-07-08:12-30-45",
    "2024-01-01_00-00-00",
    "2024-01-01:00-00-00foiehfoern"
    "-01-01:00-00-00-2024 foiehfoern"
]

for ts in time_strings:
    try:
        # dt = parse_time_string(ts, time_format)
        dt = get_timestamp(ts, time_format)
        print(f"Parsed datetime: {dt}, Format used: {time_format}")
    except ValueError as e:
        print(e)


try:
    dt = get_timestamp(time_strings, time_format)
except ValueError as e:
        print(e)
time_format

Time string 'i2023-07-08:12-30-45' does not match the pattern derived from format '-%m-%d:%H-%M-%S-%YY'
Time string '2024-01-01_00-00-00' does not match the pattern derived from format '-%m-%d:%H-%M-%S-%YY'
Parsed datetime: 2024-01-01 00:00:00, Format used: -%m-%d:%H-%M-%S-%YY
Time string 'i2023-07-08:12-30-45' does not match the pattern derived from format '-%m-%d:%H-%M-%S-%YY'


'-%m-%d:%H-%M-%S-%YY'

In [23]:
def parse_time_string_with_format(time_string, format_string):
    # Create regex pattern from format string
    regex_pattern = create_regex_from_format(format_string)
    
    # Match the time string with the regex pattern
    match = re.search(regex_pattern, time_string)
    if match:
        matched_string = match.group(0)  # Get the entire matched substring
        print(matched_string)
        try:
            return datetime.strptime(matched_string, format_string)
        except ValueError as e:
            raise ValueError(f"Time string '{time_string}' does not match format '{format_string}'")
    
    raise ValueError(f"No valid datetime found in '{time_string}' matching the pattern derived from format '{format_string}'")

def create_regex_from_format(format_string):
    # Define the mapping from format directive to regex pattern
    format_mappings = {
        "%Y": r"\d{4}",
        "%m": r"\d{2}",
        "%d": r"\d{2}",
        "%H": r"\d{2}",
        "%M": r"\d{2}",
        "%S": r"\d{2}"
    }
    
    # Escape characters that are not format directives
    escaped_format = re.escape(format_string)
    
    # Replace format directives with corresponding regex patterns
    for directive, pattern in format_mappings.items():
        escaped_format = escaped_format.replace(re.escape(directive), pattern)
    
    # Match any character before and after the datetime pattern
    regex_pattern = rf".*({escaped_format}).*"
    
    return regex_pattern

# Example usage
time_format = "%Y-%m-%d_%H-%M-%S"
time_strings = [
    "Prefix-2023-07-08_12-30-45-Suffix",
    "2024-01-01_00-00-00-with_prefix",
    "Some_text_2023-07-08_12-30-45_More_text",
    "2024-01-01_00-00-00"
]

for ts in time_strings:
    try:
        dt = parse_time_string_with_format(ts, time_format)
        print(f"Parsed datetime: {dt}, Format used: {time_format}")
    except ValueError as e:
        print(e)

Prefix-2023-07-08_12-30-45-Suffix
Time string 'Prefix-2023-07-08_12-30-45-Suffix' does not match format '%Y-%m-%d_%H-%M-%S'
2024-01-01_00-00-00-with_prefix
Time string '2024-01-01_00-00-00-with_prefix' does not match format '%Y-%m-%d_%H-%M-%S'
Some_text_2023-07-08_12-30-45_More_text
Time string 'Some_text_2023-07-08_12-30-45_More_text' does not match format '%Y-%m-%d_%H-%M-%S'
2024-01-01_00-00-00
Parsed datetime: 2024-01-01 00:00:00, Format used: %Y-%m-%d_%H-%M-%S


In [12]:
def get_timestamp(image_path: Union[str, Iterable[str]], time_format: str) -> Union[str, List[str]]:
    if not isinstance(image_path, str) and not all(isinstance(p, str) for p in image_path):
        raise TypeError(f"`image_path` must be a string or an iterable of strings, got {type(image_path)}")
    if not isinstance(time_format, str):
        raise TypeError(f"`time_format` must be a string, got {type(time_format)}")
    
    # Extract format specifiers
    time_regex_parts = re.findall(r'%[a-zA-Z]', time_format)
    
    # Map format specifiers to their lengths
    spec_lengths = {
        '%Y': 4, '%m': 2, '%d': 2,
        '%H': 2, '%M': 2, '%S': 2
    }
    
    # Create the regex pattern to extract the timestamp components
    time_regex_format = time_format
    for spec, length in spec_lengths.items():
        time_regex_format = time_regex_format.replace(spec, fr'(\d{{{length}}})')
    
    predefined_order = {"%Y": 0, "%m": 1, "%d": 2, "%H": 3, "%M": 4, "%S": 5}
    defaults = {"%Y": "0000", "%m": "00", "%d": "00", "%H": "00", "%M": "00", "%S": "00"}
    reorder = {i: predefined_order[k] for i, k in enumerate(time_regex_parts)}
    
    if isinstance(image_path, str):
        time_parts = get_matches_in_order(image_path, time_regex_format, reorder, defaults)
        return format_timestamp(time_parts)
    elif isinstance(image_path, (list, tuple)):
        time_parts = [get_matches_in_order(path, time_regex_format, reorder, defaults) for path in image_path]
        return [format_timestamp(parts) for parts in time_parts]
    else:
        raise TypeError(f"`image_path` must be a string, list or tuple, got {type(image_path)}")

def get_matches_in_order(image_path: str, time_regex: str, reorder: Dict[int, int], default_values: Dict[int, str]) -> List[str]:
    if not isinstance(image_path, str):
        raise TypeError(f"`image_path` must be a string, got {type(image_path)}")
    if not isinstance(time_regex, str):
        raise TypeError(f"`time_regex` must be a string, got {type(time_regex)}")
    if not isinstance(reorder, dict):
        raise TypeError(f"`reorder` must be a dictionary, got {type(reorder)}")
    if not isinstance(default_values, dict):
        raise TypeError(f"`default_values` must be a dictionary, got {type(default_values)}")
    
    image_path = os.path.basename(image_path)
    matches = re.search(time_regex, image_path)
    if matches is None:
        raise ValueError(f"No matches found in '{image_path}' using '{time_regex}'")
    
    matches = matches.groups()
    ordered_matches = {reorder[i]: matches[i] for i in range(len(matches))}
    for i, default in default_values.items():
        if i not in ordered_matches:
            ordered_matches[i] = default
    ordered_matches = [ordered_matches[i] for i in range(len(default_values))]
    return ordered_matches

def format_timestamp(time_parts: List[str], time_format: str = "{}-{}-{} {}:{}:{}") -> str:
    return time_format.format(*time_parts)


In [16]:
class TestGetTimestamp(unittest.TestCase):
    def test_basic_format(self):
        self.assertEqual(get_timestamp("image_20230705123045.jpg", "%Y%m%d%H%M%S"), "2023-07-05 12:30:45")
        self.assertEqual(get_timestamp("photo_2021-12-31_23-59-59.png", "%Y-%m-%d_%H-%M-%S"), "2021-12-31 23:59:59")
    
    def test_multiple_paths(self):
        paths = ["image_20230705123045.jpg", "image_20230705123047.jpg"]
        format_str = "%Y%m%d%H%M%S"
        expected = ["2023-07-05 12:30:45", "2023-07-05 12:30:47"]
        self.assertEqual(get_timestamp(paths, format_str), expected)

    def test_edge_cases(self):
        self.assertRaises(ValueError, get_timestamp, "image_no_date.jpg", "%Y-%m-%d")

    def test_invalid_input(self):
        self.assertRaises(TypeError, get_timestamp, 123, "%Y%m%d%H%M%S")
        self.assertRaises(TypeError, get_timestamp, "image_20230705123045.jpg", 123)
        self.assertRaises(TypeError, get_timestamp, ["image_20230705123045.jpg", 123], "%Y%m%d%H%M%S")

unittest.main(argv=['first-arg-is-ignored'], exit=False)

....
----------------------------------------------------------------------
Ran 4 tests in 0.005s

OK


<unittest.main.TestProgram at 0x7efea42edfd0>

In [45]:
def convert_timeformat(time_format : str) -> str:
    # Pattern to find and replace date components
    patterns = {
        r'%Y{2,4}': '%Y',  # Year
        r'%m{1,2}': '%m',  # Month
        r'%d{1,2}': '%d',  # Day
        r'%H{1,2}': '%H',  # Hour
        r'%M{1,2}': '%M',  # Minute
        r'%S{1,2}': '%S'   # Second
    }
    
    # Perform the replacements
    for pattern, replacement in patterns.items():
        time_format = re.sub(pattern, replacement, time_format)
    
    return time_format

def format_timestamp(time_parts : str, time_format : str = "{}-{}-{} {}:{}:{}") -> str:
    return time_format.format(*time_parts)

def get_timestamp(image_path : Union[str, Iterable[str]], time_format : str) -> str:
    if not isinstance(image_path, str) and not all(map(lambda p : isinstance(p, str), image_path)):
        raise TypeError(f"`image_path` must be a string, got {type(image_path)}")
    if not isinstance(time_format, str):
        raise TypeError(f"`time_format` must be a string, got {type(time_format)}")

    time_format = convert_timeformat(time_format)
    time_regex_parts = re.findall(r'(%([a-zA-Z])+)', time_format)
    time_regex_parts = {s : len(part) - 1 for part, s in time_regex_parts}
    # time_regex_parts = {"Y" : 4, "m" : 2, "d" : 2, "H" : 2, "M" : 2, "S" : 2}
    time_regex = create_regex_from_format(time_format)
    print(time)
    predefined_order = {"Y" : 0, "m" : 1, "d" : 2, "H" : 3, "M" : 4, "S" : 5}
    defaults = {"Y" : "0000", "m" : "00", "d" : "00", "H" : "00", "M" : "00", "S" : "00"}
    defaults = {predefined_order[k] : v for k, v in defaults.items()}
    reorder = {i : predefined_order[k] for i, k in enumerate(time_regex_parts.keys())}
    print(reorder)

    if isinstance(image_path, str):
        time_parts = get_matches_in_order(image_path, time_regex, reorder, defaults)
        return format_timestamp(time_parts)
    elif isinstance(image_path, (list, tuple)):
        time_parts = [get_matches_in_order(path, time_regex, reorder, defaults) for path in image_path]
        return [format_timestamp(parts) for parts in time_parts]
    else:
        raise TypeError(f"`image_path` must be a string, list or tuple, got {type(image_path)}")
    
def get_matches_in_order(image_path : str, time_regex : str, reorder : Dict[int, int], default_values : Dict[int, str]) -> str:
    if not isinstance(image_path, str):
        raise TypeError(f"`image_path` must be a string, got {type(image_path)}")
    if not isinstance(time_regex, str):
        raise TypeError(f"`time_regex` must be a string, got {type(time_regex)}")
    if not isinstance(reorder, dict):
        raise TypeError(f"`reorder` must be a dictionary, got {type(reorder)}")
    if not isinstance(default_values, dict):
        raise TypeError(f"`default_values` must be a dictionary, got {type(default_values)}")
    image_path = os.path.basename(image_path)
    matches = re.search(time_regex, image_path)
    if matches is None:
        raise ValueError(f"No matches found in '{image_path}' using '{time_regex}'")
    matches = matches.groups()
    matches = {i : matches[i] for i in reorder.values()}
    for i, default in default_values.items():
        if i not in matches:
            matches[i] = default
    matches = [matches[i] for i in range(len(matches))]
    return matches



get_timestamp("photo_2021-12-31_23-59-59.png", "%YY-%m-%d_%H-%M-%S"), get_timestamp("photo_-12-31_23-59-59-2021.png", "-%m-%d_%H-%M-%S-%Y")

NameError: name 'time' is not defined

In [36]:
convert_timeformat("-%mm-%d_%H-%M-%S-%YYYY")

'-%m-%d_%H-%M-%S-%Y'

In [29]:
datetime.strptime("photo_2021-12-31_23-59-59.png", "%Y-%m-%d_%H-%M-%S")

ValueError: time data 'photo_2021-12-31_23-59-59.png' does not match format '%Y-%m-%d_%H-%M-%S'

In [2]:
time_format="%Y%m%d%H%M%S"

In [5]:
time_regex_parts = re.findall(r'(%[a-zA-Z])', time_format)
time_regex_parts

['%Y', '%m', '%d', '%H', '%M', '%S']

In [18]:
pattern = r"(?<!\d)\d{14}(?!\d)"

string = "metadata_crop_202404082345491-00-07_CROPNUMBER_3_UUID_ChangeThisTEMPORARY_UUID_ChangeThisTEMPORARY"

search = re.search(pattern=pattern, string=string)
string[search.start():search.end()]

AttributeError: 'NoneType' object has no attribute 'start'

In [13]:
len("20240408234549")

14

In [15]:

text = "Here are some numbers: _12345678901234-, 5678901234567890, and 09876543210987."

# Find all 14-digit sequences
matches = re.findall(r'\b\d{14}\b', text)

print(matches)  # Output: ['12345678901234', '09876543210987']

['09876543210987']
