In [11]:
import pandas as pd
import re
import datetime

In [13]:
date = pd.read_csv("date_parser_testcases.csv")
date.head()

Unnamed: 0,Input,Expected Output
0,"The event will take place on March 5, 2023.",05/03/2023
1,Her birthday is on 07/08/1990.,07/08/1990
2,The deadline is 2022-12-31.,31/12/2022
3,We met on 1st of January 2000.,01/01/2000
4,"The concert is scheduled for 15th September, 2...",15/09/2021


In [14]:
def extract_date(text):
    # Define various date patterns
    month_mapping = {
        'January': '01', 'February': '02', 'March': '03', 'April': '04',
        'May': '05', 'June': '06', 'July': '07', 'August': '08',
        'September': '09', 'October': '10', 'November': '11', 'December': '12',
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
        'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
        'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
    }
    month_map = {v: k for k, v in month_mapping.items()}
    date_patterns = [
        r'\b(\d{1,2}/\d{1,2}/\d{4})\b',  # dd/mm/yyyy
        r'\b(\d{1,2}/\d{1,2}/\d{2})\b',  # dd/mm/yy
        r'\b(\d{4}/\d{1,2}/\d{1,2})\b',  # yyyy/mm/dd
        r'\b(\d{4}-\d{2}-\d{2})\b',  # yyyy-mm-dd
        r'\b(\d{1,2}-\d{1,2}-\d{4})\b',  # dd-mm-yyyy
        r'\b(\d{1,2}-\d{1,2}-\d{2})\b',  # dd-mm-yy
        r'\b(\d{1,2}\.\d{1,2}\.\d{4})\b', # dd.mm.yyyy
        r'\b(\d{1,2}\.\d{1,2}\.\d{2})\b', # dd.mm.yy
        r'\b(\d{4}\.\d{1,2}\.\d{1,2})\b', # yyyy.mm.dd
        r'(\d{1,2})(?:st|nd|rd|th)? (\w+) (\d{4})',  # e.g., 5th March 2023
        r'(\d{1,2})(?:st|nd|rd|th)? of (\w+) (\d{4})',  # e.g., 5th of March 2023
        r'(\w+) (\d{1,2}), (\d{4})',  # e.g., March 5, 2023
        r'(\w+) (\d{1,2})(?:st|nd|rd|th)?, (\d{4})',  # e.g., March 5th, 2023
        r'(\d{1,2})(?:st|nd|rd|th)? (\w{3}) (\d{4})',  # e.g., 5th Mar 2023    
    ]
    
    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            try:
                # Handle matched date strings based on detected pattern
                date_str = match.group(0)
                # Handle different patterns
                if '/' in date_str:  # dd/mm/yyyy, dd/mm/yy, or yyyy/mm/dd
                    day, month, year = date_str.split('/')
                
                    if int(day) < 32 and int(month) < 13: 
                        if int(year) > 1500:  # dd/mm/yyyy
                            date_obj = datetime.datetime.strptime(date_str, '%d/%m/%Y')
                        else:  # dd/mm/yy
                            date_obj = datetime.datetime.strptime(date_str, '%d/%m/%y')
                    
                    elif int(day) < 13 and int(month) > 12 and int(year) > 1500:
                        month, day, year = date_str.split('/')
                        date_obj = datetime.datetime.strptime(date_str, '%m/%d/%Y')
                    
                    else: 
                        year, month, day = date_str.split('/') # yyyy/mm/dd
                        date_obj = datetime.datetime.strptime(date_str, '%Y/%m/%d')
                
                elif '-' in date_str:  # dd-mm-yyyy, dd-mm-yy, or yyyy-mm-dd
                    day, month, year = date_str.split('-')
                    
                    if int(day) < 32 and int(month) < 13: 
                        if int(year) > 1500:  # dd-mm-yyyy
                            date_obj = datetime.datetime.strptime(date_str, '%d-%m-%Y')
                        else:  # dd-mm-yy
                            date_obj = datetime.datetime.strptime(date_str, '%d-%m-%y')
                    
                    elif int(day) < 13 and int(month) > 12 and int(year) > 1500:
                        month, day, year = date_str.split('-')
                        date_obj = datetime.datetime.strptime(date_str, '%m-%d-%Y')
                    
                    else: 
                        year, month, day = date_str.split('-') # yyyy-mm-dd
                        date_obj = datetime.datetime.strptime(date_str, '%Y-%m-%d')
                
                elif '.' in date_str:  # dd.mm.yyyy, dd.mm.yy, or yyyy.mm.dd
                    day, month, year = date_str.split('.')
                    if int(day) < 32 and int(month) < 13: 
                        if int(year) > 1500:  # dd.mm.yyyy
                            date_obj = datetime.datetime.strptime(date_str, '%d.%m.%Y')
                        else:  # dd.mm.yy
                            date_obj = datetime.datetime.strptime(date_str, '%d.%m.%y')
                    elif int(day) < 13 and int(month) > 12 and int(year) > 1500:
                        month, day, year = date_str.split('.')
                        date_obj = datetime.datetime.strptime(date_str, '%m.%d.%Y')
                    else: 
                        year, month, day = date_str.split('.') # yyyy.mm.dd
                        date_obj = datetime.datetime.strptime(date_str, '%Y.%m.%d')
                
                else:  # Textual format
                    day, month_str, year = match.groups()
                    month_str = month_str.capitalize()  # Ensure month names are properly capitalized
                    month = month_mapping.get(month_str)
                    day = re.sub(r'(st|nd|rd|th)', '', day).strip()  # Remove ordinal suffixes
                    
                    # Check if day and month are valid
                    if int(day) < 32 and month:
                        date_str = f"{day.zfill(2)}/{month}/{year}"
                        date_obj = datetime.datetime.strptime(date_str, '%d/%m/%Y')

                if 'date_obj' in locals():  # Ensure date_obj is defined
                    return date_obj.strftime('%d/%m/%y')
            except ValueError:
                continue  # Handle invalid date formats

    # Additional logic if no date was extracted
    year_match = re.search(r'\b(20\d{2})\b', text)
    year = year_match.group(1) if year_match else None

    month_match = re.search(r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\b', text)
    month = month_mapping[month_match.group(0)] if month_match else None

    date_match = re.search(r'\b(\d{1,2})(?:st|nd|rd|th)?\b', text)
    day = date_match.group(1).zfill(2) if date_match else None

    if day and month and year:
        return f"{day}/{month}/{year[2:]}"
    
    return None


date['Extracted Date'] = date['Input'].apply(extract_date)

date.head(20)


Unnamed: 0,Input,Expected Output,Extracted Date
0,"The event will take place on March 5, 2023.",05/03/2023,05/03/23
1,Her birthday is on 07/08/1990.,07/08/1990,07/08/90
2,The deadline is 2022-12-31.,31/12/2022,31/12/22
3,We met on 1st of January 2000.,01/01/2000,01/01/00
4,"The concert is scheduled for 15th September, 2...",15/09/2021,15/09/21
5,Let's catch up on 02.04.2022.,02/04/2022,02/04/22
6,The project started on 5/6/19.,05/06/2019,05/06/19
7,He was born on 1987/11/23.,23/11/1987,23/11/87
8,Christmas is on 25th Dec 2024.,25/12/2024,25/12/24
9,"The meeting is set for April 03, 2020.",03/04/2020,03/04/20


In [15]:
rows = date[date['Extracted Date'].isna()]
print(rows)

    Input  Expected Output Extracted Date
50  Input  Expected Output           None


In [16]:
date = date.drop(50).reset_index(drop=True)
date['Extracted Date'].isna().sum()

0

In [17]:
date['Match'] = date['Expected Output'] == date['Extracted Date']
date.head(20)

Unnamed: 0,Input,Expected Output,Extracted Date,Match
0,"The event will take place on March 5, 2023.",05/03/2023,05/03/23,False
1,Her birthday is on 07/08/1990.,07/08/1990,07/08/90,False
2,The deadline is 2022-12-31.,31/12/2022,31/12/22,False
3,We met on 1st of January 2000.,01/01/2000,01/01/00,False
4,"The concert is scheduled for 15th September, 2...",15/09/2021,15/09/21,False
5,Let's catch up on 02.04.2022.,02/04/2022,02/04/22,False
6,The project started on 5/6/19.,05/06/2019,05/06/19,False
7,He was born on 1987/11/23.,23/11/1987,23/11/87,False
8,Christmas is on 25th Dec 2024.,25/12/2024,25/12/24,False
9,"The meeting is set for April 03, 2020.",03/04/2020,03/04/20,False


In [18]:
true_count = date['Match'].sum()

# Counting the number of False values
false_count = len(date) - true_count

print("Number of Trues:", true_count)
print("Number of Falses:", false_count)

Number of Trues: 0
Number of Falses: 99
