
# PART 2 - Practical 3

### Build a date parser using basic text processing and rules. (No ML models)

    Given a piece of text, extract the day, month and year info and present it in DD/MM/ YYYY format.
    Example: "l went to London on 21st June, 2024" 21/06/2024



In [1]:
import pandas as pd
import re
from datetime import datetime

def parse_date(text):
    
    month_dict = {
        'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04', 'may': '05', 'jun': '06',
        'jul': '07', 'aug': '08', 'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12'
    }

    patterns = [
        r'\b(\d{1,2})(?:st|nd|rd|th)?\s*(?:of\s+)?([a-zA-Z]{3,9})(?:\s*,?\s*|\s+)(\d{4}|\d{2})\b',
        r'\b([a-zA-Z]{3,9})\s+(\d{1,2})(?:st|nd|rd|th)?(?:\s*,?\s*|\s+)(\d{4}|\d{2})\b',  
        r'\b(\d{4})[-./](\d{1,2})[-./](\d{1,2})\b',  
        r'\b(\d{1,2})[-./](\d{1,2})[-./](\d{4}|\d{2})\b', 
        r'\b(\d{1,2})\.(\d{1,2})\.(\d{4}|\d{2})\b' 
    ]

    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            groups = match.groups()
            try:
                if len(groups[0]) == 4: 
                    year, month, day = groups
                elif groups[0].isalpha():  # Month first
                    month, day, year = groups
                    month = month_dict[month.lower()[:3]]
                else:  # Day first
                    day, month, year = groups
                    if not month.isdigit():
                        month = month_dict[month.lower()[:3]]

                if len(year) == 2:
                    current_year = datetime.now().year
                    century = str(current_year)[:2]
                    year = century + year

                day = day.zfill(2)
                month = month.zfill(2)

                datetime(int(year), int(month), int(day))

                return f"{day}/{month}/{year}"
            except (ValueError, KeyError):
                continue
    return None

In [2]:
df = pd.read_csv('date_parser_testcases.csv')
df.head()

Unnamed: 0,Input,Expected Output
0,"The event will take place on March 5, 2023.",05/03/2023
1,Her birthday is on 07/08/1990.,07/08/1990
2,The deadline is 2022-12-31.,31/12/2022
3,We met on 1st of January 2000.,01/01/2000
4,"The concert is scheduled for 15th September, 2...",15/09/2021


In [3]:
df['My_Output'] = df['Input'].apply(parse_date)

print(df)

                                                Input Expected Output  \
0         The event will take place on March 5, 2023.      05/03/2023   
1                      Her birthday is on 07/08/1990.      07/08/1990   
2                         The deadline is 2022-12-31.      31/12/2022   
3                      We met on 1st of January 2000.      01/01/2000   
4   The concert is scheduled for 15th September, 2...      15/09/2021   
..                                                ...             ...   
95  We celebrate Independence Day on 2023-07-04, a...      04/07/2023   
96  The final date for submission is 30th November...      30/11/2022   
97  The annual conference is on 15th October 2023,...      15/10/2023   
98  His birthdate, noted as 1990-05-20, is in the ...      20/05/1990   
99  The festival will be celebrated on 12th August...      12/08/2024   

     My_Output  
0   05/03/2023  
1   07/08/1990  
2   31/12/2022  
3   01/01/2000  
4   15/09/2021  
..         ...  
95  

In [4]:
correct_count = (df['My_Output'] == df['Expected Output']).sum()
total_count = len(df)
accuracy = (correct_count / total_count) * 100

accuracy

87.0