In [26]:
import re
import numpy as np
import pandas as pd
from functools import reduce

# Collect every raw line of the notes file; trailing newlines are kept as-is.
doc = []
with open("data/dates.txt") as notes:
    doc.extend(notes)

# One Series entry per line of the file.
df = pd.Series(doc)
df.head()

0         03/25/93 Total time of visit (in minutes):\n
1                       6/18/85 Primary Care Doctor:\n
2    sshe plans to move as of 7/8/71 In-Home Servic...
3                7 on 9/27/75 Audit C Score Current:\n
4    2/6/96 sleep studyPain Treatment Pain Level (N...
dtype: object

In [88]:
### Case 1
def matcher(df):
    """Order the entries of ``df`` chronologically by the date each one mentions.

    Each entry is lower-cased and searched with six patterns, tried from the
    most specific (numeric month/day/year) to the least specific (a bare
    four-digit year). For every row the first pattern that matches supplies
    the year, month and day; components a pattern does not capture default
    to 1. Two-digit years are read as 19xx.

    Parameters
    ----------
    df : pandas.Series
        Series of strings, one free-text note per row.

    Returns
    -------
    pandas.Series
        Series named ``"index"`` holding the original row labels of ``df``
        sorted by (year, month, day) ascending.

    Raises
    ------
    ValueError
        If at least one row contains no recognisable date.
    """
    # Month abbreviation -> month number. Full (or misspelled) month names are
    # resolved later through their first three letters.
    months = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
    mon2int = {m: (i + 1) for (i, m) in enumerate(months)}

    # The alternation is wrapped in its own group so the optional trailing
    # letters apply to EVERY month, not only to "dec". The tail is deliberately
    # lenient ([a-z]*) so that long and misspelled month names still match.
    month_name = r"(?:" + "|".join(months) + r")[a-z]*"

    ########## RULES ##########
    # 04/20/2009; 04/20/09; 4/20/09; 4/3/09
    rule1 = r"(?P<month>\d{1,2})[/-](?P<day>\d{1,2})[/-](?P<year>\d{2,4})"
    # Mar-20-2009; Mar 20, 2009; March 20, 2009; Mar. 20, 2009; Mar 20 2009
    rule2 = r"(?P<month>" + month_name + r")[.-]?[ -](?P<day>\d{1,2})[,]?[ -](?P<year>\d{4})"
    # 20 Mar 2009; 20 March 2009; 20 Mar. 2009; 20 March, 2009
    rule3 = r"(?P<day>\d{1,2}) (?P<month>" + month_name + r")[.,]?[ -](?P<year>\d{4})"
    # Feb 2009; Sep 2009; Oct 2010 (empty `day` group keeps the columns uniform)
    rule4 = r"(?P<day>)(?P<month>" + month_name + r") (?P<year>\d{4})"
    # 6/2008; 12/2009
    rule5 = r"(?P<day>)(?P<month>\d{1,2})/(?P<year>\d{4})"
    # 2009; 2010
    rule6 = r"(?P<day>)(?P<month>)(?P<year>\d{4})"

    rules = [rule1, rule2, rule3, rule4, rule5, rule6]

    ########## MATCHING ##########
    # Lower-case once instead of once per rule.
    lowered = df.str.lower()
    extract_df_list = [lowered.str.extract(rule) for rule in rules]
    # Earlier (more specific) rules win; later rules only fill rows that are
    # still NaN. fillna aligns on column names, so group order does not matter.
    final_df = reduce(lambda x, y: x.fillna(y), extract_df_list)

    # Every rule captures a year, so a missing year means no rule matched.
    unmatched = final_df["year"].isna()
    if unmatched.any():
        raise ValueError(
            "no date found in {} row(s); first offending index: {!r}".format(
                int(unmatched.sum()), unmatched.idxmax()
            )
        )

    ########## EXTRA PROCESSING ##########
    # Map month names to numbers; numeric months and blanks pass through unchanged
    final_df["month"] = final_df["month"].apply(lambda x: mon2int.get(str(x)[:3], x))
    # Missing or empty month defaults to January
    final_df["month"] = final_df["month"].fillna("1").apply(lambda x: "1" if x == "" else x)
    # Two-digit years are taken to be in the 1900s
    final_df["year"] = final_df["year"].apply(lambda x: "19" + x if len(x) == 2 else x)
    # Missing or empty day defaults to the 1st
    final_df["day"] = final_df["day"].fillna("1").apply(lambda x: "1" if x == "" else x)

    final_df = final_df[["year", "month", "day"]].astype(int)
    # reset_index() moves the original row labels into an "index" column,
    # which after sorting is exactly the chronological ordering to return.
    index_df = final_df.sort_values(["year", "month", "day"], axis=0).reset_index()
    index_df = index_df["index"]
    return index_df

# Display the original row labels of `df`, ordered by the (year, month, day) extracted from each entry.
matcher(df)

0        9
1       84
2        2
3       53
4       28
      ... 
495    141
496    186
497    161
498    413
499    271
Name: index, Length: 500, dtype: int64