# Date Mining

## Setup

In [126]:
import os
import re

import numpy as np
import pandas as pd

# Allow up to 1000 columns when rendering DataFrames.
pd.set_option('display.max_columns', 1000)

In [127]:
# Create the working folders; exist_ok=True keeps this cell safe to re-run.
for d in ('data', 'output'):
    os.makedirs(d, exist_ok=True)

In [128]:
url = "https://setu-datamining2.github.io/live/topics/21-Assignments/03-Mining_Dates/files/public.csv"

# Download only when there is no local copy yet; later runs reuse the saved file.
if not os.path.isfile('data/public.csv'):
    print('Downloading file')
    df = pd.read_csv(url)
    df.to_csv('data/public.csv', index=False)
else:
    print('Using local copy')

Using local copy


## Dataset

In [129]:
# Load the dataset from the local CSV copy and preview it.
df = pd.read_csv('data/public.csv')
print(df.shape)
df.head(10)
# Plan for the columns added in the next section:
# - `Data`: a string-cleaned copy of `Raw`, because month names may be misspelt.
# - `Iter`: an integer; 0 means the row has not been matched, and k (1, 2, 3, ...)
#   means it was matched by regex number k.

(715, 2)


Unnamed: 0,Code,Raw
0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - syn..."
1,1039370009,".April, 5 97: made a phone call to Mom and Mom..."
2,1039574613,A pleasant 28 yo woman with no formal psychiat...
3,1039963589,"October 7, 01 [report_end]"
4,1048901075,"July, 4, 01 Primary Care Doctor:"
5,1054311047,)and 8mo in 2009
6,1054668034,")HTN, hypercholesterolemia, DM, sleep apnea,, ..."
7,1082469285,"Septeber, 10, 70 CPT Code: 90792: With medical..."
8,1125769793,"Since 10/2014: Fatigued, more forgetful, impai..."
9,1148116416,24 yo right handed woman with history of large...


## Import / Clean

Algorithm
* Clean data first to reduce number of regexes needed
* Build sequence of regexes (starting with the most restrictive)
* For each regex (numbered 1, 2, 3, ...):
    * Find rows that match
    * Extract day, month and year
    * Update df
    

In [130]:
def clean(s):
    """Return the cleaned form of one raw value.

    This is currently a pass-through placeholder: the argument is handed
    back unchanged.
    """
    cleaned = s
    return cleaned

In [131]:
# `Data` holds the output of clean() for each `Raw` value.
df['Data'] = df['Raw'].apply(clean)

# Bookkeeping and date-part columns all start at 0 (assigned left to right,
# so the column order is Iter, Day, Month, Year).
df['Iter'] = df['Day'] = df['Month'] = df['Year'] = 0
# TODO: also need a Date column

### Helper functions

In [132]:
def info():
    """Display the first 10 rows of the notebook-level DataFrame `df`.

    The intent is to summarise matched and unmatched rows; for now it only
    shows a preview of the frame.
    """
    preview = df.head(10)
    display(preview)

In [133]:
def verify(df_tmp):
    """Cast the Month, Day and Year columns of `df_tmp` to int.

    The columns are replaced on the frame that was passed in, and that same
    frame is returned.

    TODO: no further checks on the extracted values are performed yet.
    """
    date_parts = ('Month', 'Day', 'Year')
    for part in date_parts:
        as_int = df_tmp[part].astype(int)
        df_tmp[part] = as_int
    return df_tmp

### `mm/dd/yyyy`

In [134]:
# Try the pattern on a single known row (Code 1148116416) before applying it to every row.
tmp = df.loc[df.Code == 1148116416, "Data"]
display(tmp)

# Three capture groups separated by "/": 1-2 digits, 1-2 digits, then 2-4 digits.
# In this section they are read as month, day and year respectively.
regex = r"(\d{1,2})/(\d{1,2})/(\d{2,4})"

tmp.str.extract(regex)

9    24 yo right handed woman with history of large...
Name: Data, dtype: object

Unnamed: 0,0,1,2
9,11,3,1985


In [135]:
# Apply the current regex to every row that has not been matched yet.

iter = 0
columns = ['Month', 'Day', 'Year']

# Extract the capture groups from the still-unmatched rows and keep only the
# rows where the pattern was found (non-matching rows come back as NaN).
df_tmp = (
    df.loc[df['Iter'] == 0, 'Data']
    .str.extract(regex)
    .dropna()
)
df_tmp.columns = columns
print(f"Number of rows matched = {df_tmp.shape[0]}")
df_tmp

Number of rows matched = 120


Unnamed: 0,Month,Day,Year
9,11,3,1985
10,4,19,91
14,07,29,1994
16,6,10,72
18,6,18,85
...,...,...,...
689,5,24,88
695,12,26,86
696,10,05,97
698,04,08,2004


In [136]:
# Verify date
# (verify() currently only casts Month/Day/Year to int and returns the frame.)

df_tmp = verify(df_tmp)

In [137]:
# Save result

# Update only rows that are still unmatched (Iter == 0) and for which the
# extracted year is positive.
# NOTE(review): df_tmp holds just the matched rows, so this `&` relies on pandas
# aligning the two Series by index label - confirm rows absent from df_tmp end up False.
criteria = (df.Iter == 0) & (df_tmp.Year>0)

# `columns` is ordered Month/Day/Year while df stores Day/Month/Year; the displayed
# result shows the values landing under the matching labels.
df.loc[criteria, columns] = df_tmp[columns]
# Record the 1-based number of the regex that produced the match.
# NOTE: `iter` is the notebook's regex counter and shadows the builtin iter().
df.loc[criteria, 'Iter'] = iter + 1

info()

Unnamed: 0,Code,Raw,Data,Iter,Day,Month,Year
0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - syn...",".12, Noember 16- bad reaction to SpiceK2 - syn...",0,0,0,0
1,1039370009,".April, 5 97: made a phone call to Mom and Mom...",".April, 5 97: made a phone call to Mom and Mom...",0,0,0,0
2,1039574613,A pleasant 28 yo woman with no formal psychiat...,A pleasant 28 yo woman with no formal psychiat...,0,0,0,0
3,1039963589,"October 7, 01 [report_end]","October 7, 01 [report_end]",0,0,0,0
4,1048901075,"July, 4, 01 Primary Care Doctor:","July, 4, 01 Primary Care Doctor:",0,0,0,0
5,1054311047,)and 8mo in 2009,)and 8mo in 2009,0,0,0,0
6,1054668034,")HTN, hypercholesterolemia, DM, sleep apnea,, ...",")HTN, hypercholesterolemia, DM, sleep apnea,, ...",0,0,0,0
7,1082469285,"Septeber, 10, 70 CPT Code: 90792: With medical...","Septeber, 10, 70 CPT Code: 90792: With medical...",0,0,0,0
8,1125769793,"Since 10/2014: Fatigued, more forgetful, impai...","Since 10/2014: Fatigued, more forgetful, impai...",0,0,0,0
9,1148116416,24 yo right handed woman with history of large...,24 yo right handed woman with history of large...,1,3,11,1985
