# The goal is to create a metric that evaluates the dates found by our models

In [13]:
import pandas as pd
from ephesus.date import get_data_json, get_dates, get_data_targets_json, get_dates_targets

In [14]:
# Draw a fixed-size sample of the feature records and extract the dates from it.
# A fixed random_state makes the sample (and every score computed below)
# reproducible across "Restart Kernel -> Run All"; change the seed to inspect
# a different sample.
sample_size = 30
sample_seed = 42
df_X = get_data_json()
df_X_sample = get_dates(df_X.sample(sample_size, random_state=sample_seed).copy())
# Load the target records and extract their dates.
df_y = get_dates_targets(get_data_targets_json())

In [15]:
# Preview the first file names on the feature side.
df_X_sample["fichier"].head().to_list()

['f0caa21b-c11d-40a3-98ec-e2df3d2b9cc4_134e5aa4-ce20-421f-a546-ef11f5d30541_translation.json',
 'b96ad235-2f4f-4236-9aa3-9df85dfcf7cb_1bdc3625-2ecb-4256-a826-0d28e40904d3_translation.json',
 '9f980dcf-b431-4e67-876f-2b8e288b7900_81e9bb37-df22-4312-bcbe-fd2f32294cee_translation.json',
 '2d035c4b-cdfa-4982-87dc-916fe07a0824_4aa3c092-e026-4c98-a4a9-5c31c09bc0b4_translation.json',
 '9d42ce6f-8537-49a9-a121-c8ae1dc7cda8_403bfcb4-8646-4b45-a1a4-06cdb747f8fc_translation.json']

In [16]:
# Preview the first file names on the target side.
df_y["fichier"].head().to_list()

['2d035c4b-cdfa-4982-87dc-916fe07a0824_249f1d30-1a50-42de-a3a6-20c295f770c7_extraction.json',
 '2d035c4b-cdfa-4982-87dc-916fe07a0824_019a0add-5b4a-4cb9-b999-daeb5e2a54fd_extraction.json',
 '55d674cc-3389-4cf6-ab7c-1f1b9fa1b6ed_02a55241-ccf4-4f9a-87ac-23e0ac71e576_extraction.json',
 '9f980dcf-b431-4e67-876f-2b8e288b7900_1a26bbf2-1192-4139-b6fb-cb7fc1c826ce_extraction.json',
 '2d035c4b-cdfa-4982-87dc-916fe07a0824_545d42f0-5eb0-4eec-987e-49386f952502_extraction.json']

In [17]:
def clean_filename(filename):
    """Strip the trailing ``_translation.json`` / ``_extraction.json`` suffix.

    Feature files and target files share the same identifier prefix and differ
    only by this suffix, so removing it yields a key usable on both sides.

    Args:
        filename: File name as a string.

    Returns:
        The file name without its known suffix. A name that ends with neither
        suffix is returned unchanged instead of being truncated blindly.
    """
    for suffix in ("_translation.json", "_extraction.json"):
        # Match only at the end of the name and cut exactly the suffix length.
        if filename.endswith(suffix):
            return filename[: -len(suffix)]
    return filename

# Sanity check: one `_translation.json` name and one `_extraction.json` name,
# each printed with its suffix removed.
print(clean_filename('2d035c4b-cdfa-4982-87dc-916fe07a0824_1275c4e7-526e-4736-8e5b-bc95ae00f890_translation.json'))
print(clean_filename('2d035c4b-cdfa-4982-87dc-916fe07a0824_249f1d30-1a50-42de-a3a6-20c295f770c7_extraction.json'))

2d035c4b-cdfa-4982-87dc-916fe07a0824_1275c4e7-526e-4736-8e5b-bc95ae00f890
2d035c4b-cdfa-4982-87dc-916fe07a0824_249f1d30-1a50-42de-a3a6-20c295f770c7


In [18]:
# Derive the suffix-free file name on both frames; it is the join key used
# to pair each sampled feature record with its target record.
df_X_sample["filename_cleaned"] = [
    clean_filename(name) for name in df_X_sample["fichier"]
]
df_y["filename_cleaned"] = [
    clean_filename(name) for name in df_y["fichier"]
]

In [19]:
# Check the cleaned names on the feature side.
df_X_sample["filename_cleaned"].head().to_list()

['f0caa21b-c11d-40a3-98ec-e2df3d2b9cc4_134e5aa4-ce20-421f-a546-ef11f5d30541',
 'b96ad235-2f4f-4236-9aa3-9df85dfcf7cb_1bdc3625-2ecb-4256-a826-0d28e40904d3',
 '9f980dcf-b431-4e67-876f-2b8e288b7900_81e9bb37-df22-4312-bcbe-fd2f32294cee',
 '2d035c4b-cdfa-4982-87dc-916fe07a0824_4aa3c092-e026-4c98-a4a9-5c31c09bc0b4',
 '9d42ce6f-8537-49a9-a121-c8ae1dc7cda8_403bfcb4-8646-4b45-a1a4-06cdb747f8fc']

In [20]:
# Check the cleaned names on the target side.
df_y["filename_cleaned"].head().to_list()

['2d035c4b-cdfa-4982-87dc-916fe07a0824_249f1d30-1a50-42de-a3a6-20c295f770c7',
 '2d035c4b-cdfa-4982-87dc-916fe07a0824_019a0add-5b4a-4cb9-b999-daeb5e2a54fd',
 '55d674cc-3389-4cf6-ab7c-1f1b9fa1b6ed_02a55241-ccf4-4f9a-87ac-23e0ac71e576',
 '9f980dcf-b431-4e67-876f-2b8e288b7900_1a26bbf2-1192-4139-b6fb-cb7fc1c826ce',
 '2d035c4b-cdfa-4982-87dc-916fe07a0824_545d42f0-5eb0-4eec-987e-49386f952502']

In [21]:
# Columns kept after the join: the key, the extracted phrases, and every
# textual format of the CareBeginDate target (day / month / year / hour).
columns_to_keep = [
    "filename_cleaned", "words", "words_len",
    "CareBeginDate_day_format01", "CareBeginDate_day_format02", "CareBeginDate_day_format03",
    "CareBeginDate_month_format01", "CareBeginDate_month_format02", "CareBeginDate_month_format03",
    "CareBeginDate_year_format01", "CareBeginDate_year_format02",
    "CareBeginDate_hour_format01", "CareBeginDate_hour_format02", "CareBeginDate_hour_format03",
    "CareBeginDate_hour_format04", "CareBeginDate_hour_format05", "CareBeginDate_hour_format06",
]

# Left join: every sampled feature row is kept and paired with its target
# columns through the cleaned file name.
df = pd.merge(df_X_sample, df_y, on="filename_cleaned", how="left")[columns_to_keep]
df.head()

Unnamed: 0,filename_cleaned,words,words_len,CareBeginDate_day_format01,CareBeginDate_day_format02,CareBeginDate_day_format03,CareBeginDate_month_format01,CareBeginDate_month_format02,CareBeginDate_month_format03,CareBeginDate_year_format01,CareBeginDate_year_format02,CareBeginDate_hour_format01,CareBeginDate_hour_format02,CareBeginDate_hour_format03,CareBeginDate_hour_format04,CareBeginDate_hour_format05,CareBeginDate_hour_format06
0,f0caa21b-c11d-40a3-98ec-e2df3d2b9cc4_134e5aa4-...,"[le premier avril, à 8h15., dans 10 jours, 10 ...",4,jeudi,1,1,avril,4,4,21,2021,8h00,8h,8,8h00,8h,8
1,b96ad235-2f4f-4236-9aa3-9df85dfcf7cb_1bdc3625-...,[],0,mardi,22,22,mars,3,3,22,2022,7h15,7h,7,7h15,7h,7
2,9f980dcf-b431-4e67-876f-2b8e288b7900_81e9bb37-...,[],0,lundi,24,24,janvier,1,1,22,2022,11h00,11h,11,11h00,11h,11
3,2d035c4b-cdfa-4982-87dc-916fe07a0824_4aa3c092-...,"[le 10 juillet, le 10 juillet à 16h]",2,mercredi,10,10,juillet,7,7,19,2019,16h00,16h,16,4h00,4h,4
4,9d42ce6f-8537-49a9-a121-c8ae1dc7cda8_403bfcb4-...,[le 25 septembre à 8h],1,mercredi,25,25,septembre,9,9,19,2019,8h30,8h,8,8h30,8h,8


In [23]:
# Evaluation: check whether each "CareBeginDate_..." target value appears in one of the date phrases found in "words"

In [24]:
# Look at up to 20 merged rows before scoring.
df.head(20)

Unnamed: 0,filename_cleaned,words,words_len,CareBeginDate_day_format01,CareBeginDate_day_format02,CareBeginDate_day_format03,CareBeginDate_month_format01,CareBeginDate_month_format02,CareBeginDate_month_format03,CareBeginDate_year_format01,CareBeginDate_year_format02,CareBeginDate_hour_format01,CareBeginDate_hour_format02,CareBeginDate_hour_format03,CareBeginDate_hour_format04,CareBeginDate_hour_format05,CareBeginDate_hour_format06
0,f0caa21b-c11d-40a3-98ec-e2df3d2b9cc4_134e5aa4-...,"[le premier avril, à 8h15., dans 10 jours, 10 ...",4,jeudi,1,1,avril,4,4,21,2021,8h00,8h,8.0,8h00,8h,8
1,b96ad235-2f4f-4236-9aa3-9df85dfcf7cb_1bdc3625-...,[],0,mardi,22,22,mars,3,3,22,2022,7h15,7h,7.0,7h15,7h,7
2,9f980dcf-b431-4e67-876f-2b8e288b7900_81e9bb37-...,[],0,lundi,24,24,janvier,1,1,22,2022,11h00,11h,11.0,11h00,11h,11
3,2d035c4b-cdfa-4982-87dc-916fe07a0824_4aa3c092-...,"[le 10 juillet, le 10 juillet à 16h]",2,mercredi,10,10,juillet,7,7,19,2019,16h00,16h,16.0,4h00,4h,4
4,9d42ce6f-8537-49a9-a121-c8ae1dc7cda8_403bfcb4-...,[le 25 septembre à 8h],1,mercredi,25,25,septembre,9,9,19,2019,8h30,8h,8.0,8h30,8h,8
5,55d674cc-3389-4cf6-ab7c-1f1b9fa1b6ed_4ce94c12-...,[],0,lundi,11,11,octobre,10,10,21,2021,8h00,8h,8.0,8h00,8h,8
6,f0caa21b-c11d-40a3-98ec-e2df3d2b9cc4_790cfe40-...,[le 8 septembre à 6h],1,mercredi,8,8,septembre,9,9,21,2021,6h30,6h,6.0,6h30,6h,6
7,2d035c4b-cdfa-4982-87dc-916fe07a0824_3a20d2f2-...,[le 7 février],1,vendredi,7,7,février,2,2,20,2020,10h00,10h,10.0,10h00,10h,10
8,74b154c1-e265-4cb9-8e71-0c9bbc3dc880_5ea80fbe-...,[le lundi le 21 février à 11h],1,vendredi,25,25,février,2,2,22,2022,11h00,11h,11.0,11h00,11h,11
9,2d035c4b-cdfa-4982-87dc-916fe07a0824_0509f8ab-...,"[le 1709, 2020 à 6h30, à]",1,jeudi,17,17,septembre,9,9,20,2020,6h30,6h,6.0,6h30,6h,6


In [26]:
# Components of the CareBeginDate target and the textual formats available for
# each one; these names map onto the "CareBeginDate_<part>_<format>" columns.
SCORE_FORMATS = {
    "day": ["format01", "format02", "format03"],
    "month": ["format01", "format02", "format03"],
    "year": ["format01", "format02"],
    "hour": ["format01", "format02", "format03", "format04", "format05", "format06"],
}


def score_found_dates(df_dates, score_formats=SCORE_FORMATS):
    """Flag, per row, which target date components appear in the found phrases.

    Args:
        df_dates: Frame holding "filename_cleaned", "words" (an iterable of
            strings per row) and one "CareBeginDate_<part>_<format>" column per
            entry of ``score_formats``. Target values are used as the left
            operand of ``in`` against a string, so they must be strings.
        score_formats: Mapping of date component -> list of format names.

    Returns:
        A new frame with "filename_cleaned", "words", one boolean
        "score_<part>_<format>" column per format, then one boolean
        "score_<part>" column per component (True when any format matched).
    """
    scores = df_dates[["filename_cleaned", "words"]].copy()

    # Join each row's phrases once instead of once per format column.
    joined_words = [" ".join(words) for words in df_dates["words"]]

    # Per-format flags first, so the detailed columns precede the aggregates.
    for part, formats in score_formats.items():
        for fmt in formats:
            targets = df_dates[f"CareBeginDate_{part}_{fmt}"]
            # NOTE(review): this is a plain substring test, so a short target
            # such as "1" also matches inside "10" or "8h15"; consider
            # token-level matching if these false positives matter.
            scores[f"score_{part}_{fmt}"] = [
                target in text for target, text in zip(targets, joined_words)
            ]

    # A component counts as found when at least one of its formats matched.
    for part, formats in score_formats.items():
        flag_columns = [f"score_{part}_{fmt}" for fmt in formats]
        scores[f"score_{part}"] = scores[flag_columns].any(axis=1)

    return scores


df_score = score_found_dates(df)

df_score.head()

Unnamed: 0,filename_cleaned,words,score_day_format01,score_day_format02,score_day_format03,score_month_format01,score_month_format02,score_month_format03,score_year_format01,score_year_format02,score_hour_format01,score_hour_format02,score_hour_format03,score_hour_format04,score_hour_format05,score_hour_format06,score_day,score_month,score_year,score_hour
0,f0caa21b-c11d-40a3-98ec-e2df3d2b9cc4_134e5aa4-...,"[le premier avril, à 8h15., dans 10 jours, 10 ...",False,False,True,True,False,False,False,False,False,True,True,False,True,True,True,True,False,True
1,b96ad235-2f4f-4236-9aa3-9df85dfcf7cb_1bdc3625-...,[],False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,9f980dcf-b431-4e67-876f-2b8e288b7900_81e9bb37-...,[],False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,2d035c4b-cdfa-4982-87dc-916fe07a0824_4aa3c092-...,"[le 10 juillet, le 10 juillet à 16h]",False,True,True,True,False,False,False,False,False,True,True,False,False,False,True,True,False,True
4,9d42ce6f-8537-49a9-a121-c8ae1dc7cda8_403bfcb4-...,[le 25 septembre à 8h],False,True,True,True,False,False,False,False,False,True,True,False,True,True,True,True,False,True


In [27]:
# Overall score per row: the fraction of the four date components
# (day, month, year, hour) that were found, i.e. a value in {0, 0.25, ..., 1}.
df_score["score"] = (
    df_score[["score_day", "score_month", "score_year", "score_hour"]]
    .astype(int)
    .sum(axis=1)
    / 4
)

df_score.head()

Unnamed: 0,filename_cleaned,words,score_day_format01,score_day_format02,score_day_format03,score_month_format01,score_month_format02,score_month_format03,score_year_format01,score_year_format02,...,score_hour_format02,score_hour_format03,score_hour_format04,score_hour_format05,score_hour_format06,score_day,score_month,score_year,score_hour,score
0,f0caa21b-c11d-40a3-98ec-e2df3d2b9cc4_134e5aa4-...,"[le premier avril, à 8h15., dans 10 jours, 10 ...",False,False,True,True,False,False,False,False,...,True,True,False,True,True,True,True,False,True,0.75
1,b96ad235-2f4f-4236-9aa3-9df85dfcf7cb_1bdc3625-...,[],False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.0
2,9f980dcf-b431-4e67-876f-2b8e288b7900_81e9bb37-...,[],False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.0
3,2d035c4b-cdfa-4982-87dc-916fe07a0824_4aa3c092-...,"[le 10 juillet, le 10 juillet à 16h]",False,True,True,True,False,False,False,False,...,True,True,False,False,False,True,True,False,True,0.75
4,9d42ce6f-8537-49a9-a121-c8ae1dc7cda8_403bfcb4-...,[le 25 septembre à 8h],False,True,True,True,False,False,False,False,...,True,True,False,True,True,True,True,False,True,0.75


In [29]:
# Show the per-component flags and the overall score next to the found phrases (up to 20 rows).
df_score[["words", "score_day", "score_month", "score_year", "score_hour", "score"]].head(20)

Unnamed: 0,words,score_day,score_month,score_year,score_hour,score
0,"[le premier avril, à 8h15., dans 10 jours, 10 ...",True,True,False,True,0.75
1,[],False,False,False,False,0.0
2,[],False,False,False,False,0.0
3,"[le 10 juillet, le 10 juillet à 16h]",True,True,False,True,0.75
4,[le 25 septembre à 8h],True,True,False,True,0.75
5,[],False,False,False,False,0.0
6,[le 8 septembre à 6h],True,True,False,True,0.75
7,[le 7 février],True,True,False,False,0.5
8,[le lundi le 21 février à 11h],False,True,False,True,0.5
9,"[le 1709, 2020 à 6h30, à]",True,True,True,True,1.0


In [31]:
# List the cleaned file name of every scored row.
df_score["filename_cleaned"].to_list()

['f0caa21b-c11d-40a3-98ec-e2df3d2b9cc4_134e5aa4-ce20-421f-a546-ef11f5d30541',
 'b96ad235-2f4f-4236-9aa3-9df85dfcf7cb_1bdc3625-2ecb-4256-a826-0d28e40904d3',
 '9f980dcf-b431-4e67-876f-2b8e288b7900_81e9bb37-df22-4312-bcbe-fd2f32294cee',
 '2d035c4b-cdfa-4982-87dc-916fe07a0824_4aa3c092-e026-4c98-a4a9-5c31c09bc0b4',
 '9d42ce6f-8537-49a9-a121-c8ae1dc7cda8_403bfcb4-8646-4b45-a1a4-06cdb747f8fc',
 '55d674cc-3389-4cf6-ab7c-1f1b9fa1b6ed_4ce94c12-e92c-476e-a27b-66e2e10488b9',
 'f0caa21b-c11d-40a3-98ec-e2df3d2b9cc4_790cfe40-a13d-4719-8f94-1b5bc308102c',
 '2d035c4b-cdfa-4982-87dc-916fe07a0824_3a20d2f2-def7-4abd-a0ec-5e5a8ffb773b',
 '74b154c1-e265-4cb9-8e71-0c9bbc3dc880_5ea80fbe-1cc2-4b8a-af5e-cc01107c4320',
 '2d035c4b-cdfa-4982-87dc-916fe07a0824_0509f8ab-e39b-4e8a-ae5c-ab2acd708867',
 '2206f47b-a858-4f23-8696-d10e1050f2d7_3af26627-bc72-4c41-9254-ae7e257d335c',
 '2d035c4b-cdfa-4982-87dc-916fe07a0824_0b69efec-9b2c-4514-ba67-4068f637f88e',
 '9d42ce6f-8537-49a9-a121-c8ae1dc7cda8_6ff09926-7b3e-4c59-bbc0-5

In [33]:
# Inspect the scoring table, including the per-format flags (up to 20 rows).
df_score.head(20)

Unnamed: 0,filename_cleaned,words,score_day_format01,score_day_format02,score_day_format03,score_month_format01,score_month_format02,score_month_format03,score_year_format01,score_year_format02,...,score_hour_format02,score_hour_format03,score_hour_format04,score_hour_format05,score_hour_format06,score_day,score_month,score_year,score_hour,score
0,f0caa21b-c11d-40a3-98ec-e2df3d2b9cc4_134e5aa4-...,"[le premier avril, à 8h15., dans 10 jours, 10 ...",False,False,True,True,False,False,False,False,...,True,True,False,True,True,True,True,False,True,0.75
1,b96ad235-2f4f-4236-9aa3-9df85dfcf7cb_1bdc3625-...,[],False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.0
2,9f980dcf-b431-4e67-876f-2b8e288b7900_81e9bb37-...,[],False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.0
3,2d035c4b-cdfa-4982-87dc-916fe07a0824_4aa3c092-...,"[le 10 juillet, le 10 juillet à 16h]",False,True,True,True,False,False,False,False,...,True,True,False,False,False,True,True,False,True,0.75
4,9d42ce6f-8537-49a9-a121-c8ae1dc7cda8_403bfcb4-...,[le 25 septembre à 8h],False,True,True,True,False,False,False,False,...,True,True,False,True,True,True,True,False,True,0.75
5,55d674cc-3389-4cf6-ab7c-1f1b9fa1b6ed_4ce94c12-...,[],False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0.0
6,f0caa21b-c11d-40a3-98ec-e2df3d2b9cc4_790cfe40-...,[le 8 septembre à 6h],False,False,True,True,False,False,False,False,...,True,True,False,True,True,True,True,False,True,0.75
7,2d035c4b-cdfa-4982-87dc-916fe07a0824_3a20d2f2-...,[le 7 février],False,False,True,True,False,False,False,False,...,False,False,False,False,False,True,True,False,False,0.5
8,74b154c1-e265-4cb9-8e71-0c9bbc3dc880_5ea80fbe-...,[le lundi le 21 février à 11h],False,False,False,True,False,True,False,False,...,True,True,False,True,True,False,True,False,True,0.5
9,2d035c4b-cdfa-4982-87dc-916fe07a0824_0509f8ab-...,"[le 1709, 2020 à 6h30, à]",False,True,True,False,True,True,True,True,...,True,True,True,True,True,True,True,True,True,1.0
