In [4]:
import sys
sys.path.append('./src')
from evalner import *

In [2]:
# --- Parallelism and progress reporting -------------------------------------
# Handed to setStandardFiles() for every task registered further down.
processes = 128
# Handed to Logger.setPrintPercentPeriod(). ONE and TWO are not defined in this
# notebook; presumably they are provided by the `evalner` star import.
percentPeriod = ONE / TWO

# --- Logging ----------------------------------------------------------------
loggerName = "logger"
logLevel = "INFO"
loggerFormat = "[%(levelname)-7s] [%(asctime)s, %(process)6s]: %(message)s"
loggerDateFormat = "%Y/%m/%d %H:%M:%S"

# --- Data files -------------------------------------------------------------
# NOTE: despite its name, inputDataDirectory holds the path of a CSV file.
inputDataDirectory = "./data/data.csv"
# Output of the first task list; input of the second one and of the Data loader.
modelProcessingData = "./data/modelData.csv"
# Output of the second task list.
comparableData = "./data/preparedData.csv"

# --- Experiment bookkeeping -------------------------------------------------
experimentName = "DiseaseDetection"
# Run names of the first and second task list, respectively.
taskName1 = "PrepareData"
taskName2 = "prepareDataForComparison"
# Forwarded to setSkipIfAlreadyDone() on every registered task.
skipIfAlreadyDone = True

# Steps of the first pipeline (run name taskName1), which turns the raw input
# file into modelProcessingData.
# Each entry is a three-element list holding a callable, a task-name string and
# one argument value; the registration loop reads the positions through the
# ZERO / ONE / TWO index constants. The task names appear verbatim in the run
# log ("Starting Task ...").
tasks1 = [
    [deleteNaN, "001deleteNaN", None],
    [deleteWords, "002deleteEmptyWords", ["", "``"]],
    # NOTE(review): "Delte" looks like a typo for "Delete". It is left as is
    # because the name is a runtime identifier (presumably also used for the
    # task's working folder, as the logged paths of the second pipeline
    # suggest) -- confirm before renaming.
    [deleteDocuments, "003DelteDoc132", [132]],
    [replaceBIOFormat, "004replacingLabelFormat", {
        "B-indications": "B-DISEASE", 
        "I-indications": "I-DISEASE", 
        # NOTE(review): the label "O" (capital letter) is mapped to "0" (digit
        # zero) -- confirm this is what the downstream comparison expects.
        "O": "0"
    }]
]

# Steps of the second pipeline (run name taskName2): eighteen splitAtIndex
# tasks, one per separator character, followed by a final deleteNaN task.
# Entries keep the same [callable, task name, argument] layout as tasks1.
# Several separators look alike (dash-, dot- and quote-like characters) but are
# separate literals, which is why they get their own numbered task.
tasks2 = [
    [splitAtIndex, splitTaskName, separator]
    for splitTaskName, separator in (
        ("001splitAtMinus", "-"),
        ("002splitAtSlash", "/"),
        ("003splitAtEqualSign", "="),
        ("004splitAtPeriod", "."),
        ("005splitAtDoublePoint", ":"),
        ("006splitAtSingleQuote", "'"),
        ("007splitAtPlus", "+"),
        ("008splitAtPoint", "·"),
        ("009splitAtMinus2", "–"),
        ("010splitAtStar", "*"),
        ("011splitAtMinus3", "—"),
        ("012splitAtComma", ","),
        ("013splitAtTilde", "~"),
        ("014splitAtBigPoint", "•"),
        ("015splitAtUnderscore", "_"),
        ("016splitAtMinus4", "‑"),
        ("017splitAtTriplePoint", "…"),
        ("018splitAtSingleQuote2", "׳"),
    )
] + [
    # Last step takes no argument.
    [deleteNaN, "019deleteNaN", None],
]

# Names of the CSV columns used by the task lists and the Data loader.

# Identifier columns (document, sentence, word).
docIDCol = "Doc_ID"
sentIDCol = "Sent_ID"
wordIDCol = "id"

# Content columns (token text and its label).
wordCol = "Word"
labelCol = "tag"

In [3]:
# Create the logger from the configuration values above and register it via
# Logger.setSingletonLogger (presumably so other evalner components share it).
# Every call result is stored in `ret`, which is never read in this cell.
log = Logger()
ret = log.createLogger(loggerName, loggerFormat, loggerDateFormat, logLevel)
ret = Logger.setSingletonLogger(log)
# percentPeriod is ONE / TWO; the progress lines logged later in this notebook
# appear at 50.0% and 100.0%.
ret = log.setPrintPercentPeriod(percentPeriod)
# Emits the first log line, which shows the configured format in action.
ret = log.printInfo("Logger initialized.")

[INFO   ] [2024/06/07 07:46:51, 3303711]: Logger initialized.


In [4]:
# Configure the first pipeline: raw input file -> modelProcessingData, filed
# under experimentName / taskName1.
# Every setter result is passed through .eval() -- presumably this surfaces any
# error reported by the setter; confirm against evalner.
tasklist = TaskList()
tasklist.setInputData(inputDataDirectory).eval()
tasklist.setOutputData(modelProcessingData).eval()
tasklist.setExperimentName(experimentName).eval()
tasklist.setRunName(taskName1).eval()
# Argument order: word, label, word ID, document ID, sentence ID column.
tasklist.setColumns(wordCol, labelCol, wordIDCol, docIDCol, sentIDCol).eval()

In [5]:
# Register every step of the first pipeline on the task list and configure it.
for task in tasks1:
    # Name the positional fields of the entry. Assuming ZERO/ONE/TWO are
    # 0/1/2, these are the callable, the task name and the argument (the run
    # log shows the name strings as task names, which matches).
    taskFunction = task[ZERO]
    taskLabel = task[ONE]
    taskArgument = task[TWO]
    tasklist.newTask(taskLabel, taskFunction, taskArgument).eval()

    # Fetch the task at the last index of the task list -- presumably the one
    # that was just added.
    lastTaskIndex = len(tasklist.getTaskList()) - ONE
    t = tasklist.getTask(lastTaskIndex)
    # NOTE(review): unlike the neighbouring calls, this result is not passed
    # through .eval() -- confirm whether that is intentional.
    t.setSkipIfAlreadyDone(skipIfAlreadyDone)
    t.setStandardFiles(processes).eval()

In [6]:
# Run the first pipeline. In the recorded run the log shows the four tasks
# starting in order and, at the end, 128 output files being merged into
# "./data/modelData.csv".
tasklist.start().eval()

[INFO   ] [2024/06/07 07:46:51, 3303711]: Starting Task "001deleteNaN".
[INFO   ] [2024/06/07 07:46:51, 3303711]: Splitting the data.
[INFO   ] [2024/06/07 07:46:58, 3303711]: Starting Task "002deleteEmptyWords".
[INFO   ] [2024/06/07 07:46:58, 3303711]: Copy data for next task.
[INFO   ] [2024/06/07 07:46:58, 3303711]: Starting Task "003DelteDoc132".
[INFO   ] [2024/06/07 07:46:58, 3303711]: Copy data for next task.
[INFO   ] [2024/06/07 07:46:58, 3303711]: Starting Task "004replacingLabelFormat".
[INFO   ] [2024/06/07 07:46:58, 3303711]: Copy data for next task.
[INFO   ] [2024/06/07 07:46:58, 3303711]: Merging 128 output files into "./data/modelData.csv".


In [7]:
# Load the output of the first pipeline and tell the Data object which CSV
# columns hold the document / sentence / word identifiers, the token text and
# the label.
data = Data()
data.loadCSVData(modelProcessingData).eval()
data.setDocumentIDColumn(docIDCol).eval()
data.setSentenceIDColumn(sentIDCol).eval()
data.setWordIDColumn(wordIDCol).eval()
data.setWordColumn(wordCol).eval()
data.setLabelColumn(labelCol).eval()
# Slow step: in the recorded run the progress log counts 191278 items and the
# second half alone (50% -> 100%) took just under five minutes.
# NOTE(review): the meaning of the two character lists is not visible from
# here -- presumably the first holds characters that attach to the preceding
# token and the second characters that attach to the following token when
# words are joined into sentences; confirm against createSentences.
sentences = createSentences(data, 
    [".", "?", "!", ",", ";", "-", ":", "'", "%", ")", "}", "]"], 
    ["(", "[", "{", "-", "'"]
)

[INFO   ] [2024/06/07 07:51:55, 3303711]: ( 95639 | 191278)   50.0% completed. [Residual Time: ~00 days, 00:04:44]
[INFO   ] [2024/06/07 07:56:41, 3303711]: (191278 | 191278)  100.0% completed. [Residual Time: ~00 days, 00:00:00]


In [8]:
# Configure the second pipeline: modelProcessingData -> comparableData, filed
# under experimentName / taskName2.
# NOTE: `tasklist` is rebound here, so the TaskList object of the first
# pipeline is no longer reachable under this name.
tasklist = TaskList()
tasklist.setInputData(modelProcessingData).eval()
tasklist.setOutputData(comparableData).eval()
tasklist.setExperimentName(experimentName).eval()
tasklist.setRunName(taskName2).eval()
# Argument order: word, label, word ID, document ID, sentence ID column.
tasklist.setColumns(wordCol, labelCol, wordIDCol, docIDCol, sentIDCol).eval()

In [9]:
# Register every step of the second pipeline on the task list and configure it.
for task in tasks2:
    # Name the positional fields of the entry. Assuming ZERO/ONE/TWO are
    # 0/1/2, these are the callable, the task name and the argument (the run
    # log shows the name strings as task names, which matches).
    taskFunction = task[ZERO]
    taskLabel = task[ONE]
    taskArgument = task[TWO]
    tasklist.newTask(taskLabel, taskFunction, taskArgument).eval()

    # Fetch the task at the last index of the task list -- presumably the one
    # that was just added.
    lastTaskIndex = len(tasklist.getTaskList()) - ONE
    t = tasklist.getTask(lastTaskIndex)
    # NOTE(review): unlike the neighbouring calls, this result is not passed
    # through .eval() -- confirm whether that is intentional.
    t.setSkipIfAlreadyDone(skipIfAlreadyDone)
    t.setStandardFiles(processes).eval()

In [11]:
# Run the second pipeline. In the recorded run the log shows the input being
# split and then loaded from per-chunk files
# ("./experiments/<experiment>/<run>/<task>/in/input<N>.csv") by several
# different process IDs.
tasklist.start().eval()

[INFO   ] [2024/06/07 07:56:46, 3303711]: Starting Task "001splitAtMinus".
[INFO   ] [2024/06/07 07:56:46, 3303711]: Splitting the data.
[INFO   ] [2024/06/07 07:56:56, 3304613]: Loading data from "./experiments/DiseaseDetection/prepareDataForComparison/001splitAtMinus/in/input0.csv".
[INFO   ] [2024/06/07 07:56:56, 3304616]: Loading data from "./experiments/DiseaseDetection/prepareDataForComparison/001splitAtMinus/in/input1.csv".
[INFO   ] [2024/06/07 07:56:56, 3304619]: Loading data from "./experiments/DiseaseDetection/prepareDataForComparison/001splitAtMinus/in/input2.csv".
[INFO   ] [2024/06/07 07:56:56, 3304613]: Data loaded: 35496 rows in 9 columns.
[INFO   ] [2024/06/07 07:56:56, 3304624]: Loading data from "./experiments/DiseaseDetection/prepareDataForComparison/001splitAtMinus/in/input3.csv".
[INFO   ] [2024/06/07 07:56:56, 3304616]: Data loaded: 35497 rows in 9 columns.
[INFO   ] [2024/06/07 07:56:56, 3304627]: Loading data from "./experiments/DiseaseDetection/prepareDataForC