In [1]:
from datetime              import datetime, timedelta
from transformers          import AutoTokenizer
from transformers          import AutoModelForTokenClassification
from transformers          import pipeline

import math
import torch
import pandas as pd

from ipynb.fs.full.Values                              import *
from ipynb.fs.full.Logger                              import *
from ipynb.fs.full.ReturnValue                         import *
from ipynb.fs.full.Utils                               import *
from ipynb.fs.full.Data                                import *
from ipynb.fs.full.DataManipulation                    import *
from ipynb.fs.full.TaskList                            import *
from ipynb.fs.full.Task                                import *


In [2]:
# ---------------------------------------------------------------------------
# Run configuration. Everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------

# Identity of this run; handed to tasklist.setExperimentName / setRunName.
experimentName = "DiseaseDetection"
taskName = "DataPreparation"

# Input and output locations. NOTE: despite its name, dataDirectory holds the
# path of a single CSV file; it is what gets passed to tasklist.setInputData.
dataDirectory = "./data/data.csv"
outputData = "./data/preparedData.csv"

# Execution behaviour.
processes = 1             # forwarded to setStandardFiles() on every task
skipIfAlreadyDone = True  # forwarded to setSkipIfAlreadyDone() on every task

# Logging. The format strings use the stdlib logging / strftime placeholder
# syntax -- presumably Logger.createLogger forwards them to `logging`; confirm.
loggerName = "logger"
logLevel = "INFO"
loggerFormat = "[%(levelname)-7s] [%(asctime)s, %(process)6s]: %(message)s"
loggerDateFormat = "%Y/%m/%d %H:%M:%S"

# Progress-report period given to log.setPrintPercentPeriod. ONE and TEN come
# from the star imports at the top (presumably numeric constants 1 and 10,
# i.e. a period of 0.1 -- verify in Values).
percentPeriod = ONE / TEN


# Ordered list of [task label, separator] pairs. One split task is registered
# per entry, in this order; the numeric prefix of each label mirrors the
# entry's position. Several separators look alike (four dash/hyphen variants,
# two single-quote variants, two dot variants) but are separate literals, so
# each gets its own task.
splitAtParams = [
    # Separators 001-007
    ["001splitAtMinus",        "-"],
    ["002splitAtSlash",        "/"],
    ["003splitAtEqualSign",    "="],
    ["004splitAtPeriod",       "."],
    ["005splitAtDoublePoint",  ":"],
    ["006splitAtSingleQuote",  "'"],
    ["007splitAtPlus",         "+"],
    # Separators 008-018 (includes the look-alike dash, dot and quote variants)
    ["008splitAtPoint",        "·"],
    ["009splitAtMinus2",       "–"],
    ["010splitAtStar",         "*"],
    ["011splitAtMinus3",       "—"],
    ["012splitAtComma",        ","],
    ["013splitAtTilde",        "~"],
    ["014splitAtBigPoint",     "•"],
    ["015splitAtUnderscore",   "_"],
    ["016splitAtMinus4",       "‑"],
    ["017splitAtTriplePoint",  "…"],
    ["018splitAtSingleQuote2", "׳"],
]

# Column names handed to tasklist.setColumns (presumably the headers of the
# input CSV -- confirm against the data file).

# Identifier columns.
docIDCol = "Doc_ID"
sentIDCol = "Sent_ID"
wordIDCol = "id"

# Content columns.
wordCol = "Word"
labelCol = "tag"

In [3]:
# Build the logger from the configured name, formats and level, register it
# through Logger.setSingletonLogger, then set its percent-report period.
# The results of these calls are not inspected.
log = Logger()
log.createLogger(loggerName, loggerFormat, loggerDateFormat, logLevel)
Logger.setSingletonLogger(log)
log.setPrintPercentPeriod(percentPeriod)

# Assigning the final result keeps the notebook from echoing it as cell
# output and leaves `ret` bound to the outcome of this last call.
ret = log.printInfo("Logger initialized.")

[INFO   ] [2024/06/05 10:11:13, 134425]: Logger initialized.


In [4]:
tasklist = TaskList()

# Each row pairs a TaskList setter with its positional arguments. The rows are
# applied strictly in order, and every result is .eval()'d before the next
# setter runs.
tasklistSetup = (
    (tasklist.setInputData, (dataDirectory,)),
    (tasklist.setOutputData, (outputData,)),
    (tasklist.setExperimentName, (experimentName,)),
    (tasklist.setRunName, (taskName,)),
    (tasklist.setColumns, (wordCol, labelCol, wordIDCol, docIDCol, sentIDCol)),
)
for applySetting, settingArgs in tasklistSetup:
    applySetting(*settingArgs).eval()

In [5]:
# Register one split task per [label, separator] entry, in list order.
for splitChar in splitAtParams:
    splitTaskLabel = splitChar[ZERO]
    splitSeparator = splitChar[ONE]
    tasklist.newTask(splitTaskLabel, splitAtIndex, splitSeparator).eval()

    # Fetch the task at the last position of the task list (presumably the
    # one just created -- confirm that newTask appends) and configure it.
    newestTaskIndex = len(tasklist.getTaskList()) - ONE
    t = tasklist.getTask(newestTaskIndex)

    # NOTE(review): unlike the neighbouring calls, this result is not
    # .eval()'d -- confirm whether that is intentional.
    t.setSkipIfAlreadyDone(skipIfAlreadyDone)
    t.setStandardFiles(processes).eval()

In [6]:
# Run the task list; the captured log below shows the tasks starting in
# registration order. The result of start() is .eval()'d like the other
# TaskList calls in this notebook.
startOutcome = tasklist.start()
startOutcome.eval()

[INFO   ] [2024/06/05 10:11:13, 134425]: Starting Task "001splitAtMinus".
[INFO   ] [2024/06/05 10:11:13, 134425]: Splitting the data.
[INFO   ] [2024/06/05 10:11:20, 134425]: Starting Task "002splitAtSlash".
[INFO   ] [2024/06/05 10:11:20, 134425]: Copy data for next task.
[INFO   ] [2024/06/05 10:11:20, 134425]: Starting Task "003splitAtEqualSign".
[INFO   ] [2024/06/05 10:11:20, 134425]: Copy data for next task.
[INFO   ] [2024/06/05 10:11:20, 134425]: Starting Task "004splitAtPeriod".
[INFO   ] [2024/06/05 10:11:20, 134425]: Copy data for next task.
[INFO   ] [2024/06/05 10:11:20, 134425]: Starting Task "005splitAtDoublePoint".
[INFO   ] [2024/06/05 10:11:20, 134425]: Copy data for next task.
[INFO   ] [2024/06/05 10:11:20, 134425]: Starting Task "006splitAtSingleQuote".
[INFO   ] [2024/06/05 10:11:20, 134425]: Copy data for next task.
[INFO   ] [2024/06/05 10:11:20, 134425]: Starting Task "007splitAtPlus".
[INFO   ] [2024/06/05 10:11:20, 134425]: Copy data for next task.
[INFO   ]