In [1]:
from evalner import *

In [2]:
processes = 128
percentPeriod = ONE / TWO
loggerFormat = "[%(levelname)-7s] [%(asctime)s, %(process)6s]: %(message)s"
loggerDateFormat = "%Y/%m/%d %H:%M:%S"
loggerName = "logger"
logLevel = "INFO"
inputDataDirectory = "./data/data100.csv"
modelProcessingData = "./data/modelData.csv"
comparableData = "./data/preparedData.csv"
experimentName = "DiseaseDetection100"
taskName1 = "prepareData"
taskName2 = "prepareDataForComparison"
skipIfAlreadyDone = True
autoTokenizerName = "alvaroalon2/biobert_diseases_ner"
modelName = "alvaroalon2/biobert_diseases_ner"
modelTaskName = "ner"
modelAggregationStrategy = "none"
modelResultDirectory = "./data/modelResult.csv"
noPreSpaceCharacters = [".", "?", "!", ",", ";", "-", ":", "'", "%", ")", "}", "]"]
noPostSpaceCharacters = ["(", "[", "{", "-", "'"]


tasks1 = [
    [deleteNaN, "001deleteNaN", None],
    [removeNonASCIIWords, "002deleteNonASCIIWords", ""],
    [deleteWords, "003deleteEmptyWords", ["", "``"]],
    [replaceBIOFormat, "004replacingLabelFormat", {
        "B-indications": "B-DISEASE", 
        "I-indications": "I-DISEASE", 
        "O": "0"
    }],
    [deleteNaN, "005deleteNaN2", None],
]

tasks2 = [
    [splitAtIndex, "001splitAtMinus", "-"], 
    [splitAtIndex, "002splitAtSlash", "/"], 
    [splitAtIndex, "003splitAtEqualSign", "="], 
    [splitAtIndex, "004splitAtPeriod", "."], 
    [splitAtIndex, "005splitAtDoublePoint", ":"], 
    [splitAtIndex, "006splitAtSingleQuote", "'"], 
    [splitAtIndex, "007splitAtPlus", "+"], 
    [splitAtIndex, "008splitAtPoint", "·"], 
    [splitAtIndex, "009splitAtMinus2", "–"], 
    [splitAtIndex, "010splitAtStar", "*"], 
    [splitAtIndex, "011splitAtMinus3", "—"], 
    [splitAtIndex, "012splitAtComma", ","],
    [splitAtIndex, "013splitAtTilde", "~"],
    [splitAtIndex, "014splitAtBigPoint", "•"],
    [splitAtIndex, "015splitAtUnderscore", "_"],
    [splitAtIndex, "016splitAtMinus4", "‑"],
    [splitAtIndex, "017splitAtTriplePoint", "…"],
    [splitAtIndex, "018splitAtSingleQuote2", "׳"],
    [splitAtIndex, "019splitAtPipe", "|"],
    [splitAtIndex, "020splitAtSingleQuote3", "`"],
    [deleteCharacters, "021deleteCharacters", ""],
    [splitAtIndex, "022splitAtSingleBackslash", "\\"],
    [splitAtIndex, "023splitAtSingleBackslash", "^"],
    [deleteWords, "024deleteNull", ["null", "nan", "NA"]],
    [deleteNaN, "025deleteNaN3", None],
]

wordCol = "Word"
labelCol = "tag"
wordIDCol = "id"
docIDCol = "Doc_ID"
sentIDCol = "Sent_ID"

modelWordCol = "word"
modelLabelCol = "entity"

In [3]:
log = Logger()
ret = log.createLogger(loggerName, loggerFormat, loggerDateFormat, logLevel)
ret = Logger.setSingletonLogger(log)
ret = log.setPrintPercentPeriod(percentPeriod)
ret = log.printInfo("Logger initialized.")

[INFO   ] [2024/07/12 12:39:08, 3626728]: Logger initialized.


In [4]:
tasklist = TaskList()
tasklist.setInputData(inputDataDirectory).eval()
tasklist.setOutputData(modelProcessingData).eval()
tasklist.setExperimentName(experimentName).eval()
tasklist.setRunName(taskName1).eval()
tasklist.setColumns(wordCol, labelCol, wordIDCol, docIDCol, sentIDCol).eval()

In [5]:
for task in tasks1:
    tasklist.newTask(task[ONE], task[ZERO], task[TWO]).eval()
    t = tasklist.getTask(len(tasklist.getTaskList()) - ONE)
    t.setSkipIfAlreadyDone(skipIfAlreadyDone)
    t.setStandardFiles(processes).eval()

In [6]:
tasklist.start().eval()

[INFO   ] [2024/07/12 12:39:08, 3626728]: Starting Task "001deleteNaN".
[INFO   ] [2024/07/12 12:39:08, 3626728]: Splitting the data.
[INFO   ] [2024/07/12 12:39:15, 3626728]: Starting Task "002deleteNonASCIIWords".
[INFO   ] [2024/07/12 12:39:15, 3626728]: Copy data for next task.
[INFO   ] [2024/07/12 12:39:16, 3626728]: Starting Task "003deleteEmptyWords".
[INFO   ] [2024/07/12 12:39:16, 3626728]: Copy data for next task.
[INFO   ] [2024/07/12 12:39:16, 3626728]: Starting Task "004replacingLabelFormat".
[INFO   ] [2024/07/12 12:39:16, 3626728]: Copy data for next task.
[INFO   ] [2024/07/12 12:39:16, 3626728]: Starting Task "005deleteNaN2".
[INFO   ] [2024/07/12 12:39:16, 3626728]: Copy data for next task.
[INFO   ] [2024/07/12 12:39:16, 3626728]: Merging 128 output files into "./data/modelData.csv".


In [7]:
data = Data()
data.loadCSVData(modelProcessingData).eval()
data.setDocumentIDColumn(docIDCol).eval()
data.setSentenceIDColumn(sentIDCol).eval()
data.setWordIDColumn(wordIDCol).eval()
data.setWordColumn(wordCol).eval()
data.setLabelColumn(labelCol).eval()
sentences = createSentences(data, noPreSpaceCharacters, noPostSpaceCharacters)

In [None]:
tasklist = TaskList()
tasklist.setInputData(modelProcessingData).eval()
tasklist.setOutputData(comparableData).eval()
tasklist.setExperimentName(experimentName).eval()
tasklist.setRunName(taskName2).eval()
tasklist.setColumns(wordCol, labelCol, wordIDCol, docIDCol, sentIDCol).eval()

In [None]:
for task in tasks2:
    tasklist.newTask(task[ONE], task[ZERO], task[TWO]).eval()
    t = tasklist.getTask(len(tasklist.getTaskList()) - ONE)
    t.setSkipIfAlreadyDone(skipIfAlreadyDone)
    t.setStandardFiles(processes).eval()

In [None]:
tasklist.start().eval()

[INFO   ] [2024/07/12 12:05:02, 3543722]: Starting Task "001splitAtMinus".
[INFO   ] [2024/07/12 12:05:02, 3543722]: Splitting the data.
[INFO   ] [2024/07/12 12:05:13, 3558024]: Loading data from "./experiments/DiseaseDetectionGPT/prepareDataForComparison/001splitAtMinus/in/input0.csv".
[INFO   ] [2024/07/12 12:05:13, 3558027]: Loading data from "./experiments/DiseaseDetectionGPT/prepareDataForComparison/001splitAtMinus/in/input1.csv".
[INFO   ] [2024/07/12 12:05:13, 3558030]: Loading data from "./experiments/DiseaseDetectionGPT/prepareDataForComparison/001splitAtMinus/in/input2.csv".
[INFO   ] [2024/07/12 12:05:13, 3558033]: Loading data from "./experiments/DiseaseDetectionGPT/prepareDataForComparison/001splitAtMinus/in/input3.csv".
[INFO   ] [2024/07/12 12:05:13, 3558024]: Data loaded: 35464 rows in 10 columns.
[INFO   ] [2024/07/12 12:05:13, 3558036]: Loading data from "./experiments/DiseaseDetectionGPT/prepareDataForComparison/001splitAtMinus/in/input4.csv".
[INFO   ] [2024/07/12 

In [None]:
model = pipeline(
    task = modelTaskName, 
    model = AutoModelForTokenClassification.from_pretrained(modelName), 
    tokenizer = AutoTokenizer.from_pretrained(autoTokenizerName), 
    aggregation_strategy = modelAggregationStrategy
)

applyModel(model, sentences, modelResultDirectory)

In [None]:
modelData = Data()
modelData.loadCSVData(modelResultDirectory).eval()

modelData.setWordColumn(modelWordCol).eval()
modelData.setLabelColumn(modelLabelCol).eval()

modelResults = modelData.getData()

In [None]:
wordCount = len(modelResults)

for wordIndex in range(ONE, len(modelData.getData())):
    if str(modelData.getData()[modelData.getWordColumn()][wordIndex]).startswith(TOKENSTARTCHARS):
        wordCount -= ONE

print(wordCount)

4919815


In [None]:

Logger.getSingletonLogger().startPrintProgress(len(modelData.getData()))

resultWords = [EMPTYSTRING] * wordCount
resultLable = [EMPTYSTRING] * wordCount

currentWord = -ONE
for sentenceIndex in range(ZERO, len(modelData.getData())):
    if str(modelData.getData()[modelData.getWordColumn()][sentenceIndex]).startswith(TOKENSTARTCHARS):
        resultWords[currentWord] = resultWords[currentWord] + str(modelData.getData()[modelData.getWordColumn()][sentenceIndex][TWO:])
    else:
        currentWord = currentWord + ONE
        resultWords[currentWord] = str(modelData.getData()[modelData.getWordColumn()][sentenceIndex])
        resultLable[currentWord] = str(modelData.getData()[modelData.getLabelColumn()][sentenceIndex])
            
    Logger.getSingletonLogger().printProgress()

[INFO   ] [2024/07/12 07:35:01, 3494495]: (3228473 | 6456946)   50.0% completed. [Residual Time: ~00 days, 00:00:43]
[INFO   ] [2024/07/12 07:35:45, 3494495]: (6456946 | 6456946)  100.0% completed. [Residual Time: ~00 days, 00:00:00]


In [None]:
data = Data()
data.loadCSVData(comparableData).eval()
data.setWordColumn(wordCol).eval()

  self.__data = pd.read_csv(directory, keep_default_na = False)


In [None]:
resultData = Data()
resultData.addColumn(resultWords, wordCol).eval()
resultData.addColumn(resultLable, labelCol).eval()
resultData.setLabelColumn(labelCol).eval()
resultData.setWordColumn(wordCol).eval()
noprint = resultData.getData().reset_index(drop = True, inplace = True)

In [None]:
removeWords(resultData, ["", "null", "nan", "NA"]).eval()
removeWords(data, ["", "null", "nan", "NA"]).eval()

In [None]:
data.getData().drop(list(range(842166, 842209)), inplace = True)
resultData.getData().drop(list(range(842166, 842179)), inplace = True)
data.getData().reset_index(drop = True, inplace = True)
resultData.getData().reset_index(drop = True, inplace = True)

In [None]:
data.getData().drop(list(range(2466352, 2472866)), inplace = True)
resultData.getData().drop(list(range(2466352, 2467110)), inplace = True)
data.getData().reset_index(drop = True, inplace = True)
resultData.getData().reset_index(drop = True, inplace = True)

In [None]:
data.getData().drop(list(range(2601136, 2607832)), inplace = True)
resultData.getData().drop(list(range(2601136, 2602120)), inplace = True)
data.getData().reset_index(drop = True, inplace = True)
resultData.getData().reset_index(drop = True, inplace = True)

In [None]:
data.getData().drop(list(range(3326116, 3326299)), inplace = True)
resultData.getData().drop(list(range(3326116, 3326119)), inplace = True)
data.getData().reset_index(drop = True, inplace = True)
resultData.getData().reset_index(drop = True, inplace = True)

In [None]:
data.getData().drop(list(range(3391151, 3391211)), inplace = True)
data.getData().reset_index(drop = True, inplace = True)
resultData.getData().drop(list(range(3391151, 3391160)), inplace = True)
resultData.getData().reset_index(drop = True, inplace = True)

In [None]:
data.getData().drop(list(range(3659720, 3659794)), inplace = True)
data.getData().reset_index(drop = True, inplace = True)
resultData.getData().drop(list(range(3659721, 3659721)), inplace = True)
resultData.getData().reset_index(drop = True, inplace = True)

In [None]:
data.getData().drop(list(range(3736982, 3737005)), inplace = True)
data.getData().reset_index(drop = True, inplace = True)

In [None]:
data.getData().drop(list(range(4338525, 4338554)), inplace = True)
data.getData().reset_index(drop = True, inplace = True)

In [None]:
print(len(resultData.getData()))
print(data.getRowCount())

4918008
4918008


In [None]:
diffWord(data, resultData.getData()[resultData.getWordColumn()].reset_index(drop = True), 40, 4338525).eval()

In [None]:
matches = 0

data.getData().reset_index(drop = True, inplace = True)
resultData.getData().reset_index(drop = True, inplace = True)

length = min(len(data.getData()), len(resultData.getData()))
misMatchesResultDataAccuracy = {
    "B-DISEASE": ZERO, 
    "I-DISEASE": ZERO, 
    "0": ZERO
}
misMatchesResultDataRecall = {
    "B-DISEASE": ZERO, 
    "I-DISEASE": ZERO, 
    "0": ZERO
}

bimismatch = ZERO

totalLabels = {
    "B-DISEASE": ZERO, 
    "I-DISEASE": ZERO, 
    "0": ZERO
}

for index in range(ZERO, length - ONE):
    if (data.getData()[labelCol][index] == resultData.getData()[labelCol][index]):
        matches += ONE
    else:
        # Recall
        if (data.getData()[labelCol][index] == "0"):
            misMatchesResultDataRecall[resultData.getData()[labelCol][index]] += ONE
        # Accuracy
        else:
            if (resultData.getData()[labelCol][index] == "0"):
                misMatchesResultDataAccuracy[resultData.getData()[labelCol][index]] += ONE
            else:
                bimismatch += ONE

    totalLabels[resultData.getData()[labelCol][index]] += ONE

In [None]:
print(matches / length)
print(misMatchesResultDataRecall)
print(misMatchesResultDataAccuracy)
print(bimismatch)
print(totalLabels)

0.9766088221084634
{'B-DISEASE': 50047, 'I-DISEASE': 23726, '0': 0}
{'B-DISEASE': 0, 'I-DISEASE': 0, '0': 29977}
11287
{'B-DISEASE': 89008, 'I-DISEASE': 61642, '0': 4767357}
