This is the code for the final NLP pipeline

In [1]:
#Converting the unstructured data into a semistructured data set so we can understand the prefixes to the doctor's notes

import pandas as pd
import re

# Path to the text file containing doctor's notes
file_path = 'AnonymizedClinicalAbbreviationsAndAcronymsDataSet.txt'

# Read the content of the file with explicit encoding specification
with open(file_path, 'r', encoding='ISO-8859-1') as file:
    notes_content = file.read()

# Split words based on "|" or new line
split_words = re.split(r'\||\n', notes_content)

# Remove empty strings from the list
split_words = [word.strip() for word in split_words]

# Reshape the list into a DF with 7 columns
num_columns = 7
num_rows = len(split_words) // num_columns
reshaped_words = [split_words[i:i + num_columns] for i in range(0, len(split_words), num_columns)]

# Create DF from the reshaped list
doctorsnotes = pd.DataFrame(reshaped_words, columns=[f'Part {i + 1}' for i in range(num_columns)])

num_records = len(doctorsnotes)
print(f'Number of records in doctorsnotes: {num_records}')


doctorsnotes.tail()

Number of records in doctorsnotes: 37501


Unnamed: 0,Part 1,Part 2,Part 3,Part 4,Part 5,Part 6,Part 7
37496,VAD,vincristine adriamycin and dexamethasone,"VAD,",172.0,175.0,IMPRESSION,He has been receiving weekly Procrit injection...
37497,VAD,vincristine adriamycin and dexamethasone,VAD,250.0,252.0,HISTORY OF PRESENT ILLNESS,"Within a month, he developed recurrent hip pai..."
37498,VAD,vincristine adriamycin and dexamethasone,VAD,181.0,183.0,HISTORY OF PRESENT ILLNESS,He had a serum protein electrophoresis with a ...
37499,VAD,vincristine adriamycin and dexamethasone,VAD,60.0,62.0,PROCEDURES,DISCHARGE DIAGNOSES: Multiple myeloma. PROCEDU...
37500,,,,,,,


In [3]:
#check for presence of one or more conditions in medical note - create separate csvs for each combination that exists in the dataset


from itertools import combinations
import pandas as pd
import csv

word_combinations = ["asthma", "iron def", "pneumoni", "substance", "fibrosis", "malnutr"]


all_combinations = []
for r in range(1, len(word_combinations) + 1):
    all_combinations.extend(combinations(word_combinations, r))

occurrences_count = {combination: 0 for combination in all_combinations}

for entry in doctorsnotes['Part 7']:
    if entry is not None:
        for combination in all_combinations:
            #implicit tokenization
            if all(word in str(entry).lower() for word in combination):
                occurrences_count[combination] += 1

# Iterate through occurrences and write to separate CSV files
for combination, count in occurrences_count.items():
    if count != 0:
        print(f"Occurrences of '{combination}': {count}")

        # Create a new CSV file for each set of occurrences
        file_name = f'{combination}.csv'
        with open(file_name, 'w', newline='') as file:
            csv_writer = csv.writer(file, delimiter='\t')

            # Write columns up to 'Part 7'
            csv_writer.writerow(doctorsnotes.columns)
            
            # Write data rows for the current combination
            for _, row in doctorsnotes.iterrows():
                if row['Part 7'] is not None and all(word in str(row['Part 7']).lower() for word in combination):
                    csv_writer.writerow(row)

        print(f'Filtered notes saved to: {file_name}')

# Get the number of records in doctorsnotes
num_records = len(doctorsnotes)
print(f'Number of records in doctorsnotes: {num_records}')

#duplicates were removed 

Occurrences of '('asthma',)': 326
Filtered notes saved to: ('asthma',).csv
Occurrences of '('iron def',)': 51
Filtered notes saved to: ('iron def',).csv
Occurrences of '('pneumoni',)': 763
Filtered notes saved to: ('pneumoni',).csv
Occurrences of '('substance',)': 112
Filtered notes saved to: ('substance',).csv
Occurrences of '('fibrosis',)': 116
Filtered notes saved to: ('fibrosis',).csv
Occurrences of '('malnutr',)': 46
Filtered notes saved to: ('malnutr',).csv
Occurrences of '('asthma', 'iron def')': 1
Filtered notes saved to: ('asthma', 'iron def').csv
Occurrences of '('asthma', 'pneumoni')': 22
Filtered notes saved to: ('asthma', 'pneumoni').csv
Occurrences of '('asthma', 'substance')': 8
Filtered notes saved to: ('asthma', 'substance').csv
Occurrences of '('asthma', 'fibrosis')': 1
Filtered notes saved to: ('asthma', 'fibrosis').csv
Occurrences of '('asthma', 'malnutr')': 1
Filtered notes saved to: ('asthma', 'malnutr').csv
Occurrences of '('iron def', 'pneumoni')': 1
Filtered no

In [5]:
#Establish whether patients have the conditions or not

import pandas as pd
from nltk import word_tokenize

def detect_conditions(tokens):
    # Define lists of diagnostic n-grams
    diag_ngrams = ["asthma", "pneumoni", "substance", "fibrosis", "malnutr"]

    # Initialize variables
    detected_ngrams = []

    # Perform keyword matching
    for keyword in diag_ngrams:
        for i, token in enumerate(tokens):
            if keyword in token.lower():
                detection = 1
                detected_ngram_index = i
                detected_ngrams.append([keyword, detection, detected_ngram_index, 0])
                break
      
    
    for i, token in enumerate(tokens[:-1]):
      # Check for the specific sequence "iron def"
        if i < len(tokens) - 1 and token.lower() == "iron" and (tokens[i+1].lower() == "deficient" or tokens[i+1].lower() == "deficiency") :
            detected_ngrams.append(["iron def", 1, i, 0])
            
            
    negation_words = {'not', 'no', 'never', 'none', 'nothing', 'neither', 'nor', 'denies', 'negative', 'without','probable','likely','possible','maybe','probably'}

    # Check for negation words within a specific window around the detected n-grams
    for i, word in enumerate(tokens):
        for detected_ngram in detected_ngrams:
            # If negation word found within +/- 8 words from the detected n-gram
            if word.lower() in negation_words and (abs(i - detected_ngram[2]) <= 8):
                detected_ngram[1] = 2
                detected_ngram[3] = word.lower()

    
                  


            
    # Create a dictionary to store the verdict for each keyword
    verdict_dict = {}

    # Check the result for each n-gram
    for ngram_info in detected_ngrams:
        keyword = ngram_info[0]
        detection_status = "Positive" if ngram_info[1] == 1 else "Negative" if ngram_info[1] == 2 else "Error"
        verdict_dict[keyword] = detection_status

    return verdict_dict



csv_file_path = 'UnstructuredFinalNoHad.csv'


df = pd.read_csv(csv_file_path)

# Create columns for verdicts in the DF
verdict_columns = ['Verdict - asthma', 'Verdict - iron def', 'Verdict - pneumoni', 'Verdict - substance', 'Verdict - fibrosis', 'Verdict - malnutr']
for column in verdict_columns:
    df[column] = ""

# Iterate over all rows in the specified column
for i, text in enumerate(df['Part 7']):
    tokens = word_tokenize(str(text))  # Convert to string to handle potential NaN values
    #print(tokens)
    verdict_dict = detect_conditions(tokens)

    # Update DF with the verdicts
    for keyword, verdict in verdict_dict.items():
        column_name = f'Verdict - {keyword}'
        df.at[i, column_name] = verdict

# Save the updated DF to a new CSV file called Verdict
output_csv_path = 'Verdict.csv'
df.to_csv(output_csv_path, index=False)

In [6]:
#Compare to labelled and get scores

import pandas as pd
from sklearn.metrics import confusion_matrix


csv_file_path = 'Verdict.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)


correct = 363
totalcomparisons = 363 # had to set this to the number of comparisons - not the number of records!
falsenegative = 0
falsepositive = 0
truenegative = 0
truepositive = 0


# Specify the columns to compare
columns_to_compare = ['asthma', 'iron def', 'pneumoni', 'substance', 'fibrosis', 'malnutr']

# Iterate over each column
for column in columns_to_compare:
    marked_column = f'Marked - {column}'
    verdict_column = f'Verdict - {column}'
    comparison_column = f'Comparison - {column}'

    # Create a new 'Comparison' column and initialize it with zeros
    df[comparison_column] = ""
    
    
    # Iterate over all rows in the DataFrame
    for i in range(len(df)):
        # Check if both 'Marked' and 'Verdict' values are not blank
        if not pd.isna(df.at[i, marked_column]) and not pd.isna(df.at[i, verdict_column]):
            # Check if 'Marked' value matches 'Verdict' value
            if df.at[i, marked_column] == df.at[i, verdict_column] and pd.notna(df.at[i, marked_column]):
            #if df.at[i, marked_column] == df.at[i, verdict_column]:
                df.at[i, comparison_column] = 1
                
                if df.at[i, marked_column] == 'Negative' and df.at[i, verdict_column] == 'Negative':
                    truenegative += 1
                    #print(f"TRUE NEGATIVE for {column}: {df.at[i,'Part 7']} ")
                elif df.at[i, marked_column] == 'Positive' and df.at[i, verdict_column] == 'Positive':
                    truepositive += 1
                    #print(f"TRUE POSITIVE for {column}: {df.at[i,'Part 7']} ")
            #elif df.at[i, marked_column] == "" or df.at[i, verdict_column] == "":
             #   df.at[i, comparison_column] = ""
            else:
                df.at[i, comparison_column] = 0
                print(f"Mismatch in row {i}, column {column}: Marked - {df.at[i, marked_column]}, Verdict - {df.at[i, verdict_column]}, ") #{df.at[i,'Part 7']}
                correct -= 1
                if df.at[i, marked_column] == 'Positive' and df.at[i, verdict_column] == 'Negative':
                    falsenegative += 1
                elif df.at[i, marked_column] == 'Negative' and df.at[i, verdict_column] == 'Positive':
                    falsepositive += 1
                    print(f"FALSE POSITIVE for {column}: {df.at[i,'Part 7']} ")
                
                
                
                
                
# Save the updated DF to a new CSV file
output_csv_path = 'output_comparison.csv'
df.to_csv(output_csv_path, index=False)

# This file is then used to match the doctors' notes to the structured dataset records


#Accuracy
score = round(correct/totalcomparisons,4)

# Precision
precision = round(truepositive / (truepositive + falsepositive),4)

# Recall (Sensitivity)
recall = round(truepositive / (truepositive + falsenegative),4)

# F1 Score
f1 = round(2 * (precision * recall) / (precision + recall),4)

# Specificity
specificity = round(truenegative / (truenegative + falsepositive),4)


sum = falsenegative+falsepositive+truenegative+truepositive


print("Accuracy: " + str(score))
print("False Negatives: " + str(falsenegative))
print("False Positives: " + str(falsepositive))
print("True Negatives: " + str(truenegative))
print("True Positives: " + str(truepositive))
print(sum)
print("Precision:", precision)
print("Recall (Sensitivity):", recall)
print("F1 Score:", f1)
print("Specificity:", specificity)




# Confusion Matrix
#confusionmatrix = confusion_matrix([1] * truepositive + [0] * falsenegative, [1] * falsepositive + [0] * truenegative, labels=[1, 0])

#print("Confusion Matrix:")
#print(confusionmatrix)




Mismatch in row 8, column asthma: Marked - Positive, Verdict - Negative, 
Mismatch in row 19, column asthma: Marked - Positive, Verdict - Negative, 
Mismatch in row 30, column asthma: Marked - Positive, Verdict - Negative, 
Mismatch in row 37, column asthma: Marked - Positive, Verdict - Negative, 
Mismatch in row 38, column asthma: Marked - Positive, Verdict - Negative, 
Mismatch in row 289, column asthma: Marked - Positive, Verdict - Negative, 
Mismatch in row 291, column asthma: Marked - Negative, Verdict - Positive, 
FALSE POSITIVE for asthma: 3. She denies any history of hypertension, diabetes, or asthma. MEDICATIONS: 1. Depakote 500 mg p.o. q.a.m. 2. Paxil CR 20 mg p.o. q.a.m. 3. Seroquel 25 mg two tablets p.o. q.h.s. 4. Seroquel 25 mg p.o. p.r.n. 5. Neurontin 600 mg p.o. q.h.s. ALLERGIES: The patient is allergic to penicillin which causes a rash. 
Mismatch in row 304, column asthma: Marked - Negative, Verdict - Positive, 
FALSE POSITIVE for asthma: The prenatal course has been ot

Output_comparison.csv was then used to match the doctors' notes to the structured dataset records