In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd

In [3]:
df = df[df['Location'] == 'GP']
df = df[['History', 'Vetting']]
df

Unnamed: 0,History,Vetting
0,"Upper abdo pain, contant, worse after certain ...",C
1,Recurrent upper abdo pain after eating. No bet...,C
2,"Upper abdo fullness with bloating, did have a ...",C
3,Recurrent episodes of abdominal pain ?cause ?g...,C
4,"recurrent raised GGT, obesity, ?fatty infiltra...",C
...,...,...
309,GB polyp 7mm. Check for growth. December,D
310,AML on the left. Check size please in 12 months,D
311,Eye melanoma. Needs abdomen check every 12 mon...,D
312,Aortic aneurism measured 55mm in 2022. Please ...,D


In [4]:
df['Vetting'].value_counts()

Vetting
C    80
A    80
D    70
Name: count, dtype: int64

In [5]:
vetting_map = {'A': 0, 'C': 1, 'D': 2 }
df['Vetting'] = df['Vetting'].map(vetting_map)
df

Unnamed: 0,History,Vetting
0,"Upper abdo pain, contant, worse after certain ...",1
1,Recurrent upper abdo pain after eating. No bet...,1
2,"Upper abdo fullness with bloating, did have a ...",1
3,Recurrent episodes of abdominal pain ?cause ?g...,1
4,"recurrent raised GGT, obesity, ?fatty infiltra...",1
...,...,...
309,GB polyp 7mm. Check for growth. December,2
310,AML on the left. Check size please in 12 months,2
311,Eye melanoma. Needs abdomen check every 12 mon...,2
312,Aortic aneurism measured 55mm in 2022. Please ...,2


In [6]:
df.reset_index(inplace=True)
df['Vetting'].value_counts()

Vetting
1    80
0    80
2    70
Name: count, dtype: int64

In [7]:
df['History'].str.split().str.len().max()

40

In [8]:
X = df['History'].to_list()
y = df['Vetting'].to_list()

### Basic NLP

In [10]:
import string
from nltk import word_tokenize
from spellchecker import SpellChecker
from nltk.stem import WordNetLemmatizer

tokens = [word_tokenize(sentence) for sentence in X]
lowercase_tokenized_sentences = [
    [token.lower() for token in tokens] for tokens in tokens
]

allowed_punct = ['<', '>', '/']
no_punct_tokens = [
    [word for word in sentence if word in allowed_punct or word not in string.punctuation]
    for sentence in lowercase_tokenized_sentences
]

dict_map = {
    'fhx': 'fh',
    ('ph', 'previous history', 'hx', 'pmh'): 'history',
    'family history': 'fh',
    'fx': 'fh',
    'right upper quadrant': 'ruq',
    'left upper quadrant': 'luq',
    ('abd', 'abdo', 'upper abdomen', 'epigastric', 'upper abdo', 'epigastric', 'hypochondrial', 'epigastrium', 'hypochondrium'): 'abdomen',
    'gb': 'gallbladder',
    'c/o': 'complaining of',
    ('uss', 'us', 'sonogram'): 'ultrasound',
    ('ca', 'malignancy', 'tumor', 'mass', 'lump'): 'cancer',
    ('+', '++', '+++', 'plus plus'): 'severe',
    'hepatic': 'liver',
    ('fatty infiltration', 'nafld'): 'fatty liver',
    'xs': 'excess',
    'rundown': 'tired',
    ('symps', 'sx'): 'symptoms',
    ('nad', 'nil of note', 'nil'): 'normal',
    'jan': 'january',
    'feb': 'february',
    'mar': 'march',
    'jun': 'june',
    'jul': 'july',
    'aug': 'august',
    ('sept', 'sep'): 'september',
    'oct': 'october',
    'nov': 'november',
    'dec': 'december',
    'tro': 'to rule out',
    'pt': 'patient',
    ('rt', 'r'): 'right',
    ('lt', 'l'): 'left',
    'x': 'times',
    ('uss', 'us', 'sonography', 'sonogram'): 'ultrasound',
}

medical_dict_path = "C:/Users/LPA/Desktop/Auto-vetting/wordlist.txt"
spell = SpellChecker(distance=3)
spell.word_frequency.load_text_file(medical_dict_path)


def find_synonym(word):
    for key, value in dict_map.items():
        if isinstance(key, str):
            if word == key:
                return value
        elif isinstance(key, tuple):
            if word in key:
                return value
    return word


corrected_text = []
for sentence_tokens in no_punct_tokens:
    corrected_sentence = []
    for word in sentence_tokens:
        # Replace synonyms
        updated_word = find_synonym(word.lower())
        
        # Spell check
        if updated_word not in spell:
            suggestions = spell.candidates(updated_word)
            if suggestions:
                corrected_word = list(suggestions)[0]
                corrected_sentence.append(corrected_word)
            else:
                corrected_sentence.append(updated_word)
        else:
            corrected_sentence.append(updated_word)

    corrected_text.append(corrected_sentence)

corrected_sentences = [' '.join(words) for words in corrected_text]

lemmatizer = WordNetLemmatizer()
final_sentences = [
    ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(sentence)])
    for sentence in corrected_sentences
]

In [21]:
df = pd.DataFrame(final_sentences, columns=['Sentence'])
df['y'] = y
df

Unnamed: 0,Sentence,y
0,upper abdomen pain content worse after certain...,1
1,recurrent upper abdomen pain after eating no b...,1
2,upper abdomen fullness with bloating did have ...,1
3,recurrent episode of abdominal pain cause gall...,1
4,recurrent raised ggt obesity fatty infiltration,1
...,...,...
225,gallbladder polyp umm check for growth december,2
226,aml on the left check size please in 12 month,2
227,eye melanoma need abdomen check every 12 month...,2
228,aortic aneurism measured umm in 2022 please re...,2
