In [108]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

In [109]:
#Download stopwords 
nltk.download('stopwords')
stop = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\T8633\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [110]:
# Load data
df = pd.read_csv("C:\\Users\\T8633\\Medical-Analysis\\data\\MTS_samples.csv")


In [111]:

#Convert transcription to string and drop blanks
df['transcription'] = df['transcription'].astype(str)
df = df[df['transcription'].str.strip() != ''].reset_index(drop=True)


In [112]:
# Normalize whitespace and remove invisible characters
def normalize_text(text):
    text = text.replace('\xa0', ' ').replace('\u200b', ' ')
    text = re.sub(r'\r\n|\r', '\n', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['transcription'] = df['transcription'].map(normalize_text)

In [113]:
# Basic cleaning
def basic_clean(text):
    text = text.lower()
    text = re.sub(r'\b(subjective|objective|assessment|plan|hpi|pmh|ros|pe):', ' ', text)
    text = re.sub(r'[^\x00-\x7f]', ' ', text)  # remove non-ASCII
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_text'] = df['transcription'].map(basic_clean)

In [114]:
 #Normalize spacing (split camelCase, digits from letters)
def normalize_spacing(text):
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
    text = re.sub(r'(?<=[a-zA-Z])(?=\d)', ' ', text)
    text = re.sub(r'(?<=\d)(?=[a-zA-Z])', ' ', text)
    return text

df['clean_text'] = df['clean_text'].map(normalize_spacing)

# Remove common medical headers and boilerplate phrases
df['clean_text'] = df['clean_text'].str.replace(
    r'\b(preoperative diagnosis|preoperative diagnoses|postoperative diagnosis|history of present illness|past medical history|description)\b[:]*', 
    '', 
    regex=True

)

# Remove duplicate consecutive words from clean_text BEFORE lemmatization
df['clean_text'] = df['clean_text'].str.replace(r'\b(\w+)( \1\b)+', r'\1', regex=True)

In [115]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Tokenize and lemmatize
def tokenize_lemmatize(text):
    doc = nlp(text)
    tokens = [
        tok.lemma_.lower().strip()
        for tok in doc
        if tok.is_alpha and tok.lemma_ not in stop and len(tok) > 2
    ]
    return " ".join(tokens)

df['nlp_text'] = df['clean_text'].map(tokenize_lemmatize)

# Remove duplicate consecutive words
df['nlp_text'] = df['nlp_text'].str.replace(r'\b(\w+)( \1\b)+', r'\1', regex=True)

In [118]:
def remove_repeated_phrases(text, max_ngram=5):
    words = text.split()
    seen = set()
    result = []
    i = 0
    while i < len(words):
        for n in range(max_ngram, 1, -1):  # Try longer phrases first
            if i + n <= len(words):
                phrase = ' '.join(words[i:i+n])
                if phrase in seen:
                    i += n
                    break
                seen.add(phrase)
        else:
            result.append(words[i])
            i += 1
    return ' '.join(result)

df['nlp_text'] = df['nlp_text'].map(lambda x: remove_repeated_phrases(x))

In [119]:
# Drop empty rows
df = df[df['nlp_text'].str.strip() != ''].reset_index(drop=True)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
X_tfidf = vectorizer.fit_transform(df['nlp_text'])

# Output shape
print("TF-IDF shape:", X_tfidf.shape)


TF-IDF shape: (4997, 2000)


In [120]:
df[['clean_text', 'nlp_text']].head(10)


Unnamed: 0,clean_text,nlp_text
0,", this 23-year-old white female presents with ...",year old white female present complaint allerg...
1,", he has difficulty climbing stairs, difficult...",difficulty climb stair difficulty airline seat...
2,", i have seen abc today. he is a very pleasan...",see abc today pleasant gentleman year old poun...
3,"2-d m-mode: , ,1. left atrial enlargement with...",mode leave atrial enlargement diameter normal ...
4,1. the left ventricular cavity size and wall t...,left ventricular cavity size wall thickness ap...
5,", morbid obesity., ,morbid obesity.,procedure...",morbid obesity laparoscopic antecolic antegast...
6,",1. deformity, right breast reconstruction.,2....",deformity right breast reconstruction excess s...
7,"2-d echocardiogram,multiple views of the heart...",echocardiogram multiple view heart great vesse...
8,", lipodystrophy of the abdomen and thighs.,, ...",lipodystrophy abdomen thigh suction assist lip...
9,",1. normal cardiac chambers size.,2. normal le...",normal cardiac chamber size normal leave ventr...
