In [51]:
import pandas as pd
import re, string

In [52]:
def preprocess_text(text: str, lowercase=True, punctuation_removal=False):
    """Normalise a single text value.

    Args:
        text: The string to clean. ``None`` and float NaN (the value pandas
            uses for a missing cell in an object column) are both treated as
            missing.
        lowercase: When true, convert the text to lower case.
        punctuation_removal: When true, delete every character listed in
            ``string.punctuation``.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing.
    """
    # NaN is the only float that compares unequal to itself, so this detects a
    # missing pandas cell without needing any extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if lowercase:
        text = text.lower()
    if punctuation_removal:
        # A translation table whose third argument lists characters to delete.
        text = text.translate(str.maketrans('', '', string.punctuation))
    return text

In [53]:
# Load the raw samples; index_col=0 uses the CSV's first column as the row index.
df = pd.read_csv("mtsamples.csv", index_col=0)
# Preview the first rows.
df.head()

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [54]:
# Short description of the first record.
df["description"].iloc[0]

' A 23-year-old white female presents with complaint of allergies.'

In [55]:
# Full transcription text of the first record.
df["transcription"].iloc[0]

'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,

In [56]:
# Short description of the second record.
df["description"].iloc[1]

' Consult for laparoscopic gastric bypass.'

In [57]:
# Full transcription text of the second record.
df["transcription"].iloc[1]

'PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to public seating, and lifting objects off the floor.  He exercises three times a week at home and does cardio.  He has difficulty walking two blocks or five flights of stairs.  Difficulty with snoring.  He has muscle and joint pains including knee pain, back pain, foot and ankle pain, and swelling.  He has gastroesophageal reflux disease.,PAST SURGICAL HISTORY:, Includes reconstructive surgery on his right hand 13 years ago.  ,SOCIAL HISTORY:, He is currently single.  He has about ten drinks a year.  He had smoked significantly up until several months ago.  He now smokes less than three cigarettes a day.,FAMILY HISTORY:, Heart disease in both grandfathers, grandmother with stroke, and a grandmother with diabetes.  Denies obesity and hypertension in other family members.,CURRENT MEDICATIONS:, None.,ALLERGIES:,  He is allergic to Penicillin.,MISCELLANEOUS/EATING HISTORY:, He has b

-------------------

In [59]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4999 entries, 0 to 4998
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   description        4999 non-null   object
 1   medical_specialty  4999 non-null   object
 2   sample_name        4999 non-null   object
 3   transcription      4966 non-null   object
 4   keywords           3931 non-null   object
dtypes: object(5)
memory usage: 234.3+ KB


Removing rows with missing values

In [60]:
# Keep only rows that have both text fields. Rebinding `df` to a new frame
# (rather than inplace=True) leaves the previously loaded object untouched;
# the explicit copy guarantees later column assignments act on an independent frame.
df = df.dropna(subset=['transcription', 'description']).copy()

In [61]:
# Re-check the non-null counts after dropping rows with missing text.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4966 entries, 0 to 4998
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   description        4966 non-null   object
 1   medical_specialty  4966 non-null   object
 2   sample_name        4966 non-null   object
 3   transcription      4966 non-null   object
 4   keywords           3898 non-null   object
dtypes: object(5)
memory usage: 232.8+ KB


In [62]:
# Run preprocess_text with its default options over both free-text columns,
# then preview the result.
for text_column in ('transcription', 'description'):
    df[text_column] = df[text_column].apply(preprocess_text)
df.head()

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,a 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"subjective:, this 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"past medical history:, he has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"history of present illness: , i have seen abc ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-d m-mode. doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-d m-mode: , ,1. left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-d echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. the left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


Splitting the DataFrame into three subsets (train, validation and test) with a ratio of 80/10/10

In [64]:
from sklearn.model_selection import train_test_split

In [69]:
# Hold out 20 % of the rows, then halve that hold-out into validation and test.
# Both calls use the same fixed integer random_state.
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.2)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Combined row count of the three subsets; denominator for the ratios below.
len_t = sum(len(subset) for subset in (train_df, test_df, val_df))
print(f"Train set size: {len(train_df)} | ratio : {len(train_df) / len_t}")
print(f"Validation set size: {len(val_df)} | ratio : {len(val_df) / len_t}")
print(f"Test set size: {len(test_df)} | ratio : {len(test_df) / len_t}")

Train set size: 3972 | ratio : 0.7998389045509464
Validation set size: 497 | ratio : 0.10008054772452678
Test set size: 497 | ratio : 0.10008054772452678


Saving the sets

In [70]:
import os

# Create the target folder if it does not exist yet.
output_dir = 'preprocessed_data'
os.makedirs(output_dir, exist_ok=True)

# Write each subset to its own CSV; index=False leaves the row index out of the file.
for file_name, split_frame in (('train.csv', train_df), ('val.csv', val_df), ('test.csv', test_df)):
    split_frame.to_csv(os.path.join(output_dir, file_name), index=False)

print(f"Data saved in {output_dir}")

Data saved in preprocessed_data


-------------

In [76]:
# Read the saved training split back from disk and preview its first rows.
tr = pd.read_csv("preprocessed_data/train.csv")
tr.head()

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,laparoscopic-assisted vaginal hysterectomy. ...,Obstetrics / Gynecology,Vaginal Hysterectomy - Laparoscopic-Assisted,"preoperative diagnoses,1. abnormal uterine bl...","obstetrics / gynecology, abnormal uterine blee..."
1,a 6-year-old male who is a former 27-week pre...,Neurology,Status Epilepticus,"chief complaint:, status epilepticus.,history...",
2,postpartum tubal ligation and removal of uppe...,Obstetrics / Gynecology,Tubal Ligation - Postpartum,"preoperative diagnoses:, multiparity requeste...","obstetrics / gynecology, sterilization, fallop..."
3,bilateral scrotal orchiectomy,Urology,Orchiectomy,"bilateral scrotal orchectomy,procedure:,: the...","urology, scrotum, hemostasis, marcaine, catgut..."
4,left little finger extensor tendon laceration...,Surgery,Extensor Tendon Repair,"preoperative diagnosis:, left little finger e...","surgery, extensor tendon laceration, bier bloc..."


In [72]:
# Read the saved test split back from disk and display it.
te = pd.read_csv("preprocessed_data/test.csv")
te

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,trabeculectomy with mitomycin c - sample/temp...,Surgery,Trabeculectomy,"preoperative diagnosis:, open angle glaucoma o...","surgery, trabeculectomy, kelly-descemet punch,..."
1,"chest x-ray on admission, no acute finding, n...",Discharge Summary,Discharge Summary - COPD,"procedures:,1. chest x-ray on admission, no a...",
2,left lower extremity venous doppler ultrasound,Cardiovascular / Pulmonary,Lower Extremity Venous Doppler,left lower extremity venous doppler ultrasound...,"cardiovascular / pulmonary, popliteal veins, s..."
3,lumbar osteomyelitis and need for durable cen...,Pediatrics - Neonatal,Broviac Catheter Placement,"preoperative diagnoses:,1. lumbar osteomyelit...","pediatrics - neonatal, lumbar osteomyelitis, c..."
4,total vaginal hysterectomy. menometrorrhagi...,Discharge Summary,Hysterectomy - Discharge Summary - 1,"admission diagnoses:,1. menometrorrhagia.,2. ...","discharge summary, dysmenorrhea, uterine fibro..."
...,...,...,...,...,...
492,request for consultation to evaluate stomatit...,Consult - History and Phy.,Request For Consultation,"reason for consultation:, please evaluate stom...","consult - history and phy., stomatitis, nsaid,..."
493,well child - left lacrimal duct stenosis,Pediatrics - Neonatal,Well-Child Check - 1,"chief complaint:, well-child check.,history o...","pediatrics - neonatal, well-child check, drain..."
494,"radical resection of tumor of the scalp, exci...",Surgery,Resection of Tumor of Scalp,"preoperative diagnosis: , squamous cell carcin...","surgery, squamous cell carcinoma of the scalp,..."
495,closure of gastrostomy placed due to feeding ...,Surgery,Surgical Closure of Gastrostomy,"preoperative diagnosis: , gastrostomy (gastroc...","surgery, gastrocutaneous fistula, nurolon, clo..."


In [73]:
# Read the saved validation split back from disk and display it.
va = pd.read_csv("preprocessed_data/val.csv")
va

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,a woman presenting to our clinic for the firs...,Orthopedic,Bilateral Hip Pain,"history of present illness: ,the patient is a...","orthopedic, bilateral hip pain, femoroacetabul..."
1,patient with sudden onset dizziness and rue c...,Consult - History and Phy.,MCA Aneurysm,"hx: ,this 46y/o rhm with htn was well until 2 ...","consult - history and phy., mca aneurysm, rue ..."
2,placement of right new ventriculoperitoneal (...,Neurosurgery,VP Shunt Placement,"title of operation: , placement of right new v...","neurosurgery, ommaya reservoir, frontal, strat..."
3,visually significant nuclear sclerotic catara...,Surgery,Phacoemulsification & Lens Implantation - 1,"preoperative diagnosis:, visually significant...","surgery, retrobulbar block, posterior chamber ..."
4,this patient was seen in clinic for a school...,Pediatrics - Neonatal,School Physical - 1,"subjective:, this patient was seen in clinic ...","pediatrics - neonatal, school physical, calciu..."
...,...,...,...,...,...
492,fall with questionable associated loss of con...,Neurology,Epidural Hematoma,"cc:, fall with questionable associated loss of...","neurology, loss of consciousness, parietal epi..."
493,a routine return appointment for a 71-year-ol...,Cardiovascular / Pulmonary,Chronic Atrial Fibrillation,"reason for visit:, this is a routine return a...",
494,she progressed in labor throughout the day. ...,Surgery,Delivery Note - 4,"delivery note: , the patient came in around 03...","surgery, iupc, meconium, pitocin, epidural, re..."
495,normal physical exam template. well develope...,Consult - History and Phy.,Normal Physical Exam Template - 5,"general: , well developed, well nourished, ale...","consult - history and phy., respiratory, abdom..."
