In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [17]:
# Step 1: Data Preprocessing
# Read the CSV, keep only the text column and the two label columns used
# below, and drop every row that is missing any of those three values.
# This is done as one chained expression so `data` is assigned once, fully
# formed, instead of being mutated afterwards with dropna(inplace=True);
# re-running the cell therefore always rebuilds the same frame from the file.
data = (
    pd.read_csv('mtsamples.csv')[['transcription', 'medical_specialty', 'sample_name']]
    .dropna()
)

X = data['transcription']                   # text passed to the vectorizer later
y_subspecialty = data['medical_specialty']  # label series for the specialty task
y_class_label = data['sample_name']         # second label series (sample_name column)

In [18]:
# Hold out 30% of the rows for testing. The text and both label series are
# passed to a single call so all three are split on the same rows, and the
# fixed random_state makes the split repeatable.
(
    X_train,
    X_test,
    y_train_subspecialty,
    y_test_subspecialty,
    y_train_class_label,
    y_test_class_label,
) = train_test_split(
    X,
    y_subspecialty,
    y_class_label,
    test_size=0.3,
    random_state=40,
)


In [19]:
# Quick look at the held-out sample_name labels produced by the split above.
print(y_test_class_label)

1013                    Coarctation of Aorta 
434      Phacoemulsification Of Cataract - 2 
460                        Pelvic Laparotomy 
120                              HPV Consult 
1498                       Ultrasound OB - 1 
                        ...                  
3556          Esophagogastroduodenoscopy - 4 
2318               Arthroscopy Shoulder/Knee 
567             Lymph Node Excisional Biopsy 
3732               Nasolabial Fold Elevation 
1542         Nuclear Medicine Lymphatic Scan 
Name: sample_name, Length: 1490, dtype: object


In [20]:
# Step 2: Feature Extraction
# The TF-IDF vocabulary and weights are learned from the training text only;
# the test text is then projected with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(raw_documents=X_train)
X_test_vectorized = vectorizer.transform(raw_documents=X_test)

In [21]:
# Model Selection and Fine-tuning
# For Subspecialty of Medicine Classification

# Multinomial Naive Bayes, with its `alpha` value chosen by a cross-validated
# grid search (cv=5) over three candidates on the TF-IDF training features.
nb_model = MultinomialNB()
nb_param_grid = dict(alpha=[0.1, 0.5, 1.0])
nb_grid_search = GridSearchCV(
    estimator=nb_model,
    param_grid=nb_param_grid,
    cv=5,
)
nb_grid_search.fit(X_train_vectorized, y_train_subspecialty)

In [22]:
# Get the best model and its evaluation results
# The grid search was built without overriding `refit`, and GridSearchCV
# refits the winning configuration on the whole training set by default, so
# best_estimator_ is already trained. Calling .fit() on it again would only
# repeat that work, so it is used directly.
best_nb_model = nb_grid_search.best_estimator_
y_pred_subspecialty_nb = best_nb_model.predict(X_test_vectorized)
# zero_division=0 reports 0.0 (instead of warning) for classes that were
# never predicted.
report_subspecialty_nb = classification_report(
    y_test_subspecialty,
    y_pred_subspecialty_nb,
    zero_division=0,
)

In [23]:
# Emit the heading and the Naive Bayes classification report on separate lines.
print(
    "Subspecialty of Medicine Classification Report using Multinomial Naive Bayes:",
    report_subspecialty_nb,
    sep="\n",
)

Subspecialty of Medicine Classification Report using Multinomial Naive Bayes:
                                precision    recall  f1-score   support

          Allergy / Immunology       0.00      0.00      0.00         2
                       Autopsy       0.00      0.00      0.00         1
                    Bariatrics       0.00      0.00      0.00         8
    Cardiovascular / Pulmonary       0.53      0.15      0.23       123
                  Chiropractic       0.00      0.00      0.00         6
    Consult - History and Phy.       0.24      0.99      0.38       147
    Cosmetic / Plastic Surgery       0.00      0.00      0.00         8
                     Dentistry       0.00      0.00      0.00        10
                   Dermatology       0.00      0.00      0.00         8
          Diets and Nutritions       0.00      0.00      0.00         3
             Discharge Summary       0.00      0.00      0.00        39
          ENT - Otolaryngology       0.00      0.00      

KeyboardInterrupt: 