In [1]:
import pandas as pd

# Load the dataset from CSV
data = pd.read_csv("mtsamples.csv")

# Display the first few rows of the dataset
print(data.head())


   Unnamed: 0                                        description  \
0           0   A 23-year-old white female presents with comp...   
1           1           Consult for laparoscopic gastric bypass.   
2           2           Consult for laparoscopic gastric bypass.   
3           3                             2-D M-Mode. Doppler.     
4           4                                 2-D Echocardiogram   

             medical_specialty                                sample_name  \
0         Allergy / Immunology                         Allergic Rhinitis    
1                   Bariatrics   Laparoscopic Gastric Bypass Consult - 2    
2                   Bariatrics   Laparoscopic Gastric Bypass Consult - 1    
3   Cardiovascular / Pulmonary                    2-D Echocardiogram - 1    
4   Cardiovascular / Pulmonary                    2-D Echocardiogram - 2    

                                       transcription  \
0  SUBJECTIVE:,  This 23-year-old white female pr...   
1  PAST MEDICAL 

In [5]:
def preprocess_text(text):
    # Check if the text is not NaN (missing value)
    if isinstance(text, str):
        # Lowercase the text
        text = text.lower()

        # Remove non-alphabetic characters
        text = re.sub(r"[^a-zA-Z]", " ", text)

        # Tokenize the text
        tokens = text.split()

        # Remove stopwords
        stop_words = set(stopwords.words("english"))
        tokens = [word for word in tokens if word not in stop_words]

        # Lemmatize the tokens
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

        # Join the preprocessed tokens
        preprocessed_text = " ".join(tokens)

        return preprocessed_text
    else:
        return ""

# Apply preprocessing to the 'transcription' column
data["preprocessed_transcription"] = data["transcription"].apply(preprocess_text)

# Display the preprocessed text
print(data["preprocessed_transcription"].head())

0    subjective year old white female present compl...
1    past medical history difficulty climbing stair...
2    history present illness seen abc today pleasan...
3    mode left atrial enlargement left atrial diame...
4    left ventricular cavity size wall thickness ap...
Name: preprocessed_transcription, dtype: object


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed text
tfidf_matrix = vectorizer.fit_transform(data["preprocessed_transcription"])

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Display the TF-IDF matrix
print(tfidf_df.head())



    aa  aaaa   ab  abadeedleedlebadle  abandoned  abandonment  abated  abbott  \
0  0.0   0.0  0.0                 0.0        0.0          0.0     0.0     0.0   
1  0.0   0.0  0.0                 0.0        0.0          0.0     0.0     0.0   
2  0.0   0.0  0.0                 0.0        0.0          0.0     0.0     0.0   
3  0.0   0.0  0.0                 0.0        0.0          0.0     0.0     0.0   
4  0.0   0.0  0.0                 0.0        0.0          0.0     0.0     0.0   

   abbreviated       abc  ...  zuba  zumi  zung  zygoma  zygomatic  zyloprim  \
0          0.0  0.000000  ...   0.0   0.0   0.0     0.0        0.0       0.0   
1          0.0  0.000000  ...   0.0   0.0   0.0     0.0        0.0       0.0   
2          0.0  0.035463  ...   0.0   0.0   0.0     0.0        0.0       0.0   
3          0.0  0.000000  ...   0.0   0.0   0.0     0.0        0.0       0.0   
4          0.0  0.000000  ...   0.0   0.0   0.0     0.0        0.0       0.0   

   zymar  zyprexa    zyrtec  zyv

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_df, data["medical_specialty"], test_size=0.2, random_state=42)

# Initialize the SVM classifier
svm = SVC(kernel="linear")

# Train the classifier
svm.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = svm.predict(X_test)

# Evaluate the classifier
report = classification_report(y_test, y_pred)
print(report)


                                precision    recall  f1-score   support

          Allergy / Immunology       0.00      0.00      0.00         1
                       Autopsy       0.00      0.00      0.00         2
                    Bariatrics       0.00      0.00      0.00         3
    Cardiovascular / Pulmonary       0.23      0.30      0.26        69
                  Chiropractic       0.00      0.00      0.00         1
    Consult - History and Phy.       0.18      0.26      0.21       107
    Cosmetic / Plastic Surgery       0.00      0.00      0.00         4
                     Dentistry       0.00      0.00      0.00         8
                   Dermatology       0.00      0.00      0.00         3
          Diets and Nutritions       0.00      0.00      0.00         1
             Discharge Summary       0.18      0.24      0.20        21
          ENT - Otolaryngology       0.20      0.12      0.15        25
        Emergency Room Reports       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Find the best model

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_df, data["medical_specialty"], test_size=0.2, random_state=42)

# Initialize and train multiple classifiers
classifiers = {
    "SVM": SVC(kernel="linear"),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier()
}

results = {}  # Dictionary to store classification results

for name, classifier in classifiers.items():
    # Train the classifier
    classifier.fit(X_train, y_train)

    # Predict the labels for the test set
    y_pred = classifier.predict(X_test)

    # Evaluate the classifier
    report = classification_report(y_test, y_pred, output_dict=True)

    # Store the classification results
    results[name] = report

# Compare and select the best classifier based on a chosen evaluation metric
best_classifier = None
best_metric_value = 0.0

for name, result in results.items():
    metric_value = result['accuracy']  # Choose the evaluation metric (e.g., accuracy)

    if metric_value > best_metric_value:
        best_metric_value = metric_value
        best_classifier = name

# Print the results and the best classifier
for name, result in results.items():
    print(f"Classifier: {name}")
    print(classification_report(y_test, y_pred))
    print()

print(f"Best Classifier: {best_classifier}")


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classifier: SVM
                                precision    recall  f1-score   support

          Allergy / Immunology       0.00      0.00      0.00         1
                       Autopsy       0.00      0.00      0.00         2
                    Bariatrics       0.00      0.00      0.00         3
    Cardiovascular / Pulmonary       0.13      0.16      0.14        69
                  Chiropractic       0.00      0.00      0.00         1
    Consult - History and Phy.       0.09      0.12      0.11       107
    Cosmetic / Plastic Surgery       0.00      0.00      0.00         4
                     Dentistry       0.00      0.00      0.00         8
                   Dermatology       0.00      0.00      0.00         3
          Diets and Nutritions       0.00      0.00      0.00         1
             Discharge Summary       0.12      0.14      0.13        21
          ENT - Otolaryngology       0.00      0.00      0.00        25
        Emergency Room Reports       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr