# Medical transcript classification [sklearn]
* Multiclass classification of medical transcripts by medical specialty.
* Reference notebook: <https://www.kaggle.com/code/leekahwin/text-classification-using-n-gram-0-8-f1/notebook>
* Dataset: <https://www.kaggle.com/code/leekahwin/text-classification-using-n-gram-0-8-f1/input>

## Install necessary dependencies

In [14]:
!pip install nltk



## Import libraries

In [15]:
import os
import string
from typing import Iterable

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer

import giskard
from giskard import Dataset, Model

## Download NLTK stopwords corpus

In [16]:
# Download NLTK's English stop-word corpus; it is read later in this notebook
# through `stopwords.words("english")` when the text preprocessing is defined.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mykytaalekseiev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Define constants

In [17]:
# Medical specialties kept as classification targets; rows carrying any other
# label are filtered out when the data is loaded.
LABELS_LIST = [
    "Neurosurgery",
    "ENT - Otolaryngology",
    "Discharge Summary",
    "General Medicine",
    "Gastroenterology",
    "Neurology",
    "SOAP / Chart / Progress Notes",
    "Obstetrics / Gynecology",
    "Urology",
]

# Columns of the transcripts frame: the model input and the label to predict.
TEXT_COLUMN_NAME = "transcription"
TARGET_COLUMN_NAME = "medical_specialty"

# Seed shared by the train/test split and the random forest.
RANDOM_SEED = 8888

# Dataset location, relative to the current working directory.
PATH_DATA = os.path.join(
    os.curdir,
    "datasets",
    "medical_transcript_classification_dataset",
    "mtsamples.csv",
)

## Load data

In [18]:
def load_data() -> pd.DataFrame:
    """Read the transcripts CSV and return the rows usable for classification.

    The listed unused columns are discarded and surrounding whitespace is stripped
    from every remaining column. Only rows whose specialty is in ``LABELS_LIST``
    and whose transcript is not missing are returned; the original row index is kept.
    """
    unused_columns = ['Unnamed: 0', "description", "sample_name", "keywords"]
    transcripts = (
        pd.read_csv(PATH_DATA)
        .drop(columns=unused_columns)
        .apply(lambda column: column.str.strip())
    )

    # Both conditions are evaluated on the stripped frame and combined into one mask.
    has_known_label = transcripts[TARGET_COLUMN_NAME].isin(LABELS_LIST)
    has_transcript = transcripts[TEXT_COLUMN_NAME].notna()
    return transcripts[has_known_label & has_transcript]

transcript_df = load_data()

## Train-test split

In [19]:
# No split size is passed, so scikit-learn's default applies. The features are
# selected with double brackets so they stay a one-column DataFrame.
X_train, X_test, y_train, y_test = train_test_split(
    transcript_df[[TEXT_COLUMN_NAME]],
    transcript_df[TARGET_COLUMN_NAME],
    random_state=RANDOM_SEED,
)

## Wrap dataset with giskard

In [20]:
# Re-attach the test labels to the test features: the wrapped dataset is built
# from a single frame and identifies the label by column name.
raw_data = pd.concat([X_test, y_test], axis="columns")
wrapped_data = Dataset(
    raw_data,
    name="medical_transcript_dataset",
    target=TARGET_COLUMN_NAME,
    column_types={TEXT_COLUMN_NAME: "text"},
)

## Define preprocessing steps

In [21]:
stemmer = SnowballStemmer("english")
stop_words = stopwords.words("english")

# Built once here instead of once per row inside the preprocessing function.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Set membership is constant-time; `stop_words` itself is left as a list for any other user.
_STOP_WORDS = frozenset(stop_words)


def _normalize_transcript(text: str) -> str:
    """Lower-case, strip punctuation, stem and drop stop-words from one transcript."""
    tokens = text.lower().translate(_PUNCTUATION_TABLE).split()
    stems = (stemmer.stem(token) for token in tokens)
    # NOTE(review): stop-words are filtered *after* stemming, so a stop-word whose
    # stem differs from its surface form may survive. The order is kept as-is
    # because changing it would change the trained model — confirm whether intended.
    return ' '.join(stem for stem in stems if stem not in _STOP_WORDS)


def preprocess_text(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose text column has been normalized.

    Each value of ``df[TEXT_COLUMN_NAME]`` is lower-cased, stripped of punctuation,
    split on whitespace, stemmed, filtered against the English stop-word list and
    re-joined with single spaces.

    The input frame is NOT modified. Because this function runs inside the pipeline
    on both ``fit`` and ``predict``, mutating the argument in place would silently
    rewrite the caller's train/test frames and make re-running a cell preprocess
    text that was already preprocessed.

    Args:
        df: Frame containing a ``TEXT_COLUMN_NAME`` column of strings.

    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    df = df.copy()
    df[TEXT_COLUMN_NAME] = df[TEXT_COLUMN_NAME].apply(_normalize_transcript)
    return df

text_preprocessor = FunctionTransformer(preprocess_text)

In [22]:
def adapt_vectorizer_input(df: pd.DataFrame) -> Iterable:
    """Return the first column of ``df`` as a one-dimensional Series.

    Text vectorizers iterate over their input, and iterating a DataFrame yields
    its column labels rather than its rows. The text column therefore has to be
    pulled out as a Series before it reaches the vectorizer.
    Issue reference: https://stackoverflow.com/questions/50665240/valueerror-found-input-variables-with-inconsistent-numbers-of-samples-1-3185
    """
    return df.iloc[:, 0]

# Pipeline step that turns the one-column text frame into a Series for the vectorizer.
vectorizer_input_adapter = FunctionTransformer(func=adapt_vectorizer_input)

## Define final pipeline

In [23]:
# Steps run in order: normalize the text, extract it as a Series, count unigrams,
# then classify with a seeded random forest.
pipeline = Pipeline(
    [
        ("text_preprocessor", text_preprocessor),
        ("vectorizer_input_adapter", vectorizer_input_adapter),
        ("vectorizer", CountVectorizer(ngram_range=(1, 1))),
        ("estimator", RandomForestClassifier(random_state=RANDOM_SEED)),
    ]
)

## Fit and test estimator

In [24]:
# Train on the training split, then predict the held-out split; `fit` returns the
# fitted pipeline, so the two calls can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))

                               precision    recall  f1-score   support

            Discharge Summary       0.43      0.38      0.41        34
         ENT - Otolaryngology       1.00      0.59      0.74        27
             Gastroenterology       0.57      0.81      0.67        43
             General Medicine       0.46      0.61      0.52        69
                    Neurology       0.74      0.66      0.70        53
                 Neurosurgery       0.73      0.79      0.76        24
      Obstetrics / Gynecology       0.85      0.58      0.69        50
SOAP / Chart / Progress Notes       0.40      0.36      0.38        33
                      Urology       0.74      0.68      0.71        38

                     accuracy                           0.61       371
                    macro avg       0.66      0.61      0.62       371
                 weighted avg       0.64      0.61      0.62       371



## Wrap model with giskard

In [25]:
wrapped_model = Model(
    pipeline,
    model_type="classification",
    name="medical_transcript_classification",
    feature_names=[TEXT_COLUMN_NAME],
    classification_labels=pipeline.classes_,
)

# Sanity check: predictions made through the wrapper on the wrapped dataset are
# mapped back to class names and scored against the same test labels.
predicted_label_indices = wrapped_model.predict(wrapped_data).raw_prediction
print(classification_report(y_test, pipeline.classes_[predicted_label_indices]))

                               precision    recall  f1-score   support

            Discharge Summary       0.43      0.38      0.41        34
         ENT - Otolaryngology       1.00      0.59      0.74        27
             Gastroenterology       0.57      0.81      0.67        43
             General Medicine       0.46      0.61      0.52        69
                    Neurology       0.74      0.66      0.70        53
                 Neurosurgery       0.73      0.79      0.76        24
      Obstetrics / Gynecology       0.85      0.58      0.69        50
SOAP / Chart / Progress Notes       0.40      0.36      0.38        33
                      Urology       0.74      0.68      0.71        38

                     accuracy                           0.61       371
                    macro avg       0.66      0.61      0.62       371
                 weighted avg       0.64      0.61      0.62       371



## Perform model scan

In [26]:
# Run giskard's scan of the wrapped model against the wrapped test dataset and
# keep the result so it can be displayed in the next cell.
scanning_results = giskard.scan(wrapped_model, wrapped_data)

Your model is successfully validated.
Running scan…
Running detector ModelBiasDetector… 15 issues detected.
Running detector TextPerturbationDetector… 0 issues detected.
Scan completed: 15 issues found.


In [27]:
# Show the scan results in the notebook output.
display(scanning_results)