# Imports

In [None]:
!pip install pyspark
!pip install sparknlp



In [None]:
!git clone https://github.com/sebischair/Medical-Abstracts-TC-Corpus.git

fatal: destination path 'Medical-Abstracts-TC-Corpus' already exists and is not an empty directory.


# Load Data

In [None]:
import sparknlp
spark = sparknlp.start(gpu=True)

In [None]:
# # path to data folder
train_path = "Medical-Abstracts-TC-Corpus/medical_tc_train.csv"
test_path = "Medical-Abstracts-TC-Corpus/medical_tc_test.csv"

# data load
df_train = spark.read.csv( train_path , header=True, inferSchema=True)
df_test = spark.read.csv( test_path , header=True, inferSchema=True)

In [None]:
df_train

DataFrame[condition_label: int, medical_abstract: string]

In [None]:
df_train.show()

+---------------+--------------------+
|condition_label|    medical_abstract|
+---------------+--------------------+
|              5|Tissue changes ar...|
|              1|Neuropeptide Y an...|
|              2|Sexually transmit...|
|              1|Lipolytic factors...|
|              3|Does carotid rest...|
|              3|The shoulder in m...|
|              2|The management of...|
|              4|Pharmacomechanica...|
|              5|Color Doppler dia...|
|              5|Noninvasive diagn...|
|              4|Sodium sensitive ...|
|              1|Imaging bone tumo...|
|              5|Closure of a bron...|
|              1|Utility of frozen...|
|              4|Antihypertensive ...|
|              2|Gallbladder perfo...|
|              1|Left ventricular ...|
|              4|Tongue ischemia f...|
|              5|In vitro and in v...|
|              1|Chondrosarcoma of...|
+---------------+--------------------+
only showing top 20 rows



In [None]:
df_test.show()

+---------------+--------------------+
|condition_label|    medical_abstract|
+---------------+--------------------+
|              3|Obstructive sleep...|
|              5|Neutrophil functi...|
|              5|A phase II study ...|
|              1|Flow cytometric D...|
|              4|Paraneoplastic va...|
|              1|Treatment of chil...|
|              1|Expression of maj...|
|              1|Questionable role...|
|              5|Reversibility of ...|
|              2|Current status of...|
|              5|The importance of...|
|              1|Human papillomavi...|
|              5|Gentamicin iontop...|
|              1|Repeat hepatic re...|
|              5|Evidence for intr...|
|              5|Glutamic acid and...|
|              5|A useful techniqu...|
|              5|The natural histo...|
|              3|Hereditary intern...|
|              5|Immune response o...|
+---------------+--------------------+
only showing top 20 rows



# Remove stopwords

In [None]:

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your",
            "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her",
            "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs",
            "themselves", "what", "which", "who", "whom", "this", "that", "these", "those",
            "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
            "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
            "or","because", "as", "until", "while", "of", "at", "by", "for", "with", "about",
            "against", "between", "into", "through", "during", "before", "after", "above",
            "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
            "again", "further", "then", "once", "here", "there", "when", "where", "why",
            "how", "all", "any", "both", "each", "few", "more", "most", "other", "some",
            "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
            "s", "t", "can", "will", "just", "don", "should", "now"]

def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

remove_stopwords_udf = udf(remove_stopwords, StringType())

df_train = df_train.withColumn("medical_abstract", remove_stopwords_udf("medical_abstract"))

In [None]:
df_train.show()

+---------------+--------------------+
|condition_label|    medical_abstract|
+---------------+--------------------+
|              5|Tissue changes ar...|
|              1|Neuropeptide Y ne...|
|              2|Sexually transmit...|
|              1|Lipolytic factors...|
|              3|carotid restenosi...|
|              3|shoulder multiple...|
|              2|management postop...|
|              4|Pharmacomechanica...|
|              5|Color Doppler dia...|
|              5|Noninvasive diagn...|
|              4|Sodium sensitive ...|
|              1|Imaging bone tumo...|
|              5|Closure bronchopl...|
|              1|Utility frozen-se...|
|              4|Antihypertensive ...|
|              2|Gallbladder perfo...|
|              1|Left ventricular ...|
|              4|Tongue ischemia s...|
|              5|vitro vivo effect...|
|              1|Chondrosarcoma ja...|
+---------------+--------------------+
only showing top 20 rows



# Word Embeddings

In [None]:
# Import the required modules and classes
from sparknlp.base import DocumentAssembler, Pipeline
from sparknlp.annotator import Tokenizer,WordEmbeddingsModel


In [None]:
# Step 1: Transforms raw texts to `document` annotation
documentAssembler = DocumentAssembler() \
      .setInputCol('medical_abstract') \
      .setOutputCol('document')

# Step 2: Tokenization
tokenizer = Tokenizer() \
      .setInputCols(['document']) \
      .setOutputCol('token')

# Step 3: Generate the Embeddings
embeddings = WordEmbeddingsModel\
      .pretrained('glove_100d', 'en')\
      .setInputCols(["token", "document"])\
      .setOutputCol("embeddings")

# Define the pipeline
pipeline = Pipeline() \
    .setStages([
        documentAssembler,
        tokenizer,
        embeddings])


# Fit the dataframe to the pipeline and then transform to produce the embeddings
model = pipeline.fit(df_train)

result = model.transform(df_train)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
result

DataFrame[condition_label: int, medical_abstract: string, document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, token: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, embeddings: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>]

In [None]:
result.select("embeddings").embeddings

Column<'embeddings'>

In [None]:
# Import the required modules and classes
from sparknlp.base import DocumentAssembler, Pipeline, LightPipeline, EmbeddingsFinisher
from sparknlp.annotator import Tokenizer,WordEmbeddingsModel,SentenceEmbeddings
import pyspark.sql.functions as F


def my_word_embeddings(df, method):
  document_assembler = DocumentAssembler() \
        .setInputCol("medical_abstract") \
        .setOutputCol("document")

  tokenizer = Tokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")
  glove_embeddings = WordEmbeddingsModel().pretrained() \
        .setInputCols(["document",'token'])\
        .setOutputCol("embeddings")\
        .setCaseSensitive(True)
  sentence_embeddings = SentenceEmbeddings() \
        .setInputCols(["document", "embeddings"]) \
        .setOutputCol("sentence_embeddings") \
        .setPoolingStrategy(method)

  embeddings_finisher = EmbeddingsFinisher() \
        .setInputCols(["sentence_embeddings"]) \
        .setOutputCols(["finished_sentence_embeddings"])
  pipeline = Pipeline(
      stages=[document_assembler,
              tokenizer,
              glove_embeddings,
              sentence_embeddings,
              embeddings_finisher])

  model = pipeline.fit(df)
  result = model.transform(df)
  return result

In [None]:
WE_avg_train = my_word_embeddings(df_train, "AVERAGE")
WE_avg_test = my_word_embeddings(df_test, "AVERAGE")

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
WE_sum_train = my_word_embeddings(df_train, "SUM")
WE_sum_test = my_word_embeddings(df_test, "SUM")

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, ArrayType, DoubleType

import pyspark.sql.functions as F
import pyspark.sql.types as T

# WE_avg_train = WE_avg_train.withColumn("condition_label",F.array(F.col("condition_label").cast(F.StringType())))
# WE_avg_test = WE_avg_test.withColumn("condition_label",F.array(F.col("condition_label").cast(F.StringType())))

# WE_sum_train = WE_sum_train.withColumn("condition_label",F.array(F.col("condition_label").cast(F.StringType())))
# WE_sum_test = WE_sum_test.withColumn("condition_label",F.array(F.col("condition_label").cast(F.StringType())))

In [None]:
WE_avg_train = WE_avg_train.withColumn("condition_label",F.col("condition_label").cast("double"))
WE_avg_test = WE_avg_test.withColumn("condition_label",F.col("condition_label").cast("double"))

WE_sum_train = WE_sum_train.withColumn("condition_label",F.col("condition_label").cast("double"))
WE_sum_test = WE_sum_test.withColumn("condition_label",F.col("condition_label").cast("double"))

In [None]:
WE_avg_train

DataFrame[condition_label: double, medical_abstract: string, document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, token: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, embeddings: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, sentence_embeddings: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, finished_sentence_embeddings: array<array<float>>]

In [None]:
WE_sum_train.selectExpr("explode(finished_sentence_embeddings) as finished_sentence_embeddings").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Bert

In [None]:
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import SentenceDetector, BertSentenceEmbeddings
from pyspark.ml import Pipeline
import pyspark.sql.functions as F
documentAssembler = DocumentAssembler() \
    .setInputCol("medical_abstract") \
    .setOutputCol("document")
sentence = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")
embeddings = BertSentenceEmbeddings.pretrained("sent_small_bert_L2_128") \
    .setInputCols(["sentence"]) \
    .setOutputCol("sentence_embeddings")\
    .setCaseSensitive(True) \
    .setMaxSentenceLength(512) \

pipeline = Pipeline(stages=[documentAssembler,
                            sentence,
                            embeddings])

sent_small_bert_L2_128 download started this may take some time.
Approximate size to download 16.1 MB
[OK!]


In [None]:
model = pipeline.fit(df_train)
BERT_train = model.transform(df_train)
BERT_test = model.transform(df_test)

In [None]:
# BERT_train = BERT_train.withColumn("condition_label",F.array(F.col("condition_label").cast(F.StringType())))
# BERT_test = BERT_test.withColumn("condition_label",F.array(F.col("condition_label").cast(F.StringType())))

In [None]:
BERT_test

DataFrame[condition_label: int, medical_abstract: string, document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, sentence: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, sentence_embeddings: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>]

In [None]:
result_df = BERT_test.select(F.explode(F.arrays_zip
                        (BERT_test.sentence.result,
                         BERT_test.sentence_embeddings.embeddings)).alias("cols")) \
                  .select(F.expr("cols['0']").alias("sentence"),
                          F.expr("cols['1']").alias("Bert_sentence_embeddings"))
result_df.show(truncate=150)

+------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                                              sentence|                                                                                                                              Bert_sentence_embeddings|
+------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                   Obstructive sleep ap

# Models


## Neural Network

In [None]:
from sklearn.metrics import classification_report

def evaluate(model, dataset,name, ev_name):
  dataset = model.transform(dataset)
  dataset = dataset.select('condition_label',"class.result").toPandas()
  dataset['result'] = dataset['result'].apply(lambda x : x[0])
  dataset['condition_label'] = dataset['condition_label'].astype(str)
  print(name)
  print(ev_name)
  print(classification_report(dataset['condition_label'], dataset['result']))


In [None]:
from sparknlp.annotator import ClassifierDLApproach


classsifierdl = ClassifierDLApproach()\
                  .setInputCols(["sentence_embeddings"])\
                  .setOutputCol("class")\
                  .setLabelColumn("condition_label")\
                  .setBatchSize(64)\
                  .setMaxEpochs(20)\
                  .setLr(0.005)\
                  .setDropout(0.5)

                  # .setLr(0.01) \
                  # .setBatchSize(64)\
                  # .setMaxEpochs(50)\





In [None]:
NN_WE_avg_model = classsifierdl.fit(WE_avg_train)

In [None]:
evaluate(NN_WE_avg_model, WE_avg_train,"Neural Nework Word Embeddings(AVG)", "Training Set")

Neural Nework Word Embeddings(AVG)
Training Set


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         1.0       0.67      0.76      0.71      2530
         2.0       0.00      0.00      0.00      1195
         3.0       0.00      0.00      0.00      1540
         4.0       0.59      0.81      0.68      2441
         5.0       0.46      0.63      0.53      3844

    accuracy                           0.55     11550
   macro avg       0.34      0.44      0.38     11550
weighted avg       0.42      0.55      0.48     11550



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
evaluate(NN_WE_avg_model, WE_avg_test,"Neural Nework Word Embeddings(AVG)", "Test Set")

Neural Nework Word Embeddings(AVG)
Test Set
              precision    recall  f1-score   support

         1.0       0.77      0.16      0.27       633
         2.0       0.00      0.00      0.00       299
         3.0       0.00      0.00      0.00       385
         4.0       0.43      0.88      0.58       610
         5.0       0.36      0.57      0.44       961

    accuracy                           0.41      2888
   macro avg       0.31      0.32      0.26      2888
weighted avg       0.38      0.41      0.33      2888



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
NN_WE_sum_model = classsifierdl.fit(WE_sum_train)

In [None]:
evaluate(NN_WE_sum_model, WE_sum_train,"Neural Nework Word Embeddings(SUM)", "Training Set")

Neural Nework Word Embeddings(SUM)
Training Set


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      2530
         2.0       0.00      0.00      0.00      1195
         3.0       0.00      0.00      0.00      1540
         4.0       0.00      0.00      0.00      2441
         5.0       0.33      1.00      0.50      3844

    accuracy                           0.33     11550
   macro avg       0.07      0.20      0.10     11550
weighted avg       0.11      0.33      0.17     11550



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
evaluate(NN_WE_sum_model, WE_sum_test,"Neural Nework Word Embeddings(SUM)", "Test Set")

Neural Nework Word Embeddings(SUM)
Test Set
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00       633
         2.0       0.00      0.00      0.00       299
         3.0       0.00      0.00      0.00       385
         4.0       0.00      0.00      0.00       610
         5.0       0.33      1.00      0.50       961

    accuracy                           0.33      2888
   macro avg       0.07      0.20      0.10      2888
weighted avg       0.11      0.33      0.17      2888



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
NN_BERT_model = classsifierdl.fit(BERT_train)

In [None]:
evaluate(NN_BERT_model, BERT_train,"Neural Nework BERT", "Training Set")

Neural Nework BERT
Training Set


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           1       0.63      0.72      0.67      2530
           2       0.00      0.00      0.00      1195
           3       0.00      0.00      0.00      1540
           4       0.66      0.62      0.64      2441
           5       0.46      0.76      0.57      3844

    accuracy                           0.54     11550
   macro avg       0.35      0.42      0.38     11550
weighted avg       0.43      0.54      0.47     11550



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
evaluate(NN_BERT_model, BERT_test,"Neural Nework BERT", "Test Set")

Neural Nework BERT
Test Set
              precision    recall  f1-score   support

           1       0.53      0.71      0.61       633
           2       0.00      0.00      0.00       299
           3       0.00      0.00      0.00       385
           4       0.51      0.56      0.53       610
           5       0.39      0.55      0.45       961

    accuracy                           0.46      2888
   macro avg       0.29      0.36      0.32      2888
weighted avg       0.35      0.46      0.40      2888



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM

In [None]:
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType

# Create a LinearSVC classifier
svm = LinearSVC(
    maxIter=50,
    regParam=0.1,
    labelCol="label",
    featuresCol="features"
)




In [None]:
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="condition_label", outputCol="label")

In [None]:
def df_prepare(df):
  # Flatten the nested array
  df_flattened = df.withColumn(
      "flattened_sentence_embeddings",
      F.flatten(F.col("finished_sentence_embeddings"))
  )

  df_flattened = stringIndexer.fit(df_flattened).transform(df_flattened)

  # Convert the flattened array into a vector
  to_vector_udf = F.udf(lambda a: Vectors.dense(a), VectorUDT())
  df_flattened = df_flattened.withColumn(
      "features",
      to_vector_udf(F.col("flattened_sentence_embeddings"))
  )
  return df_flattened



# Create a One-vs-Rest classifier
ovr = OneVsRest(
    classifier=svm,
)

# # Fit the One-vs-Rest classifier on the flattened DataFrame
# model = ovr.fit(df_prepare(we_avg_train))

# # Make predictions on the dataset
# predictions = model.transform(we_avg_train_flattened)

# # Evaluate the model using a multi-class classification evaluator
# evaluator = MulticlassClassificationEvaluator(
#     labelCol="condition_label",
#     metricName="accuracy"
# )
# accuracy = evaluator.evaluate(predictions)

# # Print the accuracy
# print("Accuracy:", accuracy)

In [None]:
def svm_evaluate(model, dataset,name, ev_name):
  dataset = model.transform(df_prepare(dataset))
  dataset = dataset.select('label',"Prediction").toPandas()
  # dataset['label'] = dataset['label'].astype(str)
  # dataset['Prediction'] = dataset['Prediction'].astype(str)
  print(name)
  print(ev_name)
  print(classification_report(dataset['label'], dataset['Prediction']))

In [None]:
SVM_WE_avg_model = ovr.fit(df_prepare(WE_avg_train))

In [None]:
svm_evaluate(SVM_WE_avg_model, WE_avg_train,"SVM Word Embeddings(AVG)", "Training Set")

SVM Word Embeddings(AVG)
Training Set
              precision    recall  f1-score   support

         0.0       0.55      0.25      0.35      3844
         1.0       0.54      0.88      0.67      2530
         2.0       0.51      0.86      0.64      2441
         3.0       0.52      0.32      0.39      1540
         4.0       0.50      0.26      0.34      1195

    accuracy                           0.53     11550
   macro avg       0.52      0.51      0.48     11550
weighted avg       0.53      0.53      0.49     11550



In [None]:
svm_evaluate(SVM_WE_avg_model, WE_avg_test,"SVM Word Embeddings(AVG)", "Test Set")

SVM Word Embeddings(AVG)
Test Set
              precision    recall  f1-score   support

         0.0       0.50      0.11      0.17       961
         1.0       0.71      0.60      0.65       633
         2.0       0.35      0.95      0.52       610
         3.0       0.55      0.14      0.22       385
         4.0       0.25      0.34      0.29       299

    accuracy                           0.42      2888
   macro avg       0.47      0.43      0.37      2888
weighted avg       0.49      0.42      0.37      2888



In [None]:
SVM_WE_sum_model = ovr.fit(df_prepare(WE_sum_train))

In [None]:
svm_evaluate(SVM_WE_sum_model, WE_sum_train,"SVM Word Embeddings(SUM)", "Training Set")


SVM Word Embeddings(SUM)
Training Set
              precision    recall  f1-score   support

         0.0       0.54      0.27      0.36      3844
         1.0       0.53      0.88      0.67      2530
         2.0       0.52      0.85      0.65      2441
         3.0       0.51      0.34      0.41      1540
         4.0       0.50      0.19      0.28      1195

    accuracy                           0.53     11550
   macro avg       0.52      0.51      0.47     11550
weighted avg       0.53      0.53      0.49     11550



In [None]:
svm_evaluate(SVM_WE_sum_model, WE_sum_test,"SVM Word Embeddings(SUM)", "Test Set")


SVM Word Embeddings(SUM)
Test Set
              precision    recall  f1-score   support

         0.0       0.44      0.23      0.31       961
         1.0       0.71      0.50      0.59       633
         2.0       0.34      0.94      0.50       610
         3.0       0.67      0.11      0.19       385
         4.0       0.37      0.19      0.25       299

    accuracy                           0.42      2888
   macro avg       0.51      0.40      0.37      2888
weighted avg       0.50      0.42      0.39      2888



In [None]:
BERT_train_SVM = BERT_train.withColumn(
      "flattened_sentence_embeddings",
      F.flatten(F.col("sentence_embeddings.embeddings"))
  )

BERT_train_SVM = BERT_train_SVM.select(
    "condition_label",
    "medical_abstract",
    "document",
    "sentence",
    "sentence_embeddings",
    "flattened_sentence_embeddings",
    slice("flattened_sentence_embeddings", 1, 100).alias("features")
)

BERT_train_SVM = stringIndexer.fit(BERT_train_SVM).transform(BERT_train_SVM)

  # Convert the flattened array into a vector
to_vector_udf = F.udf(lambda a: Vectors.dense(a), VectorUDT())
BERT_train_SVM = BERT_train_SVM.withColumn(
      "features",
      to_vector_udf(F.col("features"))
)

In [None]:
# BERT_train_SVM = BERT_train_SVM.withColumn("label",
#       F.col("condition_label"))


In [None]:
SVM_BERT_model = ovr.fit(BERT_train_SVM)

In [None]:
from sklearn.metrics import classification_report
def svm_evaluate(model, dataset,name, ev_name):
  dataset = model.transform(dataset)
  dataset = dataset.select('label',"Prediction").toPandas()
  # dataset['label'] = dataset['label'].astype(str)
  # dataset['Prediction'] = dataset['Prediction'].astype(str)
  print(name)
  print(ev_name)
  print(classification_report(dataset['label'], dataset['Prediction']))

In [None]:
svm_evaluate(SVM_BERT_model, BERT_train_SVM,"SVM BERT", "Training Set")

SVM BERT
Training Set
              precision    recall  f1-score   support

         0.0       0.45      0.32      0.37      3844
         1.0       0.44      0.75      0.55      2530
         2.0       0.46      0.61      0.52      2441
         3.0       0.32      0.16      0.21      1540
         4.0       0.33      0.14      0.20      1195

    accuracy                           0.43     11550
   macro avg       0.40      0.39      0.37     11550
weighted avg       0.42      0.43      0.40     11550



In [None]:
BERT_test_SVM = BERT_test.withColumn(
      "flattened_sentence_embeddings",
      F.flatten(F.col("sentence_embeddings.embeddings"))
  )

BERT_test_SVM = BERT_test_SVM.select(
    "condition_label",
    "medical_abstract",
    "document",
    "sentence",
    "sentence_embeddings",
    "flattened_sentence_embeddings",
    slice("flattened_sentence_embeddings", 1, 100).alias("features")
)

BERT_test_SVM = stringIndexer.fit(BERT_test_SVM).transform(BERT_test_SVM)

  # Convert the flattened array into a vector
to_vector_udf = F.udf(lambda a: Vectors.dense(a), VectorUDT())
BERT_test_SVM = BERT_test_SVM.withColumn(
      "features",
      to_vector_udf(F.col("features"))
)

In [None]:
svm_evaluate(SVM_BERT_model, BERT_test_SVM,"SVM BERT", "Test Set")

SVM BERT
Test Set
              precision    recall  f1-score   support

         0.0       0.39      0.22      0.28       961
         1.0       0.39      0.79      0.52       633
         2.0       0.41      0.58      0.48       610
         3.0       0.31      0.13      0.18       385
         4.0       0.27      0.05      0.09       299

    accuracy                           0.39      2888
   macro avg       0.35      0.35      0.31      2888
weighted avg       0.37      0.39      0.34      2888

