In [129]:
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import random

In [130]:
SEED = random.randint(0, 9999)

In [131]:
df

Unnamed: 0,Sentence,y
0,upper abdomen pain content worse after certain...,1
1,recurrent upper abdomen pain after eating no b...,1
2,upper abdomen fullness with bloating did have ...,1
3,recurrent episode of abdominal pain cause gall...,1
4,recurrent raised ggt obesity fatty infiltration,1
...,...,...
225,gallbladder polyp umm check for growth december,2
226,aml on the left check size please in 12 month,2
227,eye melanoma need abdomen check every 12 month...,2
228,aortic aneurism measured umm in 2022 please re...,2


In [1]:
# nltk.download('stopwords')

In [133]:
from nltk.corpus import stopwords
sentences = df['Sentence'].to_list()

tokens = [nltk.word_tokenize(word) for word in sentences]

stop_words = set(stopwords.words('english'))
filtered_tokens = []
for sentence in tokens:
    filtered_word = [word for word in sentence if word not in stop_words]
    filtered_tokens.append(filtered_word)

non_stop = [' '.join(words) for words in filtered_tokens]

In [134]:
X_list = non_stop
y = df['y'].to_list()

### BOW + SVC

In [135]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X_list)

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=SEED)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=SEED)

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto', 0.1, 1],
    'degree': [1, 2, 3, 4],
    'coef0': [0.0, 1.0],
    'shrinking': [True, False],
    'probability': [True, False],
}

grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_svc = SVC(**best_params)
best_svc.fit(X_train, y_train)

y_pred = best_svc.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print("Best Hyperparameters:", best_params)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Fitting 3 folds for each of 768 candidates, totalling 2304 fits
Best Hyperparameters: {'C': 0.1, 'coef0': 0.0, 'degree': 1, 'gamma': 'scale', 'kernel': 'linear', 'probability': True, 'shrinking': True}
Accuracy: 0.8529411764705882
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.73      0.85        15
           1       0.67      0.89      0.76         9
           2       0.91      1.00      0.95        10

    accuracy                           0.85        34
   macro avg       0.86      0.87      0.85        34
weighted avg       0.89      0.85      0.86        34



In [104]:
test_pred = best_svc.predict(X_test)
dataframe = pd.DataFrame({'Predicton' : test_pred, 'Label': y_test})
print(dataframe)
score = 0
for i in range(len(y_test)):
    if test_pred[i] == y_test[i]:
        score += 1
final_score = score / len(y_test)
print(final_score)

    Predicton  Label
0           1      1
1           2      2
2           2      1
3           1      1
4           0      0
5           2      2
6           2      2
7           2      2
8           2      2
9           0      0
10          1      1
11          2      2
12          1      1
13          1      1
14          0      0
15          1      1
16          2      2
17          2      2
18          2      2
19          1      0
20          2      2
21          1      0
22          2      2
23          1      2
24          1      1
25          1      1
26          1      0
27          0      0
28          2      2
29          1      1
30          0      0
31          2      2
32          1      1
33          0      0
34          1      0
0.8285714285714286


In [105]:
results = pd.DataFrame([{'Model': 'BOW + SVC', 'Accuracy': final_score*100}])
results = results.set_index('Model')

### BOW + RF

In [106]:
from sklearn.ensemble import RandomForestClassifier
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_classifier = RandomForestClassifier()

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_rf = RandomForestClassifier(**best_params)
best_rf.fit(X_train, y_train)

y_pred = best_rf.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print("Best Hyperparameters:", best_params)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Hyperparameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy: 0.8235294117647058
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.67      0.80        15
           1       0.62      0.89      0.73         9
           2       0.91      1.00      0.95        10

    accuracy                           0.82        34
   macro avg       0.84      0.85      0.83        34
weighted avg       0.87      0.82      0.83        34



In [107]:
test_pred = best_rf.predict(X_test)
dataframe = pd.DataFrame({'Predicton' : test_pred, 'Label': y_test})
print(dataframe)
score = 0
for i in range(len(y_test)):
    if test_pred[i] == y_test[i]:
        score += 1
final_score = score / len(y_test)
print(final_score)

results.loc['BOW + RF'] = [final_score*100]

    Predicton  Label
0           1      1
1           2      2
2           2      1
3           1      1
4           0      0
5           2      2
6           2      2
7           2      2
8           2      2
9           0      0
10          1      1
11          2      2
12          1      1
13          1      1
14          0      0
15          1      1
16          2      2
17          2      2
18          2      2
19          1      0
20          2      2
21          1      0
22          2      2
23          1      2
24          1      1
25          1      1
26          1      0
27          0      0
28          2      2
29          1      1
30          0      0
31          2      2
32          1      1
33          0      0
34          1      0
0.8285714285714286


### BOW + LogReg

In [108]:
from sklearn.linear_model import LogisticRegression

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300]
}

logreg_classifier = LogisticRegression()

grid_search = GridSearchCV(estimator=logreg_classifier, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_logreg = LogisticRegression(**best_params)
best_logreg.fit(X_train, y_train)

y_pred = best_logreg.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print("Best Hyperparameters:", best_params)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best Hyperparameters: {'C': 0.1, 'max_iter': 200, 'penalty': 'l2', 'solver': 'saga'}
Accuracy: 0.8529411764705882
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.73      0.85        15
           1       0.67      0.89      0.76         9
           2       0.91      1.00      0.95        10

    accuracy                           0.85        34
   macro avg       0.86      0.87      0.85        34
weighted avg       0.89      0.85      0.86        34





In [109]:
test_pred = best_logreg.predict(X_test)
dataframe = pd.DataFrame({'Predicton' : test_pred, 'Label': y_test})
print(dataframe)
score = 0
for i in range(len(y_test)):
    if test_pred[i] == y_test[i]:
        score += 1
final_score = score / len(y_test)
print(final_score)

results.loc['BOW + LogReg'] = [final_score*100]

    Predicton  Label
0           1      1
1           2      2
2           1      1
3           1      1
4           0      0
5           2      2
6           2      2
7           2      2
8           2      2
9           0      0
10          1      1
11          2      2
12          1      1
13          1      1
14          0      0
15          1      1
16          2      2
17          2      2
18          2      2
19          1      0
20          2      2
21          1      0
22          2      2
23          1      2
24          1      1
25          1      1
26          1      0
27          0      0
28          2      2
29          1      1
30          0      0
31          2      2
32          1      1
33          0      0
34          1      0
0.8571428571428571


### BOW + XGBoost 

In [110]:
# pip install xgboost

In [111]:
from xgboost import XGBClassifier

param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

xgb_classifier = XGBClassifier()

grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)


best_params = grid_search.best_params_


best_xgb = XGBClassifier(**best_params)
best_xgb.fit(X_train, y_train)


y_pred = best_xgb.predict(X_val)


accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print("Best Hyperparameters:", best_params)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Best Hyperparameters: {'C': 0.1, 'max_iter': 200, 'penalty': 'l2', 'solver': 'saga'}
Accuracy: 0.7647058823529411
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.73      0.81        15
           1       0.54      0.78      0.64         9
           2       0.89      0.80      0.84        10

    accuracy                           0.76        34
   macro avg       0.78      0.77      0.76        34
weighted avg       0.81      0.76      0.78        34



Parameters: { "C", "max_iter", "penalty", "solver" } are not used.



In [112]:
test_pred = best_xgb.predict(X_test)
dataframe = pd.DataFrame({'Predicton' : test_pred, 'Label': y_test})
print(dataframe)
score = 0
for i in range(len(y_test)):
    if test_pred[i] == y_test[i]:
        score += 1
final_score = score / len(y_test)
print(final_score)

results.loc['BOW + XGBoost'] = [final_score*100]

    Predicton  Label
0           1      1
1           1      2
2           1      1
3           1      1
4           1      0
5           2      2
6           2      2
7           2      2
8           2      2
9           0      0
10          2      1
11          2      2
12          1      1
13          1      1
14          0      0
15          1      1
16          1      2
17          2      2
18          2      2
19          1      0
20          2      2
21          1      0
22          2      2
23          1      2
24          1      1
25          1      1
26          1      0
27          0      0
28          2      2
29          1      1
30          0      0
31          2      2
32          1      1
33          0      0
34          0      0
0.7714285714285715


In [113]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_X = tfidf_vectorizer.fit_transform(X_list)

X_train, X_temp, y_train, y_temp = train_test_split(tfidf_X, y, test_size=0.3, random_state=SEED)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=SEED)

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
    'degree': [2, 3, 4],
    'coef0': [0.0, 1.0],
    'shrinking': [True, False],
    'probability': [True, False],
}

grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_svc = SVC(**best_params)
best_svc.fit(X_train, y_train)

y_pred = best_svc.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print("Best Hyperparameters:", best_params)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_report}")

Fitting 3 folds for each of 288 candidates, totalling 864 fits
Best Hyperparameters: {'C': 1, 'coef0': 0.0, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear', 'probability': True, 'shrinking': True}
Accuracy: 0.7941176470588235
Classification Report:
<function classification_report at 0x000001D41A736950>


In [114]:
test_pred = best_svc.predict(X_test)
dataframe = pd.DataFrame({'Predicton' : test_pred, 'Label': y_test})
print(dataframe)
score = 0
for i in range(len(y_test)):
    if test_pred[i] == y_test[i]:
        score += 1
final_score = score / len(y_test)
print(final_score)

results.loc['TF-IDF + SVC'] = [final_score*100]

    Predicton  Label
0           1      1
1           2      2
2           2      1
3           1      1
4           0      0
5           2      2
6           2      2
7           2      2
8           2      2
9           0      0
10          1      1
11          2      2
12          1      1
13          1      1
14          1      0
15          1      1
16          2      2
17          2      2
18          2      2
19          1      0
20          2      2
21          1      0
22          2      2
23          2      2
24          1      1
25          1      1
26          1      0
27          0      0
28          2      2
29          1      1
30          0      0
31          2      2
32          1      1
33          0      0
34          1      0
0.8285714285714286


### TF-IDF + RF

In [115]:
from sklearn.ensemble import RandomForestClassifier
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_classifier = RandomForestClassifier()

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_rf = RandomForestClassifier(**best_params)
best_rf.fit(X_train, y_train)

y_pred = best_rf.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print("Best Hyperparameters:", best_params)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Hyperparameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy: 0.7647058823529411
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.60      0.72        15
           1       0.54      0.78      0.64         9
           2       0.91      1.00      0.95        10

    accuracy                           0.76        34
   macro avg       0.78      0.79      0.77        34
weighted avg       0.81      0.76      0.77        34



In [116]:
test_pred = best_rf.predict(X_test)
dataframe = pd.DataFrame({'Predicton' : test_pred, 'Label': y_test})
print(dataframe)
score = 0
for i in range(len(y_test)):
    if test_pred[i] == y_test[i]:
        score += 1
final_score = score / len(y_test)
print(final_score)

results.loc['TF-IDF + RF'] = [final_score*100]

    Predicton  Label
0           1      1
1           2      2
2           2      1
3           1      1
4           0      0
5           2      2
6           2      2
7           2      2
8           2      2
9           0      0
10          1      1
11          2      2
12          1      1
13          1      1
14          0      0
15          1      1
16          2      2
17          2      2
18          2      2
19          1      0
20          2      2
21          1      0
22          2      2
23          1      2
24          1      1
25          1      1
26          1      0
27          0      0
28          2      2
29          1      1
30          0      0
31          2      2
32          1      1
33          0      0
34          1      0
0.8285714285714286


###  TF-IDF + LogReg

In [117]:
from sklearn.linear_model import LogisticRegression

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300]
}

logreg_classifier = LogisticRegression()

grid_search = GridSearchCV(estimator=logreg_classifier, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_logreg = LogisticRegression(**best_params)
best_logreg.fit(X_train, y_train)

y_pred = best_logreg.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print("Best Hyperparameters:", best_params)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best Hyperparameters: {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.8235294117647058
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.67      0.80        15
           1       0.62      0.89      0.73         9
           2       0.91      1.00      0.95        10

    accuracy                           0.82        34
   macro avg       0.84      0.85      0.83        34
weighted avg       0.87      0.82      0.83        34



In [118]:
test_pred = best_logreg.predict(X_test)
dataframe = pd.DataFrame({'Predicton' : test_pred, 'Label': y_test})
print(dataframe)
score = 0
for i in range(len(y_test)):
    if test_pred[i] == y_test[i]:
        score += 1
final_score = score / len(y_test)
print(final_score)

results.loc['TF-IDF + LogReg'] = [final_score*100]

    Predicton  Label
0           1      1
1           2      2
2           2      1
3           1      1
4           0      0
5           2      2
6           2      2
7           2      2
8           2      2
9           0      0
10          1      1
11          2      2
12          1      1
13          1      1
14          1      0
15          1      1
16          2      2
17          2      2
18          2      2
19          1      0
20          2      2
21          1      0
22          2      2
23          1      2
24          1      1
25          1      1
26          1      0
27          0      0
28          2      2
29          1      1
30          0      0
31          2      2
32          1      1
33          0      0
34          1      0
0.8


### TF-IDF + XGBoost

In [119]:
from xgboost import XGBClassifier

param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

xgb_classifier = XGBClassifier()

# grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)


# best_params = grid_search.best_params_


best_xgb = XGBClassifier(**best_params)
best_xgb.fit(X_train, y_train)


y_pred = best_xgb.predict(X_val)


accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print("Best Hyperparameters:", best_params)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Best Hyperparameters: {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.6470588235294118
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.60      0.67        15
           1       0.43      0.67      0.52         9
           2       0.88      0.70      0.78        10

    accuracy                           0.65        34
   macro avg       0.68      0.66      0.66        34
weighted avg       0.70      0.65      0.66        34



Parameters: { "C", "max_iter", "penalty", "solver" } are not used.



In [120]:
test_pred = best_xgb.predict(X_test)
dataframe = pd.DataFrame({'Predicton' : test_pred, 'Label': y_test})
print(dataframe)
score = 0
for i in range(len(y_test)):
    if test_pred[i] == y_test[i]:
        score += 1
final_score = score / len(y_test)
print(final_score)

results.loc['TF-IDF + XGBoost'] = [final_score*100]

    Predicton  Label
0           1      1
1           1      2
2           1      1
3           1      1
4           0      0
5           2      2
6           2      2
7           2      2
8           2      2
9           0      0
10          1      1
11          2      2
12          1      1
13          1      1
14          0      0
15          1      1
16          2      2
17          2      2
18          2      2
19          0      0
20          2      2
21          1      0
22          2      2
23          1      2
24          1      1
25          1      1
26          1      0
27          0      0
28          2      2
29          1      1
30          0      0
31          2      2
32          1      1
33          0      0
34          1      0
0.8571428571428571


### Bert Medical

In [121]:
import torch
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification
import torch.nn.functional as F
from transformers import AutoTokenizer,  AutoModel


X_train, X_temp, y_train, y_temp = train_test_split(X_list, y, test_size=0.3, random_state=SEED)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=SEED)

class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=40):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(self.texts[idx], truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        item = {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': self.labels[idx]
        }
        return item

class TextClassificationDataModule(pl.LightningDataModule):
    def __init__(self, train_texts, train_labels, val_texts, val_labels, test_texts, test_labels, tokenizer, batch_size=16):
        super().__init__()
        self.train_texts = train_texts
        self.train_labels = train_labels
        self.val_texts = val_texts
        self.val_labels = val_labels
        self.test_texts = test_texts
        self.test_labels = test_labels
        self.tokenizer = tokenizer
        self.batch_size = batch_size

    def setup(self, stage=None):
        self.train_dataset = TextClassificationDataset(self.train_texts, self.train_labels, self.tokenizer)
        self.val_dataset = TextClassificationDataset(self.val_texts, self.val_labels, self.tokenizer)
        self.test_dataset = TextClassificationDataset(self.test_texts, self.test_labels, self.tokenizer)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size)
    

class TextClassifier(pl.LightningModule):
    def __init__(self, model_name, num_classes, weight_decay=1e-5, dropout_prob=0.3):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
        self.weight_decay = weight_decay
        self.dropout = nn.Dropout(p=dropout_prob)
        
    def forward(self, input_ids, attention_mask):
        embeddings = self.model.get_input_embeddings()(input_ids)
        embeddings = self.dropout(embeddings)
        return self.model(input_ids, attention_mask=attention_mask).logits
    
    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        logits = self(input_ids, attention_mask)
        loss = self.loss(logits, labels)
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        logits = self(input_ids, attention_mask)
        loss = self.loss(logits, labels)
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        logits = self(input_ids, attention_mask)
        loss = self.loss(logits, labels)
        self.log('test_loss', loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=1e-5)

    def loss(self, logits, labels):
        return F.cross_entropy(logits, labels)

In [122]:
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
train_texts, train_labels = X_train, y_train  # Load your training data
val_texts, val_labels = X_val, y_val  # Load your validation data
test_texts, test_labels = X_test, y_test  # Load your test data

# Create DataModule
data_module = TextClassificationDataModule(train_texts, train_labels, val_texts, val_labels, test_texts, test_labels, tokenizer)

# Create the model
model = TextClassifier("medicalai/ClinicalBERT", num_classes=3, weight_decay=1e-5, dropout_prob=0.3)

# Create the trainer
trainer = pl.Trainer(max_epochs=13, accelerator='cuda')

# Train the model
trainer.fit(model, data_module)

# Test the model
trainer.test(model, datamodule=data_module)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at medicalai/ClinicalBERT and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                                | Params
----------------------------------------------------------------
0 | model   | DistilBertForSequenceClassification | 135 M 
1 | dropout | Dropout                             | 0     
----------------------------------------------------------------
135 M     Trainable params
0         Non-trainable params
135 M     Total params
541.308   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=13` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.5570173859596252}]

In [123]:
from sklearn.metrics import f1_score

model.eval()
all_predictions = []
all_labels = []

for batch in data_module.test_dataloader():
    input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
    logits = model(input_ids, attention_mask)
    predicted_labels = torch.argmax(logits, dim=1)
    all_predictions.extend(predicted_labels.tolist())
    all_labels.extend(labels.tolist())

f1 = f1_score(all_labels, all_predictions, average='weighted')
print(f'Weighted F1 Score: {f1}')

wrong_idx = []
for i in range(len(all_predictions)):
    if all_predictions[i] != all_labels[i]:
        wrong_idx.append(i)

Weighted F1 Score: 0.6426090038993264


In [124]:
accuracy = ((len(all_labels) - len(wrong_idx)) / (len(all_labels))) * 100
results.loc['Bert Medical'] = [accuracy]

### Bert base

In [125]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_texts, train_labels = X_train, y_train  # Load your training data
val_texts, val_labels = X_val, y_val  # Load your validation data
test_texts, test_labels = X_test, y_test  # Load your test data

# Create DataModule
data_module = TextClassificationDataModule(train_texts, train_labels, val_texts, val_labels, test_texts, test_labels, tokenizer)

# Create the model
model = TextClassifier("bert-base-uncased", num_classes=3, weight_decay=1e-5, dropout_prob=0.3)

# Create the trainer
trainer = pl.Trainer(max_epochs=13, accelerator='cuda')

# Train the model
trainer.fit(model, data_module)

# Test the model
trainer.test(model, datamodule=data_module)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type                          | Params
----------------------------------------------------------
0 | model   | BertForSequenceClassification | 109 M 
1 | dropout | Dropout                       | 0     
----------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.938   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=13` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.5785688161849976}]

In [126]:
model.eval()
all_predictions = []
all_labels = []

for batch in data_module.test_dataloader():
    input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
    logits = model(input_ids, attention_mask)
    predicted_labels = torch.argmax(logits, dim=1)
    all_predictions.extend(predicted_labels.tolist())
    all_labels.extend(labels.tolist())

f1 = f1_score(all_labels, all_predictions, average='weighted')
print(f'Weighted F1 Score: {f1}')

wrong_idx = []
for i in range(len(all_predictions)):
    if all_predictions[i] != all_labels[i]:
        wrong_idx.append(i)

Weighted F1 Score: 0.8179894179894179


In [127]:
accuracy = ((len(all_labels) - len(wrong_idx)) / (len(all_labels))) * 100
results.loc['Bert Base'] = [accuracy]

In [128]:
results

Unnamed: 0_level_0,Accuracy
Model,Unnamed: 1_level_1
BOW + SVC,82.857143
BOW + RF,82.857143
BOW + LogReg,85.714286
BOW + XGBoost,77.142857
TF-IDF + SVC,82.857143
TF-IDF + RF,82.857143
TF-IDF + LogReg,80.0
TF-IDF + XGBoost,85.714286
Bert Medical,68.571429
Bert Base,82.857143
