In [15]:
# Mount Google Drive when running inside Colab. Outside Colab the import
# fails and the mount is skipped; any error raised by the mount itself is
# allowed to surface because later cells read their data from Drive.
try:
    from google.colab import drive
except ImportError:
    # Not a Colab runtime, so there is no Drive to mount.
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Training Pipeline for Question Classification using Pre-trained Model

## 1. Libraries and Dependencies


In [16]:
%pip install -q pandas numpy scikit-learn transformers datasets evaluate torch nltk

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, concatenate_datasets, Dataset
import evaluate
import torch

## 2. Data Gathering and Preparation

### 2.1 Load Dataset

#### 2.1.1 Generated Dataset

In [18]:
# Folder on the mounted Drive that holds the prompt-classification CSV files
base_path = "/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/data/prompt_classification"

# Read the generated questions into pandas, then wrap them as a Hugging Face Dataset
generated_csv_path = f'{base_path}/generated_all_dataset.csv'
generated_df = pd.read_csv(generated_csv_path)
generated_dataset = Dataset.from_pandas(generated_df)

# Show the dataset summary (features and row count)
generated_dataset

Dataset({
    features: ['Question', 'qtype'],
    num_rows: 8910
})

#### 2.1.2 MedQuad Dataset

In [19]:
# Fetch MedQuad from the Hugging Face Hub and drop the 'Answer' column in the
# same step, since only the question text and its type are used for classification
medquad_dataset = load_dataset('keivalya/MedQuad-MedicalQnADataset').remove_columns(['Answer'])

# Show the resulting DatasetDict summary
medquad_dataset

DatasetDict({
    train: Dataset({
        features: ['qtype', 'Question'],
        num_rows: 16407
    })
})

### 2.2 Combine Datasets

In [20]:
from datasets import concatenate_datasets

# Stack the generated questions on top of the MedQuad training split
dataset_parts = [generated_dataset, medquad_dataset['train']]
combined_dataset = concatenate_datasets(dataset_parts)
combined_dataset

Dataset({
    features: ['Question', 'qtype'],
    num_rows: 25317
})

### 2.3 Exploration

In [21]:
# List the distinct question-type labels present in the combined dataset
combined_dataset.unique('qtype')

['dont answer',
 'precaution',
 'general',
 'symptoms',
 'desc',
 'susceptibility',
 'exams and tests',
 'treatment',
 'prevention',
 'information',
 'frequency',
 'complications',
 'causes',
 'research',
 'outlook',
 'considerations',
 'inheritance',
 'stages',
 'genetic changes',
 'support groups']

### 2.4 Split Dataset

In [26]:
# Convert the combined dataset to a pandas DataFrame so scikit-learn can split it
df = combined_dataset.to_pandas()

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert back to Dataset format. The split frames keep their shuffled pandas
# index, which from_pandas would otherwise store as an extra
# `__index_level_0__` column; preserve_index=False leaves only the real columns.
train_data = Dataset.from_pandas(train_df, preserve_index=False)
eval_data = Dataset.from_pandas(eval_df, preserve_index=False)
all_data = Dataset.from_pandas(df, preserve_index=False)

## 3. Model Preparation

### 3.1 Label Encoding and Tokenization

In [27]:
# Fit the label encoder on every question type in the combined frame
label_encoder = LabelEncoder()
label_encoder.fit(df['qtype'])

# Lookups between label names and integer ids, in the encoder's class order
id2label = dict(enumerate(label_encoder.classes_))
label2id = {label: idx for idx, label in id2label.items()}

# Tokenizer that matches the pretrained checkpoint being fine-tuned
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize a batch of questions, padding/truncating each one to 512 tokens."""
    questions = examples["Question"]
    return tokenizer(questions, padding="max_length", truncation=True, max_length=512)


def _encode_labels(examples):
    """Return a `labels` column holding the integer id of each `qtype` in the batch."""
    return {'labels': label_encoder.transform(examples['qtype'])}


def _prepare_split(split):
    """Tokenize a split, attach encoded labels, and expose the model inputs as torch tensors."""
    prepared = split.map(tokenize_function, batched=True)
    prepared = prepared.map(_encode_labels, batched=True)
    # set_format works in place: only these columns are returned, as torch tensors
    prepared.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    return prepared


# Same preparation for the training split, the validation split, and the full dataset
tokenized_train = _prepare_split(train_data)
tokenized_eval = _prepare_split(eval_data)
tokenized_all = _prepare_split(all_data)

Map:   0%|          | 0/20253 [00:00<?, ? examples/s]

Map:   0%|          | 0/5064 [00:00<?, ? examples/s]

Map:   0%|          | 0/25317 [00:00<?, ? examples/s]

Map:   0%|          | 0/20253 [00:00<?, ? examples/s]

Map:   0%|          | 0/5064 [00:00<?, ? examples/s]

Map:   0%|          | 0/25317 [00:00<?, ? examples/s]

In [None]:
# Save the fitted label encoder so inference code can map ids back to label names
import os
import joblib

# `base_path_model` is only assigned in a later cell, so referencing it here
# raises NameError on a fresh kernel. Use the classification models directory
# explicitly; the testing cells load `label_encoder.joblib` from this location.
label_encoder_dir = "/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification"
os.makedirs(label_encoder_dir, exist_ok=True)

joblib.dump(label_encoder, f'{label_encoder_dir}/label_encoder.joblib')

['/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification/label_encoder.joblib']

## 4. Model Initialization


In [None]:
# Build a sequence classifier on top of the pretrained checkpoint, with one
# output per question type and the label maps stored in the model config
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=num_labels,
)

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 5. Model Training

### 5.1 Training Arguments

In [None]:
# Fine-tuning configuration, collected in one mapping and then handed to TrainingArguments
training_config = {
    # Where checkpoints and logs are written
    "output_dir": "./results",
    "logging_dir": "./logs",
    "logging_steps": 10,
    # Optimisation schedule
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,  # 64, 16
    "warmup_steps": 500,
    "weight_decay": 0.01,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with the highest F1
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "f1",
    "greater_is_better": True,
}
training_args = TrainingArguments(**training_config)





### 5.2 Define Compute Metrics Function

In [None]:
# F1 scorer from the `evaluate` library, reused on every evaluation pass
metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Return the macro-averaged F1 for a (logits, labels) evaluation pair.

    The predicted class of each example is the index of its largest logit.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    scores = metric.compute(predictions=predicted_ids, references=labels, average='macro')
    return {'f1': scores['f1']}

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

### 5.3 Initialize Trainer

In [None]:
# Wire the model, its training configuration, both data splits, and the F1
# metric callback into a single Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

In [None]:
# Placeholder cell (no code)

### 5.4 Start Training

In [None]:
# Run fine-tuning with the Trainer configured above; the returned TrainOutput is displayed as the cell result
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,F1
1,0.0215,0.017207,0.984771
2,0.0014,0.014861,0.98702
3,0.0006,0.007781,0.996138


TrainOutput(global_step=2691, training_loss=0.14020369367343738, metrics={'train_runtime': 4075.6351, 'train_samples_per_second': 10.553, 'train_steps_per_second': 0.66, 'total_flos': 1.1318498542669824e+16, 'train_loss': 0.14020369367343738, 'epoch': 3.0})

## 6. Model Evaluation

### 6.1 Evaluate on Validation Set

In [None]:
# Score the fine-tuned model on the validation split passed to the Trainer
eval_results = trainer.evaluate()

# Report the F1 value to five decimal places
validation_f1 = eval_results['eval_f1']
print(f"Validation F1 Score: {validation_f1:.5f}")

Validation F1 Score: 0.99614


### 6.2 Detailed Classification Report

In [None]:
# Show the label names in encoder order; a label's position in this array is its integer id
label_encoder.classes_

array(['causes', 'complications', 'considerations', 'desc', 'dont answer',
       'exams and tests', 'frequency', 'general', 'genetic changes',
       'information', 'inheritance', 'outlook', 'precaution',
       'prevention', 'research', 'stages', 'support groups',
       'susceptibility', 'symptoms', 'treatment'], dtype=object)

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Run the model over the validation split. trainer.predict returns
# (predictions, label_ids, metrics); the metrics are not needed here.
predictions, labels, _ = trainer.predict(tokenized_eval)
predicted_classes = np.argmax(predictions, axis=1)

# Pass every encoded class id explicitly so the report rows line up with
# target_names even when a class is absent from the validation split.
label_list = list(range(len(label_encoder.classes_)))
report = classification_report(
    labels,
    predicted_classes,
    labels=label_list,
    target_names=label_encoder.classes_,
    digits=5,
    # Metrics that would divide by zero (a class with no samples) are reported
    # as 0 either way; stating it explicitly avoids one UndefinedMetricWarning
    # per affected metric flooding the cell output.
    zero_division=0,
)

print(report)

                 precision    recall  f1-score   support

         causes    1.00000   1.00000   1.00000       159
  complications    1.00000   1.00000   1.00000         6
 considerations    1.00000   1.00000   1.00000        48
           desc    1.00000   1.00000   1.00000       128
    dont answer    0.00000   0.00000   0.00000         0
exams and tests    0.99153   1.00000   0.99574       117
      frequency    1.00000   0.99167   0.99582       240
        general    1.00000   1.00000   1.00000        79
genetic changes    1.00000   1.00000   1.00000       223
    information    0.99777   1.00000   0.99888       895
    inheritance    1.00000   1.00000   1.00000       292
        outlook    1.00000   1.00000   1.00000        75
     precaution    0.98413   0.96875   0.97638        64
     prevention    0.95349   0.97619   0.96471        42
       research    1.00000   1.00000   1.00000        73
         stages    1.00000   1.00000   1.00000        12
 support groups    0.00000   0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 7. Save Model

In [None]:
%pip install -q huggingface_hub



In [None]:
# Save the model to the Hugging Face Hub: authenticate first
from huggingface_hub import login

# Log in to Hugging Face (requires an account); the call renders an interactive login widget
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Push the fine-tuned model and its tokenizer to the same Hub repository.
#
# Trainer.push_to_hub() takes a commit message as its first argument and picks
# the target repository from TrainingArguments (hub_model_id / output_dir), so
# passing the repo id there uploads the model to a different repository than
# the tokenizer. The model's own push_to_hub() takes the repo id directly.
# trainer.model is used so that the weights restored by
# load_best_model_at_end are the ones uploaded.
hub_repo_id = "akmaldika/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-medical-question-classifier"
trainer.model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/akmaldika/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-medical-question-classifier/commit/a403791f6d3cd794f740b3fd8922fb59ecbed6d4', commit_message='Upload tokenizer', commit_description='', oid='a403791f6d3cd794f740b3fd8922fb59ecbed6d4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/akmaldika/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-medical-question-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='akmaldika/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-medical-question-classifier'), pr_revision=None, pr_num=None)

In [None]:

# Keep a copy of the fine-tuned model and its tokenizer in the same Drive folder
base_path_model = "/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-medical-question-classifier"
trainer.save_model(output_dir=base_path_model)
tokenizer.save_pretrained(save_directory=base_path_model)

('/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-medical-question-classifier/tokenizer_config.json',
 '/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-medical-question-classifier/special_tokens_map.json',
 '/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-medical-question-classifier/vocab.txt',
 '/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-medical-question-classifier/added_tokens.json',
 '/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-medical-question-clas

# Training Pipeline for Question Classification using TF-IDF

## 1. Libraries and Dependencies

In [None]:
# Import additional libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

## 2. Data Gathering and Preparation

In [None]:
# 'df' already contains the combined dataset
# If not, load and combine datasets as before

## 3. Feature Extraction with TF-IDF

In [None]:
# Inputs are the raw question strings; targets are the question-type names
X, y = df['Question'], df['qtype']

# Fit a fresh encoder on the targets, then map each target to its integer id
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

### 3.1 Split Data into Training and Validation Sets

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/validation split with a fixed seed
split_parts = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

### 3.2 TF-IDF Vectorization

In [None]:
# Unigram + bigram TF-IDF features, limited to 5000 features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)

# Learn the vocabulary from the training questions only and vectorize them
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Vectorize the validation questions with the already-fitted vocabulary
X_val_tfidf = tfidf_vectorizer.transform(X_val)

## 4. Model Training with Random Forest

In [None]:
# 100-tree random forest with a fixed seed
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit on the TF-IDF training features; the fitted estimator is the cell's displayed result
rf_classifier.fit(X_train_tfidf, y_train)

## 5. Model Evaluation

In [None]:
# Predict encoded class ids for the TF-IDF validation features
y_pred = rf_classifier.predict(X_val_tfidf)

### 5.1 Evaluation Metrics

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Overall accuracy on the validation split
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Pass every encoded class id explicitly so the report rows line up with
# target_names even when a class is absent from the validation split
label_list = list(range(len(label_encoder.classes_)))

# Per-class precision / recall / F1. zero_division=0 keeps the reported values
# unchanged (undefined metrics are shown as 0) but stops scikit-learn from
# emitting an UndefinedMetricWarning for every class without samples.
report = classification_report(
    y_val,
    y_pred,
    target_names=label_encoder.classes_,
    labels=label_list,
    digits=5,
    zero_division=0,
)
print(report)

Validation Accuracy: 0.99693
                 precision    recall  f1-score   support

         causes    1.00000   1.00000   1.00000       159
  complications    1.00000   0.83333   0.90909         6
 considerations    1.00000   1.00000   1.00000        48
           desc    1.00000   1.00000   1.00000       128
    dont answer    0.00000   0.00000   0.00000         0
exams and tests    0.99145   0.99145   0.99145       117
      frequency    1.00000   0.99583   0.99791       240
        general    1.00000   1.00000   1.00000        79
genetic changes    1.00000   1.00000   1.00000       223
    information    0.99888   1.00000   0.99944       895
    inheritance    1.00000   1.00000   1.00000       292
        outlook    1.00000   1.00000   1.00000        75
     precaution    0.93846   0.95312   0.94574        64
     prevention    0.90476   0.90476   0.90476        42
       research    1.00000   1.00000   1.00000        73
         stages    1.00000   1.00000   1.00000        12
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 6. Save Model

In [None]:
import joblib
import os

# Directory on Drive that holds the TF-IDF + Random Forest artifacts
base_path_model = "/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification/tfidf-randomforest-medical-question-classifier"

# Create the directory (and any missing parents) if needed; exist_ok=True
# replaces the separate exists() check and makes the cell safe to re-run
os.makedirs(base_path_model, exist_ok=True)

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, f'{base_path_model}/tfidf_vectorizer.joblib')

# Save the Random Forest classifier
joblib.dump(rf_classifier, f'{base_path_model}/random_forest_classifier.joblib')

['/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification/tfidf-randomforest-medical-question-classifier/random_forest_classifier.joblib']

In [None]:
%pip install huggingface_hub

In [None]:
from huggingface_hub import HfApi, HfFolder, Repository, login

# Log in to Hugging Face so the push cell below is authorised
login()  # You'll be prompted to enter your Hugging Face token

In [None]:
# Clone the repository
!git lfs install
repository = Repository(local_dir="tfidf-randomforest-medical-question-classifier", clone_from="akmaldika/tfidf-randomforest-medical-question-classifier")

# Move the model files into the repository directory
!export base_path_model="/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification/tfidf-randomforest-medical-question-classifier"
!cp $base_path_model/tfidf_vectorizer.joblib tfidf-randomforest-medical-question-classifier/
!cp $base_path_model/random_forest_classifier.joblib tfidf-randomforest-medical-question-classifier/

# Navigate to the repository directory and push the files
%cd tfidf-randomforest-medical-question-classifier
!git add .
!git commit -m "Update TF-IDF vectorizer and Random Forest Meidcal Question classifier"
!git push
%cd ..

# Training Pipeline for Question Classification using SVD

## 1. Libraries and Dependencies

In [None]:
# Import additional libraries
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression

## 2. Data Gathering and Preparation

In [None]:
# 'df' already contains the combined dataset
# If not, load and combine datasets as before

## 3. Feature Extraction with TF-IDF and SVD

In [None]:
# Question text is the model input; the question type is the target
X = df['Question']
y = df['qtype']

# Re-create the label encoder for this pipeline and encode the targets as integer ids
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

### 3.1 Split Data into Training and Validation Sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the questions for validation, using the same fixed seed as the other pipelines
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    random_state=42,
    test_size=0.2,
)

### 3.2 TF-IDF Vectorization

In [None]:
# TF-IDF over unigrams and bigrams, limited to 5000 features
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=5000,
)

# Fit the vocabulary on the training questions and vectorize them in one step
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Re-use the fitted vocabulary for the validation questions
X_val_tfidf = tfidf_vectorizer.transform(X_val)

### 3.3 Dimensionality Reduction with SVD

In [None]:
from sklearn.decomposition import TruncatedSVD

# Reduce the sparse TF-IDF features to a dense, lower-dimensional representation
n_components = 300  # You can adjust this number based on experimentation
svd = TruncatedSVD(random_state=42, n_components=n_components)

# Learn the projection on the training features and apply it to them
X_train_svd = svd.fit_transform(X_train_tfidf)

# Project the validation features with the already-fitted transformer
X_val_svd = svd.transform(X_val_tfidf)

## 4. Model Training with Random Forest

In [None]:
# Import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Same forest configuration as the TF-IDF pipeline: 100 trees, fixed seed
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit on the SVD-reduced training features; the fitted estimator is the cell's displayed result
rf_classifier.fit(X_train_svd, y_train)

## 5. Model Evaluation

In [None]:
# Predict encoded class ids for the SVD-reduced validation features
y_pred = rf_classifier.predict(X_val_svd)

### 5.1 Evaluation Metrics

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Overall accuracy of the SVD + Random Forest pipeline on the validation split
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# One id per encoder class, so every class gets a report row matching target_names
label_list = list(range(len(label_encoder.classes_)))

# Per-class report. zero_division=0 reports undefined metrics as 0 (the same
# values as the default) without raising an UndefinedMetricWarning for each
# class that has no samples in this split.
report = classification_report(
    y_val,
    y_pred,
    labels=label_list,
    target_names=label_encoder.classes_,
    digits=5,
    zero_division=0,
)
print(report)

Validation Accuracy: 0.99721
                 precision    recall  f1-score   support

         causes    1.00000   1.00000   1.00000       159
  complications    1.00000   0.66667   0.80000         6
 considerations    1.00000   1.00000   1.00000        48
           desc    0.99225   1.00000   0.99611       128
    dont answer    0.00000   0.00000   0.00000         0
exams and tests    0.98305   0.99145   0.98723       117
      frequency    1.00000   0.99583   0.99791       240
        general    1.00000   1.00000   1.00000        79
genetic changes    1.00000   1.00000   1.00000       223
    information    0.99777   1.00000   0.99888       895
    inheritance    1.00000   1.00000   1.00000       292
        outlook    1.00000   1.00000   1.00000        75
     precaution    0.98438   0.98438   0.98438        64
     prevention    0.97561   0.95238   0.96386        42
       research    1.00000   1.00000   1.00000        73
         stages    1.00000   1.00000   1.00000        12
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 6. Save Model

In [None]:
import joblib
import os

# Directory on Drive that holds the TF-IDF + SVD + Random Forest artifacts
base_path_model = "/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification/svd-randomforest-medical-question-classifier"

# Create the directory (and any missing parents) if needed; exist_ok=True
# replaces the separate exists() check and makes the cell safe to re-run
os.makedirs(base_path_model, exist_ok=True)

# Save the TF-IDF vectorizer this pipeline was fitted with
joblib.dump(tfidf_vectorizer, f'{base_path_model}/tfidf_vectorizer.joblib')

# Save the SVD transformer
joblib.dump(svd, f'{base_path_model}/svd_transformer.joblib')

# Save the Random Forest classifier
joblib.dump(rf_classifier, f'{base_path_model}/random_forest_classifier.joblib')

['/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification/svd-randomforest-medical-question-classifier/random_forest_classifier.joblib']

In [None]:
%pip install huggingface_hub

In [None]:
from huggingface_hub import HfApi, HfFolder, Repository, login

# Log in to Hugging Face so the push cell below is authorised
login()  # You'll be prompted to enter your Hugging Face token

In [None]:
# Clone the repository (replace with your username and repository name)
!git lfs install
repository = Repository(local_dir="svd-logistic-medical-question-classifier", clone_from="your-username/svd-logistic-medical-question-classifier")

# Move the model files into the repository directory
!mv tfidf_vectorizer.joblib svd-transformer.joblib logistic_regression_classifier.joblib svd-logistic-medical-question-classifier/

# Navigate to the repository directory and push the files
%cd svd-logistic-medical-question-classifier
!git add .
!git commit -m "Add TF-IDF vectorizer, SVD transformer, and Logistic Regression classifier"
!git push

# Testing

In [None]:
import pandas as pd
import joblib
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Define base path to the dataset and models
base_path = "/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/data/prompt_classification"
base_path_model = "/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification"

# Load datasets and take 10 examples from each
datasets = [
    "generated_general_dataset.csv",
    "generated_desc_dataset.csv",
    "generated_symptom_dataset.csv",
    "generated_precaution_dataset.csv",
    "generated_dont_answer_dataset.csv"
]

# The per-file frame gets its own name so the notebook-wide `df` used by the
# training sections is not silently overwritten by this loop.
examples = []
for dataset in datasets:
    dataset_df = pd.read_csv(f"{base_path}/{dataset}")
    examples.append(dataset_df.sample(10, random_state=42))

# Combine all examples into a single DataFrame
combined_examples = pd.concat(examples)

# Load the saved models
tfidf_vectorizer = joblib.load(f"{base_path_model}/tfidf-randomforest-medical-question-classifier/tfidf_vectorizer.joblib")
rf_classifier = joblib.load(f"{base_path_model}/tfidf-randomforest-medical-question-classifier/random_forest_classifier.joblib")

# The SVD pipeline was fitted on its own TF-IDF vectorizer, saved next to the
# SVD transformer; load that one so the SVD model sees the features it was
# trained on instead of relying on the two vectorizers happening to match.
tfidf_vectorizer_svd = joblib.load(f"{base_path_model}/svd-randomforest-medical-question-classifier/tfidf_vectorizer.joblib")
svd = joblib.load(f"{base_path_model}/svd-randomforest-medical-question-classifier/svd_transformer.joblib")
rf_classifier_svd = joblib.load(f"{base_path_model}/svd-randomforest-medical-question-classifier/random_forest_classifier.joblib")

tokenizer = AutoTokenizer.from_pretrained("akmaldika/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-medical-question-classifier")
model = AutoModelForSequenceClassification.from_pretrained("akmaldika/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-medical-question-classifier")
# Inference only: make sure the model is in evaluation mode
model.eval()

# Load the LabelEncoder
label_encoder = joblib.load(f"{base_path_model}/label_encoder.joblib")

# Function to generate predictions using the TF-IDF + Random Forest model
def predict_tfidf_rf(text):
    """Return the predicted question-type name for one question (TF-IDF + Random Forest)."""
    text_tfidf = tfidf_vectorizer.transform([text])
    prediction = rf_classifier.predict(text_tfidf)
    return label_encoder.inverse_transform(prediction)[0]

# Function to generate predictions using the TF-IDF + SVD + Random Forest model
def predict_svd_rf(text):
    """Return the predicted question-type name for one question (TF-IDF + SVD + Random Forest)."""
    text_tfidf = tfidf_vectorizer_svd.transform([text])
    text_svd = svd.transform(text_tfidf)
    prediction = rf_classifier_svd.predict(text_svd)
    return label_encoder.inverse_transform(prediction)[0]

# Function to generate predictions using the BERT model
def predict_bert(text):
    """Return the predicted question-type name for one question (fine-tuned BERT classifier)."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # No gradients are needed for prediction; skipping them saves memory and time
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return label_encoder.inverse_transform([prediction])[0]

# Run all three classifiers on every sampled question and collect one record per question
results = []
for text, actual in zip(combined_examples["Question"], combined_examples["qtype"]):
    record = {
        "Question": text,
        "Actual": actual,
        "TF-IDF + RF Prediction": predict_tfidf_rf(text),
        "SVD + RF Prediction": predict_svd_rf(text),
        "BERT Prediction": predict_bert(text),
    }
    results.append(record)

# Build the comparison table from the records and render it
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,Question,Actual,TF-IDF + RF Prediction,SVD + RF Prediction,BERT Prediction
0,I have Chronic cholestasis,general,general,general,general
1,Is it possible that I have GERD?,general,general,general,general
2,I think I might have Hepatitis E,general,general,general,general
3,Could hepatitis A be affecting me?,general,general,desc,dont answer
4,Do I have Arthritis?,general,general,general,general
5,Could AIDS be affecting me?,general,general,desc,dont answer
6,Do I have Typhoid?,general,general,general,general
7,Do I have Acne?,general,general,general,general
8,I have Heart attack,general,general,general,general
9,I have Paralysis (brain hemorrhage),general,general,general,general


In [None]:
import pandas as pd
import joblib
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import classification_report
import numpy as np

# Define base path to the dataset and models
base_path = "/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/data/prompt_classification"
base_path_model = "/content/drive/MyDrive/ITB/Semester 7/IF5153 Pemrosesan Bahasa Alami/NLP Tubes 1/models/classification"

# Load the dataset
df = pd.read_csv(f"{base_path}/generated_dont_answer_dataset.csv")

# Load the saved models
tfidf_vectorizer = joblib.load(f"{base_path_model}/tfidf-randomforest-medical-question-classifier/tfidf_vectorizer.joblib")
rf_classifier = joblib.load(f"{base_path_model}/tfidf-randomforest-medical-question-classifier/random_forest_classifier.joblib")

# The SVD pipeline was fitted on its own TF-IDF vectorizer, saved next to the
# SVD transformer; load that one so the SVD model sees the features it was
# trained on instead of relying on the two vectorizers happening to match.
tfidf_vectorizer_svd = joblib.load(f"{base_path_model}/svd-randomforest-medical-question-classifier/tfidf_vectorizer.joblib")
svd = joblib.load(f"{base_path_model}/svd-randomforest-medical-question-classifier/svd_transformer.joblib")
rf_classifier_svd = joblib.load(f"{base_path_model}/svd-randomforest-medical-question-classifier/random_forest_classifier.joblib")

tokenizer = AutoTokenizer.from_pretrained("akmaldika/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-medical-question-classifier")
model = AutoModelForSequenceClassification.from_pretrained("akmaldika/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-medical-question-classifier")
# Inference only: make sure the model is in evaluation mode
model.eval()

# Load the LabelEncoder
label_encoder = joblib.load(f"{base_path_model}/label_encoder.joblib")

# NOTE: the predict_* helpers below return encoded class ids (not label names),
# so their outputs can be compared directly with the encoded true labels.

# Function to generate predictions using the TF-IDF + Random Forest model
def predict_tfidf_rf(text):
    """Return the predicted class id for one question (TF-IDF + Random Forest)."""
    text_tfidf = tfidf_vectorizer.transform([text])
    prediction = rf_classifier.predict(text_tfidf)
    return prediction[0]

# Function to generate predictions using the TF-IDF + SVD + Random Forest model
def predict_svd_rf(text):
    """Return the predicted class id for one question (TF-IDF + SVD + Random Forest)."""
    text_tfidf = tfidf_vectorizer_svd.transform([text])
    text_svd = svd.transform(text_tfidf)
    prediction = rf_classifier_svd.predict(text_svd)
    return prediction[0]

# Function to generate predictions using the BERT model
def predict_bert(text):
    """Return the predicted class id for one question (fine-tuned BERT classifier)."""
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512)
    # No gradients are needed for prediction; skipping them saves memory and
    # time across the per-row calls made below
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(
        outputs.logits,
        dim=1
        ).item()
    return prediction

# Generate predictions for the entire dataset
tfidf_rf_predictions = df['Question'].apply(predict_tfidf_rf)
svd_rf_predictions = df['Question'].apply(predict_svd_rf)
bert_predictions = df['Question'].apply(predict_bert)

# Encode the true labels
true_labels = label_encoder.transform(df['qtype'])



In [24]:
# One id per encoder class, so every class gets a report row matching target_names
label_list = list(range(len(label_encoder.classes_)))

# Options shared by the three reports. zero_division=0 reports undefined
# metrics as 0 (the same values as the default) without raising an
# UndefinedMetricWarning for every class that has no samples in this dataset.
report_options = dict(
    target_names=label_encoder.classes_,
    labels=label_list,
    digits=5,
    zero_division=0,
)

# Generate classification reports
tfidf_rf_report = classification_report(true_labels, tfidf_rf_predictions, **report_options)
svd_rf_report = classification_report(true_labels, svd_rf_predictions, **report_options)
bert_report = classification_report(true_labels, bert_predictions, **report_options)

# Print the classification reports
print("TF-IDF + Random Forest Classification Report:")
print(tfidf_rf_report)

print("TF-IDF + SVD + Random Forest Classification Report:")
print(svd_rf_report)

print("BERT Classification Report:")
print(bert_report)

TF-IDF + Random Forest Classification Report:
                 precision    recall  f1-score   support

         causes    0.00000   0.00000   0.00000         0
  complications    0.00000   0.00000   0.00000         0
 considerations    0.00000   0.00000   0.00000         0
           desc    0.00000   0.00000   0.00000         0
    dont answer    1.00000   0.03935   0.07573      3710
exams and tests    0.00000   0.00000   0.00000         0
      frequency    0.00000   0.00000   0.00000         0
        general    0.00000   0.00000   0.00000         0
genetic changes    0.00000   0.00000   0.00000         0
    information    0.00000   0.00000   0.00000         0
    inheritance    0.00000   0.00000   0.00000         0
        outlook    0.00000   0.00000   0.00000         0
     precaution    0.00000   0.00000   0.00000         0
     prevention    0.00000   0.00000   0.00000         0
       research    0.00000   0.00000   0.00000         0
         stages    0.00000   0.00000   0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize