In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Import necessary libraries
!pip install datasets
import pandas as pd
import string
import nltk
import os
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load the datasets
train_file_path = '/content/drive/MyDrive/Colab Notebooks/INFO 5731/Group 9 Products/Project Share Folder/cleaned_train_data.csv'
test_file_path = '/content/drive/MyDrive/Colab Notebooks/INFO 5731/Group 9 Products/Project Share Folder/cleaned_test_data.csv'

train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

# Limit to the first 1000 rows for both train and test
train_data = train_data.head(1000)
test_data = test_data.head(1000)

# Load BioBERT tokenizer (BioBERT is based on BERT)
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-v1.1")

# Tokenization function
def tokenize_function(examples):
    examples['cleaned_review'] = [' '.join(map(str, review)) for review in examples['cleaned_review']]
    return tokenizer(examples['cleaned_review'], padding="max_length", truncation=True, max_length=128)

# Convert data to the Hugging Face dataset format
train_dataset = Dataset.from_pandas(train_data[['cleaned_review']])
test_dataset = Dataset.from_pandas(test_data[['cleaned_review']])

# Apply tokenization to both datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Load BioBERT pre-trained model (for text classification)
model = BertForSequenceClassification.from_pretrained("dmis-lab/biobert-v1.1", num_labels=2)  # Assuming binary classification

# weights and biases are not required
os.environ["WANDB_DISABLED"] = "true"

# Prepare for training
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # Evaluate every epoch
)

# Initialize Trainer with our model and data
trainer = Trainer(
    model=model,                         # the pre-trained model
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (test set)
    tokenizer=tokenizer                  # tokenizer
)

# Train the model
trainer.train()

# Save the trained model
trainer.save_model('./side_effect_model')




Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.

In [None]:
# Import necessary libraries
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')
nltk.download('punkt_tab')

# Load the datasets (adjust file paths as needed)
train_file_path = '/content/drugsComTrain_raw.csv'  # Replace with your train dataset file path
test_file_path = '/content/drugsComTest_raw.csv'    # Replace with your test dataset file path

train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

# Limit to the first 1000 rows
train_data = train_data.head(1000)
test_data = test_data.head(1000)

# Define stopwords
stop_words = set(stopwords.words('english'))

# Define punctuations
punctuations = set(string.punctuation)

# Clean text function using predefined stopwords
def clean_text(text):
    # Tokenize text
    tokens = nltk.word_tokenize(text.lower())  # Convert to lowercase and tokenize
    # Remove stopwords and punctuation
    cleaned_tokens = [word for word in tokens if word not in stop_words and word not in punctuations]
    # Join tokens back into a string
    return ' '.join(cleaned_tokens)

# Filter the datasets to relevant columns
train_relevant = train_data[['drugName', 'condition', 'review']].copy()
test_relevant = test_data[['drugName', 'condition', 'review']].copy()

# Drop rows with missing reviews
train_relevant.dropna(subset=['review'], inplace=True)
test_relevant.dropna(subset=['review'], inplace=True)

# Apply text cleaning to the review column
train_relevant['cleaned_review'] = train_relevant['review'].apply(clean_text)
test_relevant['cleaned_review'] = test_relevant['review'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
