In [1]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import evaluate


# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so that
# any later use of `device` does not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training hyperparameters.
EPOCH = 1             # number of training epochs
BATCH = 6             # per-device batch size (training and evaluation)
SEED = 4222           # seed for the data splits, dataset shuffling and the Trainer
LEARNING_RATE = 1e-5

# Output locations, relative to the notebook's working directory.
SAVE_PATH = ".model/bert"
CHECKPOINT_PATH = ".model/bert_checkpoint"
LOG_PATH = ".model/bert_checkpoint/logs"


In [2]:
import pandas as pd
import numpy as np

def clean_dataset(df):
    """Return a cleaned copy of ``df`` with incomplete or non-finite rows removed.

    Rows that contain a missing value or +/-inf in any column are dropped, and
    the remaining values are cast with ``astype(np.bytes_)``. The frame passed
    in is left unmodified.

    Args:
        df: The ``pd.DataFrame`` to clean.

    Returns:
        A new ``pd.DataFrame`` holding only the surviving rows; their original
        index labels are kept.

    Raises:
        AssertionError: If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # Non-mutating dropna(): the caller's frame must not shrink as a side effect.
    without_missing = df.dropna()
    has_non_finite = without_missing.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    # np.bytes_ is the dtype the old np.string_ alias referred to; the alias
    # itself was removed in NumPy 2.0.
    return without_missing[~has_non_finite].astype(np.bytes_)

In [3]:

# Load the CSV (its header row is replaced by the names 'text' and 'label'),
# move the index into a regular column, and encode the label as 1/0.
df = (
    pd.read_csv(".data/Suicide_Detection_Final_Clean.csv", header=0, names=['text', 'label'])
    .reset_index()
    .assign(label=lambda frame: frame['label'].map({'suicide': 1, 'non-suicide': 0}))
)

# 75% of the rows go to training; the remaining 25% is halved into validation
# and test. Both splits are stratified on the label and share the same seed.
train, temp = train_test_split(df, test_size=0.25, stratify=df['label'], random_state=SEED)
val, test = train_test_split(temp, test_size=0.5, stratify=temp['label'], random_state=SEED)
     

In [4]:
# HuggingFace
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric

In [5]:
# Tokenizer and classifier are both loaded from the same "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# num_labels=2 requests a two-class head; the warning printed below reports that
# the classifier weights are newly initialised, so the model needs fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def dataset_conversion(train, test, val):
  """Bundle the three pandas splits into one ``DatasetDict``.

  The index of every frame is reset in place (old index dropped) before the
  frame is converted, so the DataFrames passed in are modified.

  Args:
    train: DataFrame with the training rows.
    test: DataFrame with the test rows.
    val: DataFrame with the validation rows.

  Returns:
    A ``DatasetDict`` with the keys "train", "test" and "val".
  """
  frames = {"train": train, "test": test, "val": val}

  for frame in frames.values():
    frame.reset_index(drop=True, inplace=True)

  return DatasetDict({split: Dataset.from_pandas(frame)
                      for split, frame in frames.items()})


raw_datasets = dataset_conversion(train, test, val)

In [7]:
def tokenize_function(dataset):
    """Tokenize the "text" column of ``dataset``.

    The tokenizer is called with truncation enabled and the "max_length"
    padding strategy; its output is returned unchanged.
    """
    texts = dataset["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# batched=True: the function is applied to batches of rows rather than single rows.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/83985 [00:00<?, ? examples/s]

Map:   0%|          | 0/13998 [00:00<?, ? examples/s]

Map:   0%|          | 0/13997 [00:00<?, ? examples/s]

In [12]:
SAMPLE_SIZE = 1500

# Shuffle each split with the shared seed and keep its first SAMPLE_SIZE rows.
train_dataset, test_dataset, val_dataset = (
    tokenized_datasets[split].shuffle(seed=SEED).select(range(SAMPLE_SIZE))
    for split in ("train", "test", "val")
)

In [13]:
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation result.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        (taken over the last axis of the logits) against the labels.
    """
    scores, gold = eval_pred
    return metric.compute(predictions=np.argmax(scores, axis=-1), references=gold)

training_args = TrainingArguments(
    # Output locations.
    output_dir=SAVE_PATH,
    overwrite_output_dir=True,
    logging_dir=LOG_PATH,
    # Optimisation settings.
    num_train_epochs=EPOCH,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    seed=SEED,
    # Checkpointing.
    # NOTE(review): save_steps is larger than the 250 steps shown by the
    # training progress bar below -- confirm the intended checkpoint cadence.
    save_strategy="steps",
    save_steps=1500,
)

# The Trainer receives the sampled train/validation sets and the accuracy callback.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
# Fine-tune `model` on `train_dataset` with the settings held in `training_args`.
trainer.train()

  0%|          | 0/250 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Persist the trainer's model under SAVE_PATH.
trainer.save_model(SAVE_PATH)

In [None]:
# Run the model over the sampled test set and display the resulting metrics.
trainer.predict(test_dataset).metrics

In [1]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import evaluate


# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so that
# any later use of `device` does not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training hyperparameters.
EPOCH = 1             # number of training epochs
BATCH = 6             # per-device batch size (training and evaluation)
SEED = 4222           # seed for the data splits, dataset shuffling and the Trainer
LEARNING_RATE = 1e-5

# Output locations, relative to the notebook's working directory.
SAVE_PATH = ".model/bert2"
CHECKPOINT_PATH = ".model/bert_checkpoint"
LOG_PATH = ".model/bert_checkpoint/logs"

import pandas as pd
import numpy as np

def clean_dataset(df):
    """Return a cleaned copy of ``df`` with incomplete or non-finite rows removed.

    Rows that contain a missing value or +/-inf in any column are dropped, and
    the remaining values are cast with ``astype(np.bytes_)``. The frame passed
    in is left unmodified.

    Args:
        df: The ``pd.DataFrame`` to clean.

    Returns:
        A new ``pd.DataFrame`` holding only the surviving rows; their original
        index labels are kept.

    Raises:
        AssertionError: If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # Non-mutating dropna(): the caller's frame must not shrink as a side effect.
    without_missing = df.dropna()
    has_non_finite = without_missing.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    # np.bytes_ is the dtype the old np.string_ alias referred to; the alias
    # itself was removed in NumPy 2.0.
    return without_missing[~has_non_finite].astype(np.bytes_)


# Load the CSV (its header row is replaced by the names 'text' and 'label'),
# move the index into a regular column, and encode the label as 1/0.
df = (
    pd.read_csv(".data/Suicide_Detection_Final_Clean.csv", header=0, names=['text', 'label'])
    .reset_index()
    .assign(label=lambda frame: frame['label'].map({'suicide': 1, 'non-suicide': 0}))
)

# 75% of the rows go to training; the remaining 25% is halved into validation
# and test. Both splits are stratified on the label and share the same seed.
train, temp = train_test_split(df, test_size=0.25, stratify=df['label'], random_state=SEED)
val, test = train_test_split(temp, test_size=0.5, stratify=temp['label'], random_state=SEED)

# HuggingFace
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric

# Tokenizer and classifier are both loaded from the same "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# num_labels=2 requests a two-class head. This base model is later rebound:
# further down the cell `model` is reloaded from "./.model/bert2".
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

def dataset_conversion(train, test, val):
  """Bundle the three pandas splits into one ``DatasetDict``.

  The index of every frame is reset in place (old index dropped) before the
  frame is converted, so the DataFrames passed in are modified.

  Args:
    train: DataFrame with the training rows.
    test: DataFrame with the test rows.
    val: DataFrame with the validation rows.

  Returns:
    A ``DatasetDict`` with the keys "train", "test" and "val".
  """
  frames = {"train": train, "test": test, "val": val}

  for frame in frames.values():
    frame.reset_index(drop=True, inplace=True)

  return DatasetDict({split: Dataset.from_pandas(frame)
                      for split, frame in frames.items()})


raw_datasets = dataset_conversion(train, test, val)

def tokenize_function(dataset):
    """Tokenize the "text" column of ``dataset``.

    The tokenizer is called with truncation enabled and the "max_length"
    padding strategy; its output is returned unchanged.
    """
    texts = dataset["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# batched=True: the function is applied to batches of rows rather than single rows.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)


SAMPLE_SIZE = 3500

# Shuffle each split with the shared seed and keep its first SAMPLE_SIZE rows.
train_dataset, test_dataset, val_dataset = (
    tokenized_datasets[split].shuffle(seed=SEED).select(range(SAMPLE_SIZE))
    for split in ("train", "test", "val")
)


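# Alternative: uncomment the three lines below to use the complete splits
# instead of the SAMPLE_SIZE-row samples.
# train_dataset = tokenized_datasets["train"]
# test_dataset = tokenized_datasets["test"]
# val_dataset = tokenized_datasets["val"]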

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation result.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        (taken over the last axis of the logits) against the labels.
    """
    scores, gold = eval_pred
    return metric.compute(predictions=np.argmax(scores, axis=-1), references=gold)

training_args = TrainingArguments(
    # Output locations.
    output_dir=SAVE_PATH,
    overwrite_output_dir=True,
    logging_dir=LOG_PATH,
    # Optimisation settings.
    num_train_epochs=EPOCH,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    seed=SEED,
    # Checkpointing.
    save_strategy="steps",
    save_steps=1500,
)

# Evaluate the fine-tuned checkpoint stored under SAVE_PATH.
#
# The model is loaded *before* the Trainer is created, so only one Trainer is
# ever built. Previously a first Trainer was constructed around the
# un-fine-tuned base model and thrown away immediately, wasting time and
# device memory.
model = AutoModelForSequenceClassification.from_pretrained(SAVE_PATH, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Score the sampled test set and print the metrics dict (loss, accuracy, timing).
print(trainer.predict(test_dataset).metrics)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/83985 [00:00<?, ? examples/s]

Map:   0%|          | 0/13998 [00:00<?, ? examples/s]

Map:   0%|          | 0/13997 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/584 [00:00<?, ?it/s]

{'test_loss': 0.19344021379947662, 'test_accuracy': 0.9497142857142857, 'test_runtime': 1676.1433, 'test_samples_per_second': 2.088, 'test_steps_per_second': 0.348}
