In [1]:
import pandas as pd
t_data = pd.read_csv("/kaggle/input/d/king2001/depression/depress.csv")
print(t_data.info())
v_data = pd.read_csv("/kaggle/input/d/king2001/depression/depress_valid.csv")
print(v_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49547 entries, 0 to 49546
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    49547 non-null  object
 1   label   49547 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 774.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9202 entries, 0 to 9201
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    9202 non-null   object
 1   label   9202 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 143.9+ KB
None


In [2]:
from datasets import Dataset, DatasetDict
emotions = DatasetDict({
    'train': Dataset.from_pandas(t_data),
    'validation': Dataset.from_pandas(v_data)
})
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 49547
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 9202
    })
})

In [3]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

from transformers import AutoModelForSequenceClassification
num_labels = 2
model = (AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device))
emotions_encoded["train"].features

cuda


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]



Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [4]:
emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
emotions_encoded["train"].features

{'text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [5]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [15]:
from transformers import Trainer, TrainingArguments

batch_size = 34
logging_steps = len(emotions_encoded["train"]) // batch_size
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  metric_for_best_model="f1",
                                  weight_decay=0.01,
                                  evaluation_strategy="steps",
                                  save_strategy="no",
                                  disable_tqdm=False)

In [16]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=emotions_encoded["train"],
                  eval_dataset=emotions_encoded["validation"])
trainer.train();

Step,Training Loss,Validation Loss,Accuracy,F1
500,0.2058,0.116158,0.95827,0.958319
1000,0.1254,0.086338,0.969681,0.969697




In [17]:
results = trainer.evaluate()
results

{'eval_loss': 0.07701198756694794,
 'eval_accuracy': 0.9742447294066507,
 'eval_f1': 0.9742525066671834,
 'eval_runtime': 89.8132,
 'eval_samples_per_second': 102.457,
 'eval_steps_per_second': 1.514,
 'epoch': 2.0}

In [24]:
trainer.save_model('distilbert-base-uncased-depress-model')

In [22]:
model.save_pretrained("distilbert-base-uncased-depress")
tokenizer.save_pretrained('./model/distilbert-base-uncased-depress')

('./model/distilbert-base-uncased-depress/tokenizer_config.json',
 './model/distilbert-base-uncased-depress/special_tokens_map.json',
 './model/distilbert-base-uncased-depress/vocab.txt',
 './model/distilbert-base-uncased-depress/added_tokens.json',
 './model/distilbert-base-uncased-depress/tokenizer.json')

In [18]:
preds_output = trainer.predict(emotions_encoded["validation"])
preds_output.metrics



{'test_loss': 0.07701198756694794,
 'test_accuracy': 0.9742447294066507,
 'test_f1': 0.9742525066671834,
 'test_runtime': 90.1225,
 'test_samples_per_second': 102.105,
 'test_steps_per_second': 1.509}

In [27]:
import os
import subprocess
from IPython.display import FileLink, display

def download_file(path, download_file_name):
    os.chdir('/kaggle/working/')
    zip_name = f"/kaggle/working/{download_file_name}.zip"
    command = f"zip {zip_name} {path} -r"
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(f'{download_file_name}.zip'))

download_file("/kaggle/working/distilbert-base-uncased-depress-model","distilbert-base-uncased-depress-model")

In [26]:
model.save_pretrained('./model')
tokenizer.save_pretrained('./distilbert-base-uncased-depress-model')

('./distilbert-base-uncased-depress-model/tokenizer_config.json',
 './distilbert-base-uncased-depress-model/special_tokens_map.json',
 './distilbert-base-uncased-depress-model/vocab.txt',
 './distilbert-base-uncased-depress-model/added_tokens.json',
 './distilbert-base-uncased-depress-model/tokenizer.json')