In [None]:
!pip install transformers datasets evaluate accelerate



In [None]:
import math
import os

import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Resolve where the dataset splits live.
# On Google Colab the data is read from the mounted Drive; anywhere else the
# notebook falls back to a local ``data/`` directory.
try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: use paths relative to the working directory.
    train_path = 'data/train.tsv'
    test_path = 'data/test.tsv'
    dev_path = 'data/dev.tsv'
else:
    # Only a missing `google.colab` module triggers the local fallback. A
    # failure while mounting Drive is allowed to surface here instead of being
    # swallowed and showing up later as a confusing missing-file error.
    drive.mount('/content/gdrive')

    train_path = '/content/gdrive/MyDrive/advanced-ml-project/data/train.tsv'
    test_path = '/content/gdrive/MyDrive/advanced-ml-project/data/test.tsv'
    dev_path = '/content/gdrive/MyDrive/advanced-ml-project/data/dev.tsv'

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


device(type='cuda', index=0)

## Load Data

In [None]:
# Read the three tab-separated splits (first row is the header) and report
# how many rows each one contains.
train = pd.read_csv(train_path, header=0, sep='\t')
print(f'Train: {len(train)}')

test = pd.read_csv(test_path, header=0, sep='\t')
print(f'Test: {len(test)}')

dev = pd.read_csv(dev_path, header=0, sep='\t')
print(f'Dev: {len(dev)}')

# Class balance of the training labels, followed by a preview of the first rows.
print(train['label'].value_counts())
train.head(10)

Train: 8891
Test: 3245
Dev: 4496
moderate          6019
not depression    1971
severe             901
Name: label, dtype: int64


Unnamed: 0,PID,text,label
0,train_pid_1,Waiting for my mind to have a breakdown once t...,moderate
1,train_pid_2,My new years resolution : I'm gonna get my ass...,moderate
2,train_pid_3,New year : Somone else Feeling like 2020 will ...,moderate
3,train_pid_4,"My story I guess : Hi, Im from Germany and my ...",moderate
4,train_pid_5,Sat in the dark and cried myself going into th...,moderate
5,train_pid_6,I will probably end it when my mum isn't aroun...,moderate
6,train_pid_7,Fuck 2019 : Left abusive relationship. Moved i...,moderate
7,train_pid_8,I am at a new year's eve party and I want to c...,moderate
8,train_pid_9,Death of my father : My father died in the beg...,moderate
9,train_pid_10,Empty and stuck in a loop every day : In any o...,moderate


## Generating emotion scores

In [None]:
# Load the pretrained tokenizer and sequence-classification model.
#
# SECURITY: the Hugging Face access token must never be hardcoded in the
# notebook. It is read from the HF_TOKEN environment variable instead; when the
# variable is unset, None is passed and no explicit token is supplied.
# NOTE(review): a token was previously committed in this cell -- treat it as
# compromised and revoke it in the Hugging Face account settings.
hf_token = os.environ.get('HF_TOKEN')

tokenizer = AutoTokenizer.from_pretrained("kwang123/bert-sentiment-analysis", token=hf_token)
model = AutoModelForSequenceClassification.from_pretrained(
    "kwang123/bert-sentiment-analysis",
    num_labels=5,  # the classification head emits 5 logits per input
).to(device)
# Put the model in evaluation mode for the scoring cells below.
model.eval()

# Number of texts sent through the model per forward pass.
batch_size = 128

In [None]:
# Score every training text with the classifier, one mini-batch at a time,
# and store the raw logits in a new `emotion_scores` column.
text = train['text'].to_list()
emotion_scores = []

# Stepping by `batch_size` yields ceil(len(text) / batch_size) iterations.
# Slicing past the end of a list is safe, so the final (shorter) batch needs
# no special handling.
for start in tqdm(range(0, len(text), batch_size), desc='Generating emotion score'):
    batch = text[start:start + batch_size]
    # Each sequence is padded or truncated to exactly 512 tokens.
    inputs = tokenizer(
        batch,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt',
    ).to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Keep the logits as plain Python lists (one list per text).
    emotion_scores.extend(logits.cpu().tolist())

train['emotion_scores'] = emotion_scores

Generating emotion score:   0%|          | 0/70 [00:00<?, ?it/s]

In [None]:
# Score every test text with the classifier, one mini-batch at a time,
# and store the raw logits in a new `emotion_scores` column.
text = test['text'].to_list()
emotion_scores = []

# Stepping by `batch_size` yields ceil(len(text) / batch_size) iterations.
# Slicing past the end of a list is safe, so the final (shorter) batch needs
# no special handling.
for start in tqdm(range(0, len(text), batch_size), desc='Generating emotion score'):
    batch = text[start:start + batch_size]
    # Each sequence is padded or truncated to exactly 512 tokens.
    inputs = tokenizer(
        batch,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt',
    ).to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Keep the logits as plain Python lists (one list per text).
    emotion_scores.extend(logits.cpu().tolist())

test['emotion_scores'] = emotion_scores

Generating emotion score:   0%|          | 0/26 [00:00<?, ?it/s]

In [None]:
# Score every dev text with the classifier, one mini-batch at a time,
# and store the raw logits in a new `emotion_scores` column.
text = dev['text'].to_list()
emotion_scores = []

# Stepping by `batch_size` yields ceil(len(text) / batch_size) iterations.
# Slicing past the end of a list is safe, so the final (shorter) batch needs
# no special handling.
for start in tqdm(range(0, len(text), batch_size), desc='Generating emotion score'):
    batch = text[start:start + batch_size]
    # Each sequence is padded or truncated to exactly 512 tokens.
    inputs = tokenizer(
        batch,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt',
    ).to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Keep the logits as plain Python lists (one list per text).
    emotion_scores.extend(logits.cpu().tolist())

dev['emotion_scores'] = emotion_scores

Generating emotion score:   0%|          | 0/36 [00:00<?, ?it/s]

In [None]:
# Write each scored split next to the file it was loaded from, so the save
# works both on Colab (Drive paths) and with the local `data/` fallback.
# On Colab the resulting paths are the same as the previously hardcoded ones.
# The files are tab-separated despite the `.csv` extension; the names are kept
# as-is so existing consumers of these files are unaffected.
train.to_csv(os.path.join(os.path.dirname(train_path), 'train_emotion.csv'), sep='\t', index=False)
test.to_csv(os.path.join(os.path.dirname(test_path), 'test_emotion.csv'), sep='\t', index=False)
dev.to_csv(os.path.join(os.path.dirname(dev_path), 'dev_emotion.csv'), sep='\t', index=False)