In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm


In [None]:
! pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoModel

# Read the whole dataset from CSV (path is relative to the working directory).
df_all = pd.read_csv('input/depression_data.csv')

# Keep 20% of the rows aside for evaluation; the fixed random_state makes the
# train/test partition reproducible across runs.
df_train, df_test = train_test_split(
    df_all,
    test_size=0.2,
    random_state=13,
)

In [None]:
# Single source of truth for the pretrained checkpoint: the tokenizer and the
# model are loaded from the same name so they cannot drift apart by accident.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT)

In [6]:
from typing import Callable, List, Optional, Tuple
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
import torch

class BertTransformer(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that turns raw strings into BERT embeddings.

    Every input string is tokenized, passed through ``bert_model`` on its own
    (batch of one) and reduced to a single vector by ``embedding_func``.

    Parameters
    ----------
    bert_tokenizer
        Tokenizer object exposing ``encode_plus``.
    bert_model
        Model invoked as ``bert_model(input_ids, attention_mask)``. It is
        switched to eval mode when the transformer is constructed.
    max_length : int, default=60
        Maximum number of tokens (special tokens included) kept per text;
        longer texts are truncated.
    embedding_func : callable, optional
        Maps the raw model output for one text to its embedding. When ``None``
        (the default) the vector at the first token position of the first
        model output is used.
    """

    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 60,
            embedding_func: Optional[Callable[[torch.Tensor], torch.Tensor]] = None,
    ):
        self.bert_tokenizer = bert_tokenizer
        self.bert_model = bert_model
        self.bert_model.eval()
        self.max_length = max_length
        # Stored exactly as passed (scikit-learn estimators must not modify
        # constructor parameters). The default is resolved at call time, which
        # also avoids holding an unpicklable local lambda on the instance.
        self.embedding_func = embedding_func

    @staticmethod
    def _first_token_embedding(model_output) -> torch.Tensor:
        """Default embedding: first-token vector of the first model output."""
        return model_output[0][:, 0, :].squeeze()

    def _tokenize(self, text: str) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode one string into ``(input_ids, attention_mask)`` tensors.

        Both tensors get a leading batch dimension of size 1.
        """
        # truncation=True makes the max_length cut-off explicit instead of
        # relying on the tokenizer's deprecated implicit fallback.
        encoded = self.bert_tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
        )
        input_ids = encoded["input_ids"]

        # No padding is requested, so every position is a real token and the
        # attention mask is all ones.
        attention_mask = [1] * len(input_ids)

        # The model expects a batch, so add a leading batch dimension.
        return (
            torch.tensor(input_ids).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
        )

    def _tokenize_and_predict(self, text: str) -> torch.Tensor:
        """Return the embedding of a single string."""
        input_ids, attention_mask = self._tokenize(text)
        model_output = self.bert_model(input_ids, attention_mask)

        embed = self.embedding_func
        if embed is None:
            embed = self._first_token_embedding
        return embed(model_output)

    def transform(self, text: List[str]):
        """Embed each string in ``text`` and stack the results.

        Parameters
        ----------
        text : list of str or pandas.Series
            Texts to embed; a Series is converted to a list first.

        Returns
        -------
        torch.Tensor
            One embedding per input text, stacked along the first dimension.
        """
        if isinstance(text, pd.Series):
            text = text.tolist()

        # Inference only: gradients are not needed.
        with torch.no_grad():
            return torch.stack([self._tokenize_and_predict(string) for string in text])

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

In [7]:
from sklearn.pipeline import Pipeline

# `model` initially holds the BERT encoder but is rebound to the Pipeline at
# the end of this cell (later cells call `model.fit` / `model.predict`).
# If the cell is re-run, recover the encoder from the existing pipeline so the
# Pipeline itself is never passed to BertTransformer as `bert_model`.
if isinstance(model, Pipeline):
    bert_model = model.named_steps["vectorizer"].bert_model
else:
    bert_model = model

bert_transformer = BertTransformer(tokenizer, bert_model)

classifier = svm.SVC(kernel='linear')

# Text -> BERT embedding -> linear SVM.
model = Pipeline([
    ("vectorizer", bert_transformer),
    ("classifier", classifier),
])

In [22]:
# Inputs come from the text column, targets from the `is_depression` column.
train_X, train_y = df_train['clean_text'], df_train['is_depression']

In [23]:
# Fit the pipeline: the BertTransformer step embeds every training text, then
# the linear SVC is trained on those embeddings.
model.fit(train_X, train_y)

Pipeline(steps=[('vectorizer',
                 BertTransformer(bert_model=BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder...
                                 bert_tokenizer=PreTrainedTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}),
                                 embedding_func=<function BertTransformer.__init__.<locals>.<lambda> at 0x7f41a6306f80>)),
                ('classifier', SVC(kernel='linear'))])

In [25]:
# Held-out labels and texts, taken from the same columns used for training.
test_y = df_test['is_depression']
test_X = df_test['clean_text']

# Show how many rows are about to be scored.
print(len(test_X))

# Run the fitted pipeline over the held-out texts.
predictions = model.predict(test_X)

1547


In [35]:
# Derive the denominator from the data instead of hard-coding the test-set
# size, so the percentage stays correct if the dataset or split changes.
total = len(test_y)
assert total > 0, "test set is empty; cannot compute a success rate"
assert len(predictions) == total, "predictions and test_y must have the same length"

# Number of test rows where the predicted label differs from the true label.
failed = sum(1 for prediction, answer in zip(predictions, test_y) if prediction != answer)

print(f"Succes rate: {100 - round(failed/total*100, 2)} %")

Succes rate: 94.05 %
