In [1]:
import numpy as np
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from src.utils.text_preprocessing import preprocess_text, tokenize
from src.utils.reporting import get_cross_validation_report
from src.utils.vector_space_analysis import *
from src.utils.common import *
from tqdm import tqdm
tqdm.pandas()

# Imported explicitly so this cell does not depend on a star-import (or leftover
# kernel state) happening to expose the `warnings` module.
import warnings

# Raw review excerpt; later cells read its 'text' and 'score' columns.
df = pd.read_csv('data/reviews_excerpt.csv')

# Add the preprocessed text column consumed by the modelling cells.
# All warnings raised during preprocessing are silenced for this block only.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # progress_apply is the tqdm-instrumented variant of Series.apply
    # (registered by the tqdm.pandas() call above); each element is one review text.
    df['text_pp'] = df['text'].progress_apply(
        lambda text: preprocess_text(text, removing_stopwords=False)
    )

100%|██████████| 12230/12230 [00:02<00:00, 5135.06it/s]


In [2]:
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
import torch


class BertTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that turns texts into BERT embeddings.

    Every text is tokenized and passed through the model on its own (a batch
    of one); ``embedding_func`` then reduces the raw model output to a single
    tensor per text, and ``transform`` stacks those tensors.

    Parameters
    ----------
    bert_tokenizer : object
        Tokenizer exposing ``encode_plus(text, ...)`` whose result contains an
        ``'input_ids'`` entry.
    bert_model : object
        Model that is callable as ``bert_model(input_ids, attention_mask)`` and
        exposes ``eval()``. It is switched to evaluation mode on construction
        and again at the start of every ``transform`` call.
    max_length : int, default=60
        Forwarded to the tokenizer as ``max_length`` (with ``truncation=True``).
    embedding_func : callable, optional
        Maps the raw model output to a tensor. When omitted, the vector at
        position 0 of the model's first output is used (presumably the
        ``[CLS]`` token for BERT tokenizers -- confirm for other tokenizers).

    Notes
    -----
    The constructor arguments are stored under their own names
    (``bert_tokenizer``, ``bert_model``) so that ``get_params``, ``set_params``,
    ``clone`` and ``repr`` inherited from ``BaseEstimator`` work. ``tokenizer``
    and ``model`` remain available as read/write aliases.
    """

    def __init__(self, bert_tokenizer, bert_model, *, max_length=60, embedding_func=None):
        # BaseEstimator.get_params() looks attributes up by constructor
        # parameter name, so each argument is stored under exactly that name.
        self.bert_tokenizer = bert_tokenizer
        self.bert_model = bert_model
        self.bert_model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func

        if self.embedding_func is None:
            # A named static method (rather than an inline lambda) keeps the
            # estimator picklable.
            self.embedding_func = self._first_token_embedding

    @property
    def tokenizer(self):
        """Alias of ``bert_tokenizer``."""
        return self.bert_tokenizer

    @tokenizer.setter
    def tokenizer(self, value):
        self.bert_tokenizer = value

    @property
    def model(self):
        """Alias of ``bert_model``."""
        return self.bert_model

    @model.setter
    def model(self, value):
        self.bert_model = value

    @staticmethod
    def _first_token_embedding(model_output):
        """Default reduction: position 0 along dim 1 of ``model_output[0]``, squeezed."""
        return model_output[0][:, 0, :].squeeze()

    def _tokenize(self, text):
        """Encode one text into ``(input_ids, attention_mask)`` tensors.

        Both tensors get a leading batch dimension of size 1 via ``unsqueeze(0)``.
        """
        input_ids = self.bert_tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            padding=True,
            max_length=self.max_length
        )['input_ids']

        # Only one sequence is encoded per call, so the mask marks every
        # position as a real token (all ones).
        attention_mask = [1] * len(input_ids)
        return (
            torch.tensor(input_ids).unsqueeze(0),
            torch.tensor(attention_mask).unsqueeze(0),
        )

    def _tokenize_and_predict(self, text):
        """Run one text through the model and reduce the output with ``embedding_func``."""
        input_ids, attention_mask = self._tokenize(text)
        model_output = self.bert_model(input_ids, attention_mask)
        return self.embedding_func(model_output)

    def transform(self, text_entries):
        """Embed every text and stack the results along a new first dimension.

        Parameters
        ----------
        text_entries : pandas.Series or iterable of str
            Texts to embed; a Series is converted to a plain list first.

        Returns
        -------
        torch.Tensor
            ``torch.stack`` of the per-text embeddings, in input order.

        Notes
        -----
        ``torch.stack`` is called on the list of embeddings, so an empty input
        is not supported.
        """
        if isinstance(text_entries, pd.Series):
            text_entries = text_entries.tolist()

        # Re-assert evaluation mode: the model object is shared with the caller
        # and may have been put back into training mode (or swapped through
        # set_params) since construction.
        self.bert_model.eval()

        # Inference only -- no gradients are needed.
        with torch.no_grad():
            return torch.stack([self._tokenize_and_predict(text) for text in text_entries])

    def fit(self, entries, labels=None):
        """No-op: nothing is learned from the data. Returns ``self``."""
        return self

In [3]:
from transformers import BertTokenizer, BertModel

# Tokenizer and model are both loaded from the same pretrained checkpoint name.
checkpoint_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
bert_model = BertModel.from_pretrained(checkpoint_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from IPython.display import display

# One seed for both the cross-validation report and SMOTE, so the synthetic
# minority samples SMOTE generates are reproducible between runs.
SEED = 0

# Used twice below: as a character cutoff on the raw text when selecting rows,
# and as the `max_length` handed to BertTransformer (which forwards it to the
# tokenizer).
max_length = 140

# Character count of each raw review (vectorised string accessor).
cc = df['text'].str.len()
sample_df = df[cc <= max_length].copy().reset_index(drop=True)

# Features are the preprocessed texts; targets are the review scores.
X, y = sample_df['text_pp'], sample_df['score'].to_numpy()

weighted_f1, report_df, confusion_df = get_cross_validation_report(
    X, y,
    # A factory, so each call builds a fresh pipeline object; the tokenizer
    # and BERT model objects themselves are shared between pipelines.
    model_factory=lambda: Pipeline([
        ('bert', BertTransformer(tokenizer, bert_model, max_length=max_length)),
        ('smote', SMOTE(random_state=SEED)),
        ('svc', SVC()),
    ]),
    seed=SEED
)
print(weighted_f1)
display(report_df)
display(confusion_df)

 20%|██        | 1/5 [01:18<05:13, 78.41s/it]

In [52]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import FeatureUnion
from IPython.display import display

# One seed for both the cross-validation report and SMOTE, so the synthetic
# minority samples SMOTE generates are reproducible between runs.
SEED = 0

# Used twice below: as a character cutoff on the raw text when selecting rows,
# and as the `max_length` handed to BertTransformer (which forwards it to the
# tokenizer).
max_length = 140

# Character count of each raw review (vectorised string accessor).
cc = df['text'].str.len()
sample_df = df[cc <= max_length].copy().reset_index(drop=True)

# Features are the preprocessed texts; targets are the review scores.
X, y = sample_df['text_pp'], sample_df['score'].to_numpy()

weighted_f1, report_df, confusion_df = get_cross_validation_report(
    X, y,
    # A factory, so each call builds a fresh pipeline object; the tokenizer
    # and BERT model objects themselves are shared between pipelines.
    model_factory=lambda: Pipeline([
        # BERT embeddings and unigram TF-IDF features (project `tokenize`
        # helper called with stem=True) are concatenated column-wise.
        ('transform', FeatureUnion([
            ('bert', BertTransformer(tokenizer, bert_model, max_length=max_length)),
            ('tfidf', TfidfVectorizer(ngram_range=(1, 1), tokenizer=lambda row: tokenize(row, stem=True))),
        ])),
        ('smote', SMOTE(random_state=SEED)),
        ('svc', SVC()),
    ]),
    seed=SEED
)
print(weighted_f1)
display(report_df)
display(confusion_df)

100%|██████████| 5/5 [07:54<00:00, 94.83s/it] 

0.4706





Unnamed: 0,precision,recall,f1,support
1.0,0.551613,0.566225,0.558824,302.0
2.0,0.373585,0.39759,0.385214,249.0
3.0,0.348148,0.408696,0.376,230.0
4.0,0.379167,0.305369,0.33829,298.0
5.0,0.618182,0.608696,0.613402,391.0


Unnamed: 0,Pred 1.0,Pred 2.0,Pred 3.0,Pred 4.0,Pred 5.0
True 1.0,171,68,37,10,16
True 2.0,54,99,66,18,12
True 3.0,37,55,94,31,13
True 4.0,18,28,55,91,106
True 5.0,30,15,18,90,238
