In [1]:
import numpy as np
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from src.utils.text_preprocessing import preprocess_text, tokenize
from src.utils.reporting import get_cross_validation_report
from src.utils.vector_space_analysis import *
from src.utils.common import *
from tqdm import tqdm
tqdm.pandas()

# `warnings` is not imported explicitly by any visible import in this cell
# (presumably it leaked in through the wildcard imports); import it here so the
# cell does not depend on what those modules happen to re-export.
import warnings

# Load the review excerpt from a path relative to the notebook's working directory.
df = pd.read_csv('data/reviews_excerpt.csv')

# Suppress every warning raised while preprocessing; the previous warning
# filters are restored automatically when the `with` block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Preprocess each review text (stopwords are kept: removing_stopwords=False)
    # and store the result in a new 'text_pp' column. `progress_apply` is the
    # tqdm-enabled variant of `apply` registered by `tqdm.pandas()`.
    df['text_pp'] = df['text'].progress_apply(
        lambda text: preprocess_text(text, removing_stopwords=False)
    )

100%|██████████| 12230/12230 [00:02<00:00, 5780.99it/s]


In [3]:
from transformers import BertTokenizer, BertModel

# The tokenizer and the encoder must come from the same pretrained checkpoint,
# so the checkpoint name is written once and shared by both loaders.
bert_checkpoint = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = BertModel.from_pretrained(bert_checkpoint)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Simple approach: BERT transformer without attention mask, then SMOTE + SVC

In [5]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from IPython.display import display
from src.bert_transformer import BertTransformer

max_length = 140
# One seed for both the cross-validation helper and SMOTE, so that a rerun of
# this cell reproduces the same scores.
seed = 0

# Keep only reviews whose raw text is at most `max_length` characters long.
# The filter is applied to the raw 'text' column, not to 'text_pp'.
cc = df['text'].str.len()
sample_df = df[cc <= max_length].copy().reset_index(drop=True)

# Features are the preprocessed texts; targets are the review scores.
X, y = sample_df['text_pp'], sample_df['score'].to_numpy()

# A fresh pipeline is built by `model_factory` on each call.
# NOTE(review): `seed` presumably controls the fold split inside
# get_cross_validation_report — confirm in src.utils.reporting.
weighted_f1, report_df, confusion_df = get_cross_validation_report(
    X, y,
    model_factory=lambda: Pipeline([
        ('bert', BertTransformer(tokenizer, bert_model, max_length=max_length, use_attention_mask=False)),
        # SMOTE oversamples with randomly generated synthetic points; without a
        # fixed random_state the results differ from run to run.
        ('smote', SMOTE(random_state=seed)),
        ('svc', SVC()),
    ]),
    seed=seed
)

# Show the overall weighted F1, then the per-class report and confusion matrix.
print(weighted_f1)
display(report_df)
display(confusion_df)

100%|██████████| 5/5 [06:54<00:00, 83.00s/it] 

0.4596





Unnamed: 0,precision,recall,f1,support
1.0,0.540881,0.569536,0.554839,302.0
2.0,0.325203,0.321285,0.323232,249.0
3.0,0.3391,0.426087,0.377649,230.0
4.0,0.388393,0.291946,0.333333,298.0
5.0,0.615776,0.618926,0.617347,391.0


Unnamed: 0,Pred 1.0,Pred 2.0,Pred 3.0,Pred 4.0,Pred 5.0
True 1.0,172,69,34,11,16
True 2.0,62,80,74,20,13
True 3.0,35,55,98,25,17
True 4.0,18,29,59,87,105
True 5.0,31,13,24,81,242


# Approach with attention mask

In [6]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from IPython.display import display
from src.bert_transformer import BertTransformer

max_length = 140
# One seed for both the cross-validation helper and SMOTE, so that a rerun of
# this cell reproduces the same scores.
seed = 0

# Keep only reviews whose raw text is at most `max_length` characters long.
# The filter is applied to the raw 'text' column, not to 'text_pp'.
cc = df['text'].str.len()
sample_df = df[cc <= max_length].copy().reset_index(drop=True)

# Features are the preprocessed texts; targets are the review scores.
X, y = sample_df['text_pp'], sample_df['score'].to_numpy()

# Same pipeline as the simple approach, except that the BERT transformer is
# asked to use the attention mask (use_attention_mask=True).
# NOTE(review): `seed` presumably controls the fold split inside
# get_cross_validation_report — confirm in src.utils.reporting.
weighted_f1, report_df, confusion_df = get_cross_validation_report(
    X, y,
    model_factory=lambda: Pipeline([
        ('bert', BertTransformer(tokenizer, bert_model, max_length=max_length, use_attention_mask=True)),
        # SMOTE oversamples with randomly generated synthetic points; without a
        # fixed random_state the results differ from run to run.
        ('smote', SMOTE(random_state=seed)),
        ('svc', SVC()),
    ]),
    seed=seed
)

# Show the overall weighted F1, then the per-class report and confusion matrix.
print(weighted_f1)
display(report_df)
display(confusion_df)

100%|██████████| 5/5 [06:03<00:00, 72.80s/it]

0.4602





Unnamed: 0,precision,recall,f1,support
1.0,0.532258,0.546358,0.539216,302.0
2.0,0.326923,0.341365,0.333988,249.0
3.0,0.330882,0.391304,0.358566,230.0
4.0,0.39916,0.318792,0.354478,298.0
5.0,0.620513,0.618926,0.619718,391.0


Unnamed: 0,Pred 1.0,Pred 2.0,Pred 3.0,Pred 4.0,Pred 5.0
True 1.0,165,76,33,11,17
True 2.0,59,85,71,21,13
True 3.0,41,56,90,30,13
True 4.0,15,29,54,95,105
True 5.0,30,14,24,81,242


# Feature union: BERT transformer features combined with TF-IDF

In [7]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import FeatureUnion
from IPython.display import display

max_length = 140
# One seed for both the cross-validation helper and SMOTE, so that a rerun of
# this cell reproduces the same scores.
seed = 0

# Keep only reviews whose raw text is at most `max_length` characters long.
# The filter is applied to the raw 'text' column, not to 'text_pp'.
cc = df['text'].str.len()
sample_df = df[cc <= max_length].copy().reset_index(drop=True)

# Features are the preprocessed texts; targets are the review scores.
X, y = sample_df['text_pp'], sample_df['score'].to_numpy()

# The feature step concatenates two representations of each text: the output
# of BertTransformer and unigram TF-IDF weights over stemmed tokens.
# `BertTransformer` is not imported in this cell; it relies on the import made
# in an earlier cell.
# NOTE(review): `seed` presumably controls the fold split inside
# get_cross_validation_report — confirm in src.utils.reporting.
weighted_f1, report_df, confusion_df = get_cross_validation_report(
    X, y,
    model_factory=lambda: Pipeline([
        ('transform', FeatureUnion([
            # use_attention_mask is left at BertTransformer's default here.
            # NOTE(review): confirm which value that default is.
            ('bert', BertTransformer(tokenizer, bert_model, max_length=max_length)),
            ('tfidf', TfidfVectorizer(ngram_range=(1, 1), tokenizer=lambda row: tokenize(row, stem=True))),
        ])),
        # SMOTE oversamples with randomly generated synthetic points; without a
        # fixed random_state the results differ from run to run.
        ('smote', SMOTE(random_state=seed)),
        ('svc', SVC()),
    ]),
    seed=seed
)

# Show the overall weighted F1, then the per-class report and confusion matrix.
print(weighted_f1)
display(report_df)
display(confusion_df)

  blocks = np.asarray(blocks, dtype='object')
  blocks = np.asarray(blocks, dtype='object')
  blocks = np.asarray(blocks, dtype='object')
  blocks = np.asarray(blocks, dtype='object')
  blocks = np.asarray(blocks, dtype='object')
  blocks = np.asarray(blocks, dtype='object')
  blocks = np.asarray(blocks, dtype='object')
  blocks = np.asarray(blocks, dtype='object')
  blocks = np.asarray(blocks, dtype='object')
  blocks = np.asarray(blocks, dtype='object')
100%|██████████| 5/5 [05:50<00:00, 70.14s/it]

0.4607





Unnamed: 0,precision,recall,f1,support
1.0,0.545455,0.576159,0.560386,302.0
2.0,0.34902,0.35743,0.353175,249.0
3.0,0.32967,0.391304,0.357853,230.0
4.0,0.364017,0.291946,0.324022,298.0
5.0,0.622396,0.611253,0.616774,391.0


Unnamed: 0,Pred 1.0,Pred 2.0,Pred 3.0,Pred 4.0,Pred 5.0
True 1.0,174,65,37,13,13
True 2.0,59,89,67,21,13
True 3.0,39,56,90,32,13
True 4.0,17,30,58,87,106
True 5.0,30,15,21,86,239
