In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from src.utils.text_preprocessing import preprocess_text, tokenize
from src.utils.reporting import get_cross_validation_report
from src.utils.vector_space_analysis import *
from src.utils.common import *
from tqdm import tqdm
# Hook tqdm into pandas; the preprocessing cells below rely on
# `Series.progress_apply`.
tqdm.pandas()

# Review excerpt used by every experiment in this notebook. The cells below
# read its 'text' and 'score' columns.
reviews_csv_path = 'data/reviews_excerpt.csv'
df = pd.read_csv(reviews_csv_path)



In [9]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, DocumentRNNEmbeddings
from flair.data import Sentence

# Token-level embeddings: the forward and backward "fast" flair language
# models, stacked together. GloVe (`WordEmbeddings('glove')`) was tried as an
# extra component and is currently left out.
forward_flair_embeddings = FlairEmbeddings('news-forward-fast')
backward_flair_embeddings = FlairEmbeddings('news-backward-fast')
stacked_embeddings = StackedEmbeddings([
    forward_flair_embeddings,
    backward_flair_embeddings,
])

# Document-level embedder built on top of the stacked token embeddings.
# NOTE(review): no training of this RNN is visible in the notebook -- confirm
# that using it with its initial weights is intended.
document_rnn_embeddings = DocumentRNNEmbeddings([stacked_embeddings])

def rnn_vectorization(input_array, batch_size=32):
    """Turn texts into document vectors using ``document_rnn_embeddings``.

    Parameters
    ----------
    input_array : iterable of str
        Texts to embed. Each item is wrapped in a flair ``Sentence``.
    batch_size : int, default 32
        How many sentences are passed to ``embed`` per call.

    Returns
    -------
    numpy.ndarray
        The per-text embeddings stacked with ``np.array``, in input order
        (2-D when every embedding has the same length). An empty input gives
        an empty array.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    sentences = [Sentence(text) for text in input_array]

    # Embed in mini-batches instead of one call per sentence: `embed` accepts
    # a list, and batching avoids paying the per-call overhead for every row.
    for start in range(0, len(sentences), batch_size):
        document_rnn_embeddings.embed(sentences[start:start + batch_size])

    # `.numpy()` only works on CPU tensors, so move the embedding to host
    # memory first; without `.cpu()` this fails when flair runs on a GPU.
    return np.array([
        sentence.get_embedding().detach().cpu().numpy()
        for sentence in sentences
    ])

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.preprocessing import FunctionTransformer
from IPython.display import display

def _build_rnn_svc_pipeline():
    """Return a fresh, unfitted pipeline: RNN embeddings -> SMOTE -> SVC."""
    steps = [
        ('embd', FunctionTransformer(func=rnn_vectorization)),
        ('smote', SMOTE(random_state=0)),
        ('svc', SVC()),
    ]
    return Pipeline(steps)


# Preprocess every review while keeping stop words, with a progress bar.
# NOTE(review): `ignore_warnings` is assumed to call the function it is given
# and return its result -- confirm against its definition.
df['text_pp'] = ignore_warnings(
    lambda: df['text'].progress_apply(
        lambda review_text: preprocess_text(review_text, removing_stopwords=False)
    )
)
X = df['text_pp']
y = df['score'].to_numpy()

# Cross-validate using a new pipeline from the factory; results are unpacked
# into the weighted F1 value, the report frame and the confusion frame.
cv_results = get_cross_validation_report(
    X,
    y,
    model_factory=_build_rnn_svc_pipeline,
    seed=0,
)
weighted_f1, report_df, confusion_df = cv_results

print(weighted_f1)
display(report_df)
display(confusion_df)

In [None]:
from flair.embeddings import TransformerDocumentEmbeddings

# Document-level embedder created from the 'roberta-base' checkpoint name.
roberta_model_name = 'roberta-base'
roberta_document_embeddings = TransformerDocumentEmbeddings(roberta_model_name)

def roberta_transformer_vectorization(input_array, batch_size=32):
    """Turn texts into document vectors using ``roberta_document_embeddings``.

    Parameters
    ----------
    input_array : iterable of str
        Texts to embed. Each item is wrapped in a flair ``Sentence``.
    batch_size : int, default 32
        How many sentences are passed to ``embed`` per call.

    Returns
    -------
    numpy.ndarray
        The per-text embeddings stacked with ``np.array``, in input order
        (2-D when every embedding has the same length). An empty input gives
        an empty array.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    sentences = [Sentence(text) for text in input_array]

    # Feed the transformer mini-batches rather than single sentences: `embed`
    # accepts a list, and one forward pass per row is needlessly slow.
    for start in range(0, len(sentences), batch_size):
        roberta_document_embeddings.embed(sentences[start:start + batch_size])

    # `.numpy()` only works on CPU tensors, so move the embedding to host
    # memory first; without `.cpu()` this fails when flair runs on a GPU.
    return np.array([
        sentence.get_embedding().detach().cpu().numpy()
        for sentence in sentences
    ])

In [19]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.preprocessing import FunctionTransformer
from IPython.display import display

def _build_roberta_svc_pipeline():
    """Return a fresh, unfitted pipeline: RoBERTa embeddings -> SMOTE -> SVC."""
    steps = [
        ('embd', FunctionTransformer(func=roberta_transformer_vectorization)),
        ('smote', SMOTE(random_state=0)),
        ('svc', SVC()),
    ]
    return Pipeline(steps)


# Preprocess every review while keeping stop words, with a progress bar.
# NOTE(review): `ignore_warnings` is assumed to call the function it is given
# and return its result -- confirm against its definition.
df['text_pp'] = ignore_warnings(
    lambda: df['text'].progress_apply(
        lambda review_text: preprocess_text(review_text, removing_stopwords=False)
    )
)
X = df['text_pp']
y = df['score'].to_numpy()

# Cross-validate using a new pipeline from the factory; results are unpacked
# into the weighted F1 value, the report frame and the confusion frame.
cv_results = get_cross_validation_report(
    X,
    y,
    model_factory=_build_roberta_svc_pipeline,
    seed=0,
)
weighted_f1, report_df, confusion_df = cv_results

print(weighted_f1)
display(report_df)
display(confusion_df)

  0%|          | 0/5 [41:41<?, ?it/s]

KeyboardInterrupt

