In [1]:
import numpy as np
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from src.utils.text_preprocessing import preprocess_text, tokenize
from src.utils.reporting import get_cross_validation_report
from src.utils.vector_space_analysis import *
import warnings
from tqdm import tqdm
tqdm.pandas()

# Read the review excerpt; later cells use its 'text_pp' and 'score' columns.
df = pd.read_csv('data/reviews_excerpt.csv')

# Run the project's preprocess_text over every review (tqdm progress bar via
# progress_apply) and store the result in a new 'text_pp' column.
with warnings.catch_warnings():
    # Every warning raised while preprocessing is silenced; the filter is
    # restored when the with-block exits.
    warnings.simplefilter(action="ignore")
    df['text_pp'] = df['text'].progress_apply(preprocess_text)

100%|██████████| 12230/12230 [00:02<00:00, 5325.82it/s]


In [2]:
from src.utils.embeddings import *

# Load the GloVe 42B/300d vectors from disk with the project's load_embeddings helper.
# NOTE(review): the progress bar recorded under this cell shows ~1.9M iterations
# taking about 2.5 minutes (presumably emitted by load_embeddings — confirm), so
# re-running this cell is expensive; consider caching the loaded embeddings.
glove_embeddings = load_embeddings('embedding_vectors/glove.42B.300d.txt')

def glove_vectorization(input_array):
    """Vectorize every row of ``input_array`` against the loaded GloVe embeddings.

    Each row is handed to ``average_vectorizations`` together with the
    module-level ``glove_embeddings``; the per-row results are collected in
    input order and converted to a single numpy array.

    Args:
        input_array: Iterable of rows to vectorize (presumably preprocessed
            review texts — confirm against callers).

    Returns:
        numpy.ndarray built from the per-row results.
    """
    row_vectors = []
    for row in input_array:
        row_vectors.append(average_vectorizations(row, glove_embeddings))
    return np.array(row_vectors)

100%|██████████| 1917494/1917494 [02:35<00:00, 12368.05it/s]


# Simple embedding usage

In [38]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.preprocessing import FunctionTransformer
from IPython.display import display

# Baseline: glove_vectorization features -> SMOTE oversampling -> SVC,
# evaluated with the project's cross-validation report helper.
X = df['text_pp']
y = df['score'].to_numpy()


def build_glove_svc_pipeline():
    """Return a fresh, unfitted embedding -> SMOTE -> SVC pipeline."""
    return Pipeline(steps=[
        ('embd', FunctionTransformer(func=glove_vectorization)),
        ('smote', SMOTE(random_state=0)),
        ('svc', SVC()),
    ])


# The helper receives a zero-argument factory so it can build a new model
# whenever it needs one.
weighted_f1, report_df, confusion_df = get_cross_validation_report(
    X, y, model_factory=build_glove_svc_pipeline, seed=0,
)

print(weighted_f1)
# Per-class report first, then the confusion matrix.
display(report_df, confusion_df)

100%|██████████| 5/5 [02:01<00:00, 24.31s/it]

0.4291





Unnamed: 0,precision,recall,f1,support
1.0,0.512038,0.617334,0.559778,2446.0
2.0,0.350979,0.322567,0.336174,2446.0
3.0,0.331202,0.317661,0.32429,2446.0
4.0,0.392411,0.334015,0.360866,2446.0
5.0,0.547409,0.582993,0.564641,2446.0


Unnamed: 0,Pred 1.0,Pred 2.0,Pred 3.0,Pred 4.0,Pred 5.0
True 1.0,1510,496,214,76,150
True 2.0,693,789,561,243,160
True 3.0,370,549,777,487,263
True 4.0,196,259,568,817,606
True 5.0,180,155,226,459,1426


In [3]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import FunctionTransformer
from IPython.display import display

# Baseline: glove_vectorization features -> SMOTE oversampling -> Gaussian
# naive Bayes, evaluated with the project's cross-validation report helper.
X = df['text_pp']
y = df['score'].to_numpy()


def build_glove_gnb_pipeline():
    """Return a fresh, unfitted embedding -> SMOTE -> GaussianNB pipeline."""
    return Pipeline(steps=[
        ('embd', FunctionTransformer(func=glove_vectorization)),
        ('smote', SMOTE(random_state=0)),
        ('gnb', GaussianNB()),
    ])


# The helper receives a zero-argument factory so it can build a new model
# whenever it needs one.
weighted_f1, report_df, confusion_df = get_cross_validation_report(
    X, y, model_factory=build_glove_gnb_pipeline, seed=0,
)

print(weighted_f1)
# Per-class report first, then the confusion matrix.
display(report_df, confusion_df)

100%|██████████| 5/5 [00:08<00:00,  1.70s/it]

0.339





Unnamed: 0,precision,recall,f1,support
1.0,0.369984,0.550286,0.442472,2446.0
2.0,0.299145,0.243254,0.26832,2446.0
3.0,0.293328,0.285773,0.289501,2446.0
4.0,0.330654,0.252249,0.286178,2446.0
5.0,0.416737,0.401063,0.40875,2446.0


Unnamed: 0,Pred 1.0,Pred 2.0,Pred 3.0,Pred 4.0,Pred 5.0
True 1.0,1346,584,188,90,238
True 2.0,785,595,530,285,251
True 3.0,633,414,699,390,310
True 4.0,411,229,615,617,574
True 5.0,463,167,351,484,981


# Embedding-based cascade classification with clustering

In [3]:
from src.cluster_cascade_classifier import ClusterCascadeClassifier
from src.utils.vector_space_analysis import clusterize_by_vectors, clusterize_by_distance
from sklearn.preprocessing import FunctionTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from IPython.display import display

# Cascade experiment: ClusterCascadeClassifier is configured with a clustering
# function over GloVe vectors plus one classifier factory per stage. Both
# stages use the same embedding -> SMOTE -> SVC pipeline.
X, y = df['text_pp'], df['score'].to_numpy()


def make_glove_svc_stage():
    """Return a fresh, unfitted embedding -> SMOTE -> SVC pipeline.

    Used as the factory for both the cluster classifier and the label
    classifier; every call builds a new, independent Pipeline.
    """
    return Pipeline([
        ('embd', FunctionTransformer(func=glove_vectorization)),
        ('smote', SMOTE(random_state=0)),
        # This step was labelled 'mnb' in the cluster-classifier pipeline even
        # though it holds an SVC; the label now matches the estimator.
        ('svc', SVC(random_state=0)),
    ])


def model_factory():
    """Build a new ClusterCascadeClassifier for one cross-validation run.

    Returns:
        ClusterCascadeClassifier that clusters with ``clusterize_by_vectors``
        (``cluster_count=2``) and uses ``glove_vectorization`` as its
        vectorizer.
    """
    return ClusterCascadeClassifier(
        clustering_func=lambda x: clusterize_by_vectors(x, cluster_count=2),
        cluster_classifier_factory=make_glove_svc_stage,
        label_classifier_factory=make_glove_svc_stage,
        vectorize_func=glove_vectorization,
    )


weighted_f1, report_df, confusion_df = get_cross_validation_report(
    X, y,
    model_factory=model_factory,
    seed=0
)
print(weighted_f1)
display(report_df)
display(confusion_df)

100%|██████████| 5/5 [03:00<00:00, 36.10s/it]

0.4305





Unnamed: 0,precision,recall,f1,support
1.0,0.519436,0.617334,0.56417,2446.0
2.0,0.350292,0.343418,0.346821,2446.0
3.0,0.33204,0.24489,0.281882,2446.0
4.0,0.38342,0.393295,0.388295,2446.0
5.0,0.553216,0.59076,0.571372,2446.0


Unnamed: 0,Pred 1.0,Pred 2.0,Pred 3.0,Pred 4.0,Pred 5.0
True 1.0,1510,520,171,109,136
True 2.0,689,840,454,301,162
True 3.0,365,626,599,597,259
True 4.0,184,277,413,962,610
True 5.0,159,135,167,540,1445


# Mixed cascade classification

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.preprocessing import FunctionTransformer
from IPython.display import display

# Mixed cascade experiment: clustering works on GloVe vectors while both
# classifier stages work on TF-IDF features of the stemmed tokens.
# NOTE(review): this cell relies on ClusterCascadeClassifier and
# clusterize_by_vectors having been imported by the previous cascade cell.
# NOTE(review): MultinomialNB is imported by this cell but never used — confirm
# that SVC is the intended estimator for both stages.
X, y = df['text_pp'], df['score'].to_numpy()


def tokenize_stemmed(row):
    """Tokenize ``row`` with the project's ``tokenize`` helper, ``stem=True``."""
    return tokenize(row, stem=True)


def make_tfidf_svc_stage():
    """Return a fresh, unfitted TF-IDF -> SMOTE -> SVC pipeline.

    Used as the factory for both the cluster classifier and the label
    classifier; every call builds a new, independent Pipeline.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1, 1), tokenizer=tokenize_stemmed)),
        ('smote', SMOTE(random_state=0)),
        # This step was labelled 'mnb' in the cluster-classifier pipeline even
        # though it holds an SVC; the label now matches the estimator.
        ('svc', SVC(random_state=0)),
    ])


def model_factory():
    """Build a new ClusterCascadeClassifier for one cross-validation run."""
    return ClusterCascadeClassifier(
        # NOTE(review): the embedding-based cascade cell pins cluster_count=2;
        # here clusterize_by_vectors runs with its default — confirm intent.
        clustering_func=clusterize_by_vectors,
        cluster_classifier_factory=make_tfidf_svc_stage,
        label_classifier_factory=make_tfidf_svc_stage,
        # NOTE(review): this vectorizer is applied to a single `row`, whereas
        # the embedding-based cascade cell passes the array-level
        # glove_vectorization — verify which form ClusterCascadeClassifier expects.
        vectorize_func=lambda row: average_vectorizations(row, glove_embeddings),
    )


weighted_f1, report_df, confusion_df = get_cross_validation_report(
    X, y,
    model_factory=model_factory,
    seed=0
)
print(weighted_f1)
display(report_df)
display(confusion_df)

 60%|██████    | 3/5 [05:05<03:28, 104.26s/it]