In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from src.utils.text_preprocessing import preprocess_text, tokenize
from src.utils.reporting import get_cross_validation_report
import warnings
from tqdm import tqdm
tqdm.pandas()

df = pd.read_csv('data/reviews_excerpt.csv')
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    df['text_pp'] = df['text'].progress_apply(preprocess_text)

100%|██████████| 12230/12230 [00:03<00:00, 3572.72it/s]


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from IPython.display import display

count_vectorizer = CountVectorizer(ngram_range=(1, 1), tokenizer=lambda row: tokenize(row, stem=True))
X, y = count_vectorizer.fit_transform(df['text_pp']), df['score'].to_numpy()

weighted_f1, report_df, confusion_df = get_cross_validation_report(X, y, model_factory=lambda: MultinomialNB(), seed=0)
print(weighted_f1)
display(report_df)
display(confusion_df)

100%|██████████| 5/5 [00:00<00:00, 106.78it/s]

0.4474





Unnamed: 0,precision,recall,f1,support
1.0,0.531591,0.557236,0.544112,2446.0
2.0,0.38296,0.363859,0.373166,2446.0
3.0,0.355881,0.387163,0.370864,2446.0
4.0,0.390625,0.388389,0.389504,2446.0
5.0,0.583815,0.536795,0.559318,2446.0


Unnamed: 0,Pred 1.0,Pred 2.0,Pred 3.0,Pred 4.0,Pred 5.0
True 1.0,1363,612,265,106,100
True 2.0,529,890,614,284,129
True 3.0,319,473,947,526,181
True 4.0,173,221,576,950,526
True 5.0,180,128,259,566,1313


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from IPython.display import display

X, y = df['text_pp'].to_numpy(), df['score'].to_numpy()

weighted_f1, report_df, confusion_df = get_cross_validation_report(
    X, y,
    model_factory=lambda: Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1, 1), tokenizer=lambda row: tokenize(row, stem=True))),
        ('smote', SMOTE(random_state=0)),
        ('mnb', MultinomialNB()),
    ]),
    seed=0
)
print(weighted_f1)
display(report_df)
display(confusion_df)

100%|██████████| 5/5 [01:19<00:00, 15.83s/it]

0.4573





Unnamed: 0,precision,recall,f1,support
1.0,0.5573,0.58054,0.568682,2446.0
2.0,0.388811,0.340965,0.36332,2446.0
3.0,0.358504,0.415372,0.384848,2446.0
4.0,0.396526,0.438675,0.416537,2446.0
5.0,0.615423,0.502453,0.55323,2446.0


Unnamed: 0,Pred 1.0,Pred 2.0,Pred 3.0,Pred 4.0,Pred 5.0
True 1.0,1420,539,289,122,76
True 2.0,547,834,668,296,101
True 3.0,288,446,1016,549,147
True 4.0,140,193,596,1073,444
True 5.0,153,133,265,666,1229


In [14]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC

X, y = df['text_pp'], df['score'].to_numpy()

weighted_f1, report_df, confusion_df = get_cross_validation_report(
    X, y,
    model_factory=lambda: Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1, 1), tokenizer=lambda row: tokenize(row, stem=True))),
        ('smote', SMOTE(random_state=0)),
        ('svc', SVC(random_state=0)),
    ]),
    seed=0
)
print(weighted_f1)
display(report_df)
display(confusion_df)

100%|██████████| 5/5 [06:11<00:00, 74.24s/it]

0.4993





Unnamed: 0,precision,recall,f1,support
1.0,0.591262,0.652903,0.620556,2446.0
2.0,0.426496,0.408013,0.41705,2446.0
3.0,0.416024,0.396975,0.406276,2446.0
4.0,0.450698,0.40924,0.428969,2446.0
5.0,0.601367,0.647588,0.623622,2446.0


Unnamed: 0,Pred 1.0,Pred 2.0,Pred 3.0,Pred 4.0,Pred 5.0
True 1.0,1597,472,177,76,124
True 2.0,569,998,528,220,131
True 3.0,275,512,971,489,199
True 4.0,135,220,494,1001,596
True 5.0,125,138,164,435,1584


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from IPython.display import display

X, y = df['text_pp'].to_numpy(), df['score'].to_numpy()

weighted_f1, report_df, confusion_df = get_cross_validation_report(
    X, y,
    model_factory=lambda: Pipeline([
        ('tfidf', TfidfVectorizer(ngram_range=(1, 1), tokenizer=lambda row: tokenize(row, stem=True))),
        ('smote', SMOTE(random_state=0)),
        ('vote', VotingClassifier(
                estimators=[
                    ('mnb', MultinomialNB()),
                    ('svc', SVC(random_state=0)),
                    ('rfc', RandomForestClassifier(random_state=0))
                ],
                voting='hard'
            )
        ),
    ]),
    seed=0
)
print(weighted_f1)
display(report_df)
display(confusion_df)

100%|██████████| 5/5 [07:41<00:00, 92.34s/it]

0.4925





Unnamed: 0,precision,recall,f1,support
1.0,0.540341,0.687244,0.605003,2446.0
2.0,0.423248,0.390025,0.405957,2446.0
3.0,0.415545,0.391251,0.403032,2446.0
4.0,0.457556,0.401063,0.427451,2446.0
5.0,0.624483,0.617334,0.620888,2446.0


Unnamed: 0,Pred 1.0,Pred 2.0,Pred 3.0,Pred 4.0,Pred 5.0
True 1.0,1681,426,171,72,96
True 2.0,672,954,501,202,117
True 3.0,384,496,957,453,156
True 4.0,197,235,494,981,539
True 5.0,177,143,180,436,1510


# Best model so far - pipeline with TF-IDF, SMOTE, SVC

In [None]:
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV

import numpy as np

model = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1), tokenizer=lambda row: tokenize(row, stem=True))),
    ('smote', SMOTE(random_state=0)),
    ('svc', SVC(random_state=0))
])

params = {
    'svc__C': list(np.logspace(-3, 0, 4)) + [2, 5, 10],
    'svc__gamma': [0.001, 0.01, 0.1, 1, 2, 3, 5]
}
X, y = df['text_pp'], df['score'].to_numpy()

halving_gscv = HalvingGridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='f1_weighted',
    verbose=1
)
halving_gscv.fit(X, y)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 452
max_resources_: 12230
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 49
n_resources: 452
Fitting 5 folds for each of 49 candidates, totalling 245 fits
----------
iter: 1
n_candidates: 17
n_resources: 1356
Fitting 5 folds for each of 17 candidates, totalling 85 fits
----------
iter: 2
n_candidates: 6
n_resources: 4068
Fitting 5 folds for each of 6 candidates, totalling 30 fits
