In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from src.utils.text_preprocessing import preprocess_text, tokenize
from src.utils.reporting import get_cross_validation_report
from src.utils.classification_analysis import *
import warnings
from tqdm import tqdm
# Register tqdm's pandas integration so Series/DataFrame gain `progress_apply`.
tqdm.pandas()

# Load the review excerpt into the notebook-wide DataFrame.
df = pd.read_csv('data/reviews_excerpt.csv')
with warnings.catch_warnings():
    # Ignore every warning raised inside this block; the previous warning
    # filters are restored when the `with` exits.
    # NOTE(review): which warnings preprocess_text emits is not visible here —
    # consider narrowing the filter once that is confirmed.
    warnings.simplefilter("ignore")

    # Add a `text_pp` column: `preprocess_text` applied to each review's `text`,
    # with a progress bar.
    df['text_pp'] = df['text'].progress_apply(preprocess_text)

100%|██████████| 12230/12230 [00:01<00:00, 6508.23it/s]


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from IPython.display import display

# Raw feature/label arrays.
# NOTE(review): nothing in this cell reads X or y — the report helper below is
# given the DataFrame and column names directly. Kept in case other cells use them.
X, y = df['text_pp'].to_numpy(), df['score'].to_numpy()

# Cross-validate a TF-IDF -> SMOTE -> multinomial naive Bayes pipeline on the
# preprocessed text ('text_pp') against the star rating ('score').
# `model_factory` is a zero-argument callable returning a new, unfitted pipeline;
# presumably the helper calls it once per fold — TODO confirm in src.utils.
weighted_f1, report_df, confusion_df, debug_df = get_debug_cross_validation_report(
    df, 'text_pp', 'score',
    model_factory=lambda: Pipeline([
        ('tfidf', TfidfVectorizer(
            ngram_range=(1, 1),
            tokenizer=lambda row: tokenize(row, stem=True),
            # A custom tokenizer replaces the default token pattern; setting the
            # pattern to None stops scikit-learn from warning on every fit that
            # `token_pattern` will not be used.
            token_pattern=None,
        )),
        # Seeded so the synthetic oversampling is reproducible across runs.
        ('smote', SMOTE(random_state=0)),
        ('mnb', MultinomialNB()),
    ]),
    seed=0
)
# Show the overall weighted F1, then the per-label metrics and confusion matrix.
print(weighted_f1)
display(report_df)
display(confusion_df)

100%|██████████| 5/5 [01:08<00:00, 13.69s/it]

0.4573





Unnamed: 0,precision,recall,f1,support
1.0,0.5573,0.58054,0.568682,2446.0
2.0,0.388811,0.340965,0.36332,2446.0
3.0,0.358504,0.415372,0.384848,2446.0
4.0,0.396526,0.438675,0.416537,2446.0
5.0,0.615423,0.502453,0.55323,2446.0


Unnamed: 0,Pred 1.0,Pred 2.0,Pred 3.0,Pred 4.0,Pred 5.0
True 1.0,1420,539,289,122,76
True 2.0,547,834,668,296,101
True 3.0,288,446,1016,549,147
True 4.0,140,193,596,1073,444
True 5.0,153,133,265,666,1229


In [11]:
# Build the report of (true_label, predicted_label) confusion pairs from the
# confusion matrix. With calculate_improvements=True the displayed table carries
# an `f1_possible_improvement` column alongside `priority`.
misclassification_df = get_misclassifications_report(confusion_df, calculate_improvements=True)
misclassification_df

100%|██████████| 5/5 [00:00<00:00, 30.02it/s]


Unnamed: 0,true_label,predicted_label,f1_possible_improvement,priority
0,2.0,3.0,0.0527,1.0
1,3.0,4.0,0.0404,0.735484
2,1.0,2.0,0.0376,0.675269
3,4.0,5.0,0.0374,0.670968
4,2.0,4.0,0.0249,0.402151
5,1.0,3.0,0.0201,0.298925
6,3.0,5.0,0.0125,0.135484
7,2.0,5.0,0.0096,0.073118
8,1.0,4.0,0.0088,0.055914
9,1.0,5.0,0.0062,0.0


In [12]:
# Same report with calculate_improvements=False: the displayed table has a raw
# `count` column instead of `f1_possible_improvement`.
# NOTE(review): this rebinds `misclassification_df`, so the report built with
# calculate_improvements=True is no longer reachable under that name.
misclassification_df = get_misclassifications_report(confusion_df, calculate_improvements=False)
misclassification_df

100%|██████████| 5/5 [00:00<00:00, 4797.88it/s]


Unnamed: 0,true_label,predicted_label,count,priority
0,2.0,3.0,668,1.0
1,3.0,4.0,549,0.798986
2,1.0,2.0,539,0.782095
3,4.0,5.0,444,0.621622
4,2.0,4.0,296,0.371622
5,1.0,3.0,289,0.359797
6,3.0,5.0,147,0.119932
7,1.0,4.0,122,0.077703
8,2.0,5.0,101,0.04223
9,1.0,5.0,76,0.0


In [20]:
# Display the per-label metrics from report_df with the extra `reliability`
# column the helper's output shows (rows appear ordered from highest to lowest).
get_label_reliability(report_df)

Unnamed: 0,precision,recall,f1,support,reliability
1.0,0.5573,0.58054,0.568682,2446.0,1.0
5.0,0.615423,0.502453,0.55323,2446.0,0.924755
4.0,0.396526,0.438675,0.416537,2446.0,0.25914
3.0,0.358504,0.415372,0.384848,2446.0,0.104834
2.0,0.388811,0.340965,0.36332,2446.0,0.0


In [5]:
def get_examples(true_label, predicted_label, df, true_label_column_name, predicted_label_column_name):
    """Select the rows of `df` that have a given (true, predicted) label pair.

    Args:
        true_label: Value to match in the true-label column.
        predicted_label: Value to match in the predicted-label column.
        df: DataFrame containing both label columns.
        true_label_column_name: Name of the column holding the true labels.
        predicted_label_column_name: Name of the column holding the predictions.

    Returns:
        The subset of `df` where both columns equal the requested labels.
    """
    has_true_label = df[true_label_column_name] == true_label
    has_predicted_label = df[predicted_label_column_name] == predicted_label
    both_match = has_true_label & has_predicted_label
    return df[both_match]

# Collect the reviews whose true score is 2 but whose predicted score is 3.
examples_df = get_examples(
    true_label=2.0,
    predicted_label=3.0,
    df=debug_df,
    true_label_column_name='score',
    predicted_label_column_name='predicted',
)

In [6]:
# Show the first matching misclassified review as a plain dict of column -> value.
# NOTE(review): .iloc[0] raises IndexError when examples_df is empty.
examples_df.iloc[0].to_dict()

{'text': "Tase is good, not as bold as I prefer. But I love strong coffee.  But the reason I cannot give 5 stars is THREE times now I have had a brewing problem that I have not had with other brands.  TWO TIMES, the coffee went all over the brewing part and not a straight shot into the cup.  Today, I noticed several grains in my cup, so rathr than waste a cup, I strained it through a filter and enjoyed my cup.  I have bought Tully's before, but nt from this seller. Wonder why the problems so many of us are reporting?",
 'summary': 'Wish I had reviews - having problems with this',
 'score': 2,
 'text_pp': 'tase good bold prefer love strong coffee reason cannot give stars three times brewing problem brands two times coffee went brewing part straight shot cup today noticed several grains cup rathr waste cup strained filter enjoyed cup bought tully s before nt seller wonder problems many us reporting',
 'predicted': 3.0}