In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from src.utils.text_preprocessing import preprocess_text, tokenize
from src.utils.reporting import get_cross_validation_report
from src.utils.classification_analysis import *
import warnings
from tqdm import tqdm
# Register tqdm's `progress_apply` on pandas objects so the preprocessing
# pass below reports its progress.
tqdm.pandas()

# Raw review excerpt; the `text` column is the input to preprocessing.
reviews_csv = 'data/reviews_excerpt.csv'
df = pd.read_csv(reviews_csv)

# Warnings raised while preprocessing are silenced for the duration of this
# block only; the previous warning filters are restored on exit.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    preprocessed_text = df['text'].progress_apply(preprocess_text)
    df['text_pp'] = preprocessed_text

100%|██████████| 12230/12230 [00:02<00:00, 5902.65it/s]


In [3]:
# Column with the target labels, and column with the preprocessed text that
# `perform_model_cv` reads as model input.
label_column = 'score'
entry_column = 'text_pp'

# Derive a separate frame whose labels are strings; `df` itself is untouched.
frame = df.assign(**{label_column: df[label_column].astype(str)})

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

def perform_model_cv(dataframe, seed=0):
    """Cross-validate a TF-IDF -> SMOTE -> Multinomial Naive Bayes pipeline.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns named by the notebook-level ``entry_column``
        (input texts) and ``label_column`` (targets).
    seed : int, default 0
        Single source of randomness: used as SMOTE's ``random_state`` and
        forwarded as ``seed`` to ``get_cross_validation_report``. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    tuple
        The three values returned by ``get_cross_validation_report``, in the
        order it returns them (F1 score, classification report, confusion
        report).
    """
    # These are raw (preprocessed) texts, not vectors: vectorisation happens
    # inside the pipeline.
    texts = dataframe[entry_column].to_numpy()
    labels = dataframe[label_column].to_numpy()

    def build_pipeline():
        # Returns a new, unfitted pipeline on every call. The imblearn
        # Pipeline is required because SMOTE is a sampler, not a transformer.
        return Pipeline([
            ('tfidf', TfidfVectorizer(ngram_range=(1, 1), tokenizer=lambda row: tokenize(row, stem=True))),
            ('smote', SMOTE(random_state=seed)),
            ('mnb', MultinomialNB()),
        ])

    f1, report, confusion = get_cross_validation_report(
        texts, labels,
        model_factory=build_pipeline,
        seed=seed,
    )

    return f1, report, confusion

In [6]:
path = 'data/reports/label_filtering_iterations/'

# Iterate on a separate reference instead of shrinking `frame` itself, so this
# cell gives the same result when re-run. Previously `frame` ended the loop
# with a single label, so a second run skipped the loop and overwrote
# results.xlsx with an empty table.
remaining = frame

results = []
current_labels = remaining[label_column].unique().tolist()
while len(current_labels) > 1:
    weighted_f1, report_df, confusion_df = perform_model_cv(remaining)

    # The first row of the label-error table is taken as the label to drop.
    label_errors_df = get_label_errors(confusion_df, report_df)
    worst_label = label_errors_df.iloc[0]['label']

    # Guard against a filter that removes nothing (e.g. `worst_label` has a
    # different type than the stringified labels): the loop would never end.
    filtered = remaining[remaining[label_column] != worst_label]
    if len(filtered) == len(remaining):
        raise ValueError(
            f'worst label {worst_label!r} matches no row in column '
            f'{label_column!r}; cannot reduce the label set further'
        )

    results.append({
        'labels': current_labels,
        'label_count': len(current_labels),
        'weighted_f1': weighted_f1,
        'worst_label': worst_label,
    })

    # Reports are numbered by iteration (1-based) and describe the label set
    # that was evaluated in this iteration, before the worst label is removed.
    save_reports(
        f'tf_idf__smote__mnb__label_iteration_{len(results)}',
        report_df, confusion_df, weighted_f1,
        default_path=path,
        additional_frames={
            'current_labels': pd.DataFrame({'labels': current_labels}),
            'label_errors': label_errors_df,
        }
    )

    remaining = filtered
    current_labels = remaining[label_column].unique().tolist()


results_df = pd.DataFrame(results)
# `path` already ends with a slash; normalise to avoid a double separator.
results_df.to_excel(f"{path.rstrip('/')}/results.xlsx", index=False)
results_df.sort_values(by='weighted_f1', ascending=False)

100%|██████████| 5/5 [01:01<00:00, 12.28s/it]
100%|██████████| 5/5 [00:00<?, ?it/s]
100%|██████████| 5/5 [00:48<00:00,  9.76s/it]
100%|██████████| 4/4 [00:00<00:00, 3982.25it/s]
100%|██████████| 5/5 [00:37<00:00,  7.45s/it]
100%|██████████| 3/3 [00:00<00:00, 5838.94it/s]
100%|██████████| 5/5 [00:22<00:00,  4.48s/it]
100%|██████████| 2/2 [00:00<?, ?it/s]


Unnamed: 0,labels,label_count,weighted_f1,worst_label
3,"[1, 5]",2,0.8778,1
2,"[1, 3, 5]",3,0.7012,3
1,"[1, 3, 4, 5]",4,0.5454,4
0,"[1, 2, 3, 4, 5]",5,0.4573,2
