In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from src.utils.text_preprocessing import preprocess_text, tokenize
from src.utils.reporting import get_cross_validation_report
from src.utils.classification_analysis import *
import warnings
from tqdm import tqdm
# Register `progress_apply` on pandas objects so the preprocessing pass below
# can report its progress through tqdm.
tqdm.pandas()

# Read the raw review excerpt.
df = pd.read_csv('data/reviews_excerpt.csv')

# Preprocess every review text and keep the result next to the raw text in
# the `text_pp` column. Warnings raised during the pass are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    raw_text = df['text']
    df['text_pp'] = raw_text.progress_apply(preprocess_text)

100%|██████████| 12230/12230 [00:02<00:00, 4201.57it/s]


In [19]:
# --- Experiment configuration ---
# Column holding the model input (the preprocessed text).
entry_column = 'text_pp'
# Column holding the class labels.
label_column = 'score'
# Number of top-ranked labels taken from the label-error table per iteration.
filter_take = 1
# Report name template; the two placeholders are filled with `filter_take`
# and the iteration number when reports are saved.
model_name = 'tf_idf__smote__mnb__label_denaming_take_{}_iter_{}'
# Directory the per-iteration reports and the summary workbook are written to.
path = 'data/reports/label_denaming_iterations/'

# Working copy of the data with the labels cast to strings; `df` itself is
# left untouched.
frame = df.assign(**{label_column: df[label_column].astype(str)})

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

def perform_model_cv(dataframe):
    """Cross-validate the TF-IDF -> SMOTE -> MultinomialNB pipeline on ``dataframe``.

    The input texts and the labels are read from the columns named by the
    module-level ``entry_column`` and ``label_column`` variables.

    Args:
        dataframe: Frame containing at least those two columns.

    Returns:
        The three values produced by ``get_cross_validation_report``, as a
        tuple ``(f1, classification_report, confusion_report)``.
    """
    def stemming_tokenizer(row):
        # Tokenize a single document with stemming enabled.
        return tokenize(row, stem=True)

    def build_pipeline():
        # A fresh, unfitted pipeline is built on every call of the factory.
        steps = [
            ('tfidf', TfidfVectorizer(ngram_range=(1, 1), tokenizer=stemming_tokenizer)),
            ('smote', SMOTE(random_state=0)),
            ('mnb', MultinomialNB()),
        ]
        return Pipeline(steps)

    texts = dataframe[entry_column].to_numpy()
    targets = dataframe[label_column].to_numpy()

    cv_outcome = get_cross_validation_report(
        texts, targets,
        model_factory=build_pipeline,
        seed=0
    )
    f1_score, class_report, confusion = cv_outcome

    return f1_score, class_report, confusion

In [None]:
# Iterative label "denaming": cross-validate the model, take the first
# `filter_take` labels from the table returned by `get_label_errors`
# (presumably ranked worst-first; confirm in
# src.utils.classification_analysis), and fold them into one catch-all label.
# This repeats until only two distinct labels remain.
#
# NOTE: this cell rebinds `frame` on every iteration, so it is not idempotent.
# Re-run the cell that builds `frame` before re-running this one.
denaming_label = 'other'
# Seed for sampling the catch-all class. It matches the `seed=0` /
# `random_state=0` used for cross-validation and SMOTE, so a fresh
# Restart & Run All produces the same frames and reports.
sample_seed = 0

original_frame = frame.copy()
# Size of the largest class in the starting frame; the catch-all class is
# never allowed to grow beyond this many rows.
bad_labels_max = original_frame[label_column].value_counts().max()

results = []
bad_labels = []
while len(frame[label_column].unique()) > 2:
    current_labels = frame[label_column].unique().tolist()
    weighted_f1, report_df, confusion_df = perform_model_cv(frame)

    label_errors_df = get_label_errors(confusion_df, report_df, ignore_label=denaming_label)
    worst_labels = label_errors_df.iloc[:filter_take]['label'].to_list()

    results.append({
        'labels': current_labels,
        'label_count': len(current_labels),
        'weighted_f1': weighted_f1,
        'worst_labels': worst_labels,
    })

    # Persist this iteration's reports; `len(results)` is the 1-based
    # iteration number.
    save_reports(
        model_name.format(filter_take, len(results)),
        report_df, confusion_df, weighted_f1,
        default_path=path,
        additional_frames={
            'current_labels': pd.DataFrame({'labels': current_labels}),
            'label_errors': label_errors_df,
        }
    )

    if not worst_labels:
        # No label was selected, so `frame` would stay the same and the loop
        # would never terminate. Stop after recording this iteration.
        break

    # Rebuild the catch-all class from the ORIGINAL rows of every label folded
    # so far, then cap it at the size of the largest original class.
    bad_labels.extend(worst_labels)
    bad_labels_frame = original_frame[original_frame[label_column].isin(bad_labels)].copy()
    bad_labels_frame[label_column] = denaming_label
    bad_labels_sample = bad_labels_frame.sample(
        n=min(len(bad_labels_frame), bad_labels_max),
        random_state=sample_seed,
    )

    # Drop the newly folded labels and the previous catch-all rows, then
    # append the freshly sampled catch-all class.
    frame = frame[~frame[label_column].isin(worst_labels + [denaming_label])]
    frame = pd.concat([frame, bad_labels_sample])


# One row per iteration; written to disk and shown sorted by weighted F1.
results_df = pd.DataFrame(results)
results_df.to_excel(f'{path}/results.xlsx', index=False)
results_df.sort_values(by='weighted_f1', ascending=False)