In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from src.utils.text_preprocessing import preprocess_text, tokenize
from src.utils.reporting import get_cross_validation_report
from src.utils.classification_analysis import *
import warnings
from tqdm import tqdm
# Enable the tqdm-backed `progress_apply` that is used on the text column below.
tqdm.pandas()

# Read the review excerpt, then attach a preprocessed version of every review text.
df = pd.read_csv('data/reviews_excerpt.csv')
with warnings.catch_warnings():
    # Every warning raised while this block runs is ignored.
    warnings.simplefilter("ignore")

    df = df.assign(text_pp=df['text'].progress_apply(preprocess_text))

100%|██████████| 12230/12230 [00:01<00:00, 6529.00it/s]


In [8]:
# Experiment settings shared by the cells below.
entry_column = 'text_pp'   # column holding the model input
label_column = 'score'     # column holding the target label
filter_take = 1            # number of rows taken from the label-error ranking per iteration
model_name = 'tf_idf__smote__mnb__label_denaming_take_{}_iter_{}'
path = 'data/reports/label_denaming_iterations/'

# Build the working frame as a new object with string labels; `df` itself is not modified.
frame = df.assign(**{label_column: df[label_column].astype(str)})

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

def perform_model_cv(dataframe):
    """Cross-validate a TF-IDF -> SMOTE -> MultinomialNB pipeline on `dataframe`.

    Inputs are the values of the `entry_column` column and targets are the
    values of the `label_column` column (both names are notebook-level
    settings). A factory that builds a new pipeline on each call is handed to
    `get_cross_validation_report`, which is invoked with a fixed seed of 0.

    Returns:
        A 3-tuple, in the order yielded by `get_cross_validation_report`:
        F1 score, classification report, confusion report. The concrete
        types are defined by that helper and are not visible here.
    """

    def stemmed_tokens(row):
        # Tokenizer hook for the vectorizer, with stemming switched on.
        return tokenize(row, stem=True)

    def build_pipeline():
        steps = [
            ('tfidf', TfidfVectorizer(ngram_range=(1, 1), tokenizer=stemmed_tokens)),
            ('smote', SMOTE(random_state=0)),
            ('mnb', MultinomialNB()),
        ]
        return Pipeline(steps)

    texts = dataframe[entry_column].to_numpy()
    targets = dataframe[label_column].to_numpy()

    score, report, confusion = get_cross_validation_report(
        texts, targets,
        model_factory=build_pipeline,
        seed=0
    )

    return score, report, confusion

In [10]:
denaming_label = 'other'

# Iterative label "denaming": on every pass the model is cross-validated on the current label
# set, the first `filter_take` labels reported by `get_label_errors` are renamed to
# `denaming_label`, and the reports of that pass are saved. The loop ends once at most two
# distinct labels remain.
#
# NOTE: `frame` is rewritten in place by this loop. Rebuild `frame` before re-running this
# cell; otherwise the loop body never executes and `results` stays empty.
results = []
while len(frame[label_column].unique()) > 2:
    current_labels = frame[label_column].unique().tolist()
    weighted_f1, report_df, confusion_df = perform_model_cv(frame)

    # Presumably ranked worst-first, since the leading rows are treated as the worst labels
    # (TODO confirm against `get_label_errors`).
    label_errors_df = get_label_errors(confusion_df, report_df, ignore_label=denaming_label)
    worst_labels = label_errors_df.iloc[:filter_take]['label'].to_list()

    results.append({
        'labels': current_labels,
        'label_count': len(current_labels),
        'weighted_f1': weighted_f1,
        'worst_labels': worst_labels,
    })
    frame.loc[frame[label_column].isin(worst_labels), label_column] = denaming_label

    save_reports(
        model_name.format(filter_take, len(results)),
        report_df, confusion_df, weighted_f1,
        default_path=path,
        additional_frames={
            'current_labels': pd.DataFrame({'labels': current_labels}),
            'label_errors': label_errors_df,
        }
    )

    # Progress guard: if this pass left the label set unchanged (no worst label reported, or
    # none of them present in `frame`), the loop condition could never become false and the
    # same expensive cross-validation would repeat forever.
    if set(frame[label_column].unique()) == set(current_labels):
        warnings.warn(
            'Label denaming made no progress in this iteration; stopping the loop early.'
        )
        break


results_df = pd.DataFrame(results)
results_df.to_excel(f'{path}/results.xlsx', index=False)
results_df.sort_values(by='weighted_f1', ascending=False)

100%|██████████| 5/5 [01:18<00:00, 15.71s/it]
100%|██████████| 5/5 [00:00<00:00, 5053.38it/s]
100%|██████████| 5/5 [01:17<00:00, 15.56s/it]
100%|██████████| 5/5 [00:00<00:00, 5015.91it/s]
100%|██████████| 5/5 [01:17<00:00, 15.54s/it]
100%|██████████| 4/4 [00:00<00:00, 4009.85it/s]
100%|██████████| 5/5 [01:10<00:00, 14.20s/it]
100%|██████████| 3/3 [00:00<00:00, 2971.17it/s]


Unnamed: 0,labels,label_count,weighted_f1,worst_labels
3,"[1, other, 5]",3,0.6528,[5]
2,"[1, other, 4, 5]",4,0.5263,[4]
0,"[1, 2, 3, 4, 5]",5,0.4573,[2]
1,"[1, other, 3, 4, 5]",5,0.4573,[3]
