# Movie Review Binary Sentiment Classification

The movie review data used in this notebook comes from http://www.cs.cornell.edu/people/pabo/movie-review-data/ and originates from the sentiment analysis work of Pang and Lee.

In [None]:
# necessary imports
from os.path import isfile
import matplotlib.pyplot as plt

from toal.stores import BasicStore
from toal.stores.loaders import load_split_multi_label_csv
from toal.learners import BinaryLearner
from toal.samplers import BinaryUncertaintySampler, RandomSampler, BinaryDensitySampler
from toal.evaluators import F1Evaluator
from toal.annotators import SimulatedAnnotator


In [None]:
def download_movie_review_data():
    """Placeholder for fetching the movie review data.

    The download is not implemented yet: the function only reports that on
    standard output and returns ``None``.
    """
    # TODO: implement the actual download
    print("Not implemented yet!")


In [None]:
# Only trigger the download when the CSV is not already present on disk.
csv_missing = not isfile('../data/mr.csv')
if csv_missing:
    download_movie_review_data()


In [None]:
# BasicStore() is created with its default encoders; per the original note this
# is the same as writing
# BasicStore(extractor=TfIdfExtractor(), encoder=GenericLabelBinarizer()).
store = BasicStore()

# Load the shuffled CSV and hand the labeled / unlabeled frames to the store.
labeled_df, unlabeled_df, unlabeled_map = load_split_multi_label_csv(
    '../data/mr.csv',
    shuffle=True,
)
store.append_data(labeled_df, unlabeled_df)

# Learner, the uncertainty sampler built around it, and an annotator that is
# constructed from the map returned by the loader.
learner = BinaryLearner()
sampler = BinaryUncertaintySampler(learner)
annotator = SimulatedAnnotator(unlabeled_map)


In [None]:
# Train the initial model on the data that is labeled so far (20% of the data,
# per the original note) and score it on the store's test split.
model = learner.train(store)

f1evaluator = F1Evaluator()
f1 = f1evaluator.evaluate(model, *store.test_XYs)

# Learning-curve series: one F1 score and one training-set size per round.
f1s = [f1]
# The learner is a BinaryLearner, so the message says "Binary" rather than the
# copy-pasted "Multiclass".
print(f"Binary learner F1 evaluation score is {f1}")
train_inst_count = [store.train_XYs[0].shape[0]]  # rows in X

In [None]:
# Active-learning loop: keep going until no unlabeled rows are left in the store.
while not store.unlabeled_df.empty:
    # pick the next batch and have it annotated
    unlabeled_selection = sampler.choose_instances(store, batch_size=50)
    annotated_df = annotator.annotate(unlabeled_selection, store.available_labels)
    store.update_with_annotation(annotated_df)

    # retrain with the newly labeled data
    model = learner.train(store)

    # evaluate on the test split and extend the learning-curve series
    f1 = f1evaluator.evaluate(model, *store.test_XYs)
    f1s.append(f1)
    train_inst_count.append(store.train_XYs[0].shape[0])  # rows in X
    # "Binary", not "Multiclass": the model comes from a BinaryLearner
    print(f"Binary learner F1 evaluation score is {f1}")


In [None]:
# Baseline for comparison: the same procedure, with batches drawn by RandomSampler.
# NOTE(review): this cell rebinds the notebook-level names `learner`, `sampler`,
# `annotator`, `model`, `f1evaluator` and `f1`; any cell executed afterwards
# sees these objects instead of the earlier ones.
# NOTE(review): the CSV is loaded again with shuffle=True - confirm whether the
# resulting split is the same one the uncertainty-sampling run used.
random_store = BasicStore()
labeled_df, unlabeled_df, unlabeled_map = load_split_multi_label_csv(
    '../data/mr.csv',
    shuffle=True,
)
random_store.append_data(labeled_df, unlabeled_df)

learner = BinaryLearner()
sampler = RandomSampler()
annotator = SimulatedAnnotator(unlabeled_map)

# initial model, scored before any extra annotation
model = learner.train(random_store)
f1evaluator = F1Evaluator()
f1 = f1evaluator.evaluate(model, *random_store.test_XYs)
random_f1s = [f1]

while not random_store.unlabeled_df.empty:
    # draw a batch and have it annotated
    unlabeled_selection = sampler.choose_instances(random_store, batch_size=50)
    annotated_df = annotator.annotate(
        unlabeled_selection,
        random_store.available_labels,
    )
    random_store.update_with_annotation(annotated_df)

    # retrain with the newly labeled data
    model = learner.train(random_store)

    # evaluate
    f1 = f1evaluator.evaluate(model, *random_store.test_XYs)
    random_f1s.append(f1)


In [None]:
# Learning curves for both runs. Both series are drawn against the
# training-set sizes recorded in train_inst_count.
for series, fmt, label in (
    (random_f1s, 'g-', "Random sampling"),
    (f1s, 'r-', "Uncertainty sampling"),
):
    plt.plot(train_inst_count, series, fmt, label=label)

plt.xlabel('Training Size')
plt.ylabel("F1 (macro)")
plt.legend(loc='best')
plt.grid(True)
plt.show()

In [None]:
def test_sampler(sampler, batch_size, *, learner=None, data_path='../data/mr.csv'):
    """Run a full active-learning simulation with ``sampler`` and return its F1 curve.

    A new ``BasicStore`` is filled from ``data_path``; a model is trained on it
    and scored on ``store.test_XYs``. Then, until ``store.unlabeled_df`` is
    empty, a batch chosen by ``sampler`` is annotated, added to the store, and
    the model is retrained and scored again.

    Args:
        sampler: object providing ``choose_instances(store, batch_size=...)``.
        batch_size: number of instances requested from the sampler per round.
        learner: learner to train in every round. Pass the learner the sampler
            was constructed with so that the sampler and this loop work on the
            same object. When omitted, a new ``BinaryLearner`` is created
            (the original behaviour).
        data_path: CSV file handed to ``load_split_multi_label_csv``.

    Returns:
        list: F1 scores, the first one for the initial model and one more
        for every annotated batch.
    """
    store = BasicStore()
    labeled_df, unlabeled_df, unlabeled_map = load_split_multi_label_csv(data_path, shuffle=True)
    store.append_data(labeled_df, unlabeled_df)

    if learner is None:
        learner = BinaryLearner()
    annotator = SimulatedAnnotator(unlabeled_map)
    f1evaluator = F1Evaluator()

    # score the model trained on the initially labeled data
    model = learner.train(store)
    f1s = [f1evaluator.evaluate(model, *store.test_XYs)]

    while not store.unlabeled_df.empty:
        # sample and annotate new data
        unlabeled_selection = sampler.choose_instances(store, batch_size=batch_size)
        annotated_df = annotator.annotate(unlabeled_selection, store.available_labels)
        store.update_with_annotation(annotated_df)

        # retrain with newly labeled data, then evaluate
        model = learner.train(store)
        f1s.append(f1evaluator.evaluate(model, *store.test_XYs))

    return f1s

In [None]:
# Run the simulation once per additional sampling strategy, 50 instances per batch.
# NOTE(review): every sampler below is built around the notebook-level `learner`,
# while test_sampler() trains its own BinaryLearner by default - confirm that the
# samplers are meant to consult a learner that is not the one being retrained.
lc_f1s = test_sampler(
    BinaryUncertaintySampler(learner, strategy='lc'),
    batch_size=50,
)
ms_f1s = test_sampler(
    BinaryUncertaintySampler(learner, strategy='ms'),
    batch_size=50,
)
dw_f1s = test_sampler(
    BinaryDensitySampler(learner, strategy='ent', beta=1),
    batch_size=50,
)
dw2_f1s = test_sampler(
    BinaryDensitySampler(learner, strategy='ent', beta=2),
    batch_size=50,
)

In [None]:
# Learning curves for every strategy. All series are drawn against the
# training-set sizes recorded in train_inst_count.
for series, fmt, label in (
    (random_f1s, 'g-', "Random sampling"),
    (f1s, 'c:', "Uncertainty sampling (ent)"),
    (lc_f1s, 'r-', "Uncertainty sampling (lc)"),
    (ms_f1s, 'm-.', "Uncertainty sampling (ms)"),
    (dw_f1s, 'y-.', "Density weighted"),
    (dw2_f1s, 'b-.', "Density weighted (2)"),
):
    plt.plot(train_inst_count, series, fmt, label=label)

plt.xlabel('Training Size')
plt.ylabel("F1 (macro)")
plt.legend(loc='best')
plt.grid(True)
plt.show()