In [200]:
import pandas as pd
import numpy as np
import os
import spacy
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

In [2]:
# Read the dataset
data = pd.read_csv('../data/reviews.tsv', delimiter='\t', encoding='latin-1')

In [3]:
print('Number of rows in dataset: ', data.shape[0])
print('Number of columns in dataset: ', data.shape[1])

Number of rows in dataset:  54432
Number of columns in dataset:  8


In [4]:
data.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


In [5]:
# Subset the dataset on the columns "review" and "rating"
data = data[['review', 'rating']]

In [6]:
# Check for missing values and compute the percentage of missing values per column
data.isnull().mean() * 100

review    10.220091
rating    24.832819
dtype: float64

In [7]:
# Drop the rows where at least on element is missing
data = data.dropna()

In [8]:
print('Final data shape: ', data.shape)

Final data shape:  (35379, 2)


In [9]:
# Check the frequencies of the target
frequencies = data['rating'].value_counts()
frequencies

3/4       3324
3/5       3169
4/5       2984
2/4       2483
2/5       2246
2.5/4     2201
3.5/4     1688
3.5/5     1202
B         1077
1.5/4     1031
5/5       1022
4/4        942
2.5/5      918
1/5        797
B+         781
B-         756
1/4        744
C          708
C+         616
4.5/5      514
A-         479
7/10       454
8/10       453
C-         451
6/10       399
A          379
1.5/5      357
5/10       309
D          283
9/10       279
          ... 
1.7          1
3.3          1
9.2          1
4.0/10       1
0.5/10       1
F+           1
7.7          1
7.4          1
2.3/5        1
1.6/5        1
6.2          1
7.1          1
7.3/10       1
4.1/10       1
2.7/5        1
4.3/10       1
2.6/6        1
2.6/5        1
3/2          1
2.2/5        1
0/6          1
7.8          1
3.0/10       1
4.2/5        1
4.1          1
2.3/4        1
7.9          1
8.5          1
1/2          1
9.7          1
Name: rating, Length: 185, dtype: int64

We can see that there are many different grading scales in the target like the following:
- 3/4, 2.5/4 ...
- 4/5, 3/5 ...
- 0/6, 2.6/6 ...
- A, B+, C- F- ...
- 4.9, 7.4, 9.2, 3 1/2 ...

So we have to standardize the grading system of the target. Based on this website http://emanuellevy.com/comment/how-we-grade-movies-from-a-to-f/ we are going to convert the output to this grading scale:
- A (10)
- A- (9)
- B+ (8)
- B (7)
- B- (6)
- C+ (5)
- C (4)
- C- (3)
- D+ (2)
- D (1)
- F (0)

So we will have 11 classes to predict and so our machine learning problem becomes a multi-class classification problem. 

In [14]:
rating_system = {0: 'F', 1: 'D', 2: 'D+', 3: 'C-', 4: 'C', 5: 'C+', 6: 'B-', 7: 'B', 8: 'B+', 9: 'A-', 10: 'A'}

def convert_grading_system(rating):
    """
    Standardize the ratings --> Create 11 classes
    e.g 3 1/2 --> 3.5 --> 4 --> C
        7.9 --> 8 --> B+
        4.3/5 --> 8.6 --> 9 --> A-
        A+ --> A
        D- --> D
        F-, F+, T --> F
    """
    # catch the cases 3 1/2 ...
    if ' ' in rating:
        rating = rating.split()
        final_r = 0
        for r in rating:
            if '/' in r:
                numerator, denominator = map(float, r.split('/'))
                final_r += numerator / denominator
            else:
                final_r += float(r)
        final_r = np.round(final_r)
                
    # catch the cases 3.5/5, 4/4 ...
    elif '/' in rating:
        numerator, denominator = map(float, rating.split('/'))
        # there is a rating 5.5/5
        if numerator > denominator:
            numerator = denominator
        final_r = np.round(numerator / denominator * 10)
    
    elif re.match('\d-\d', rating):
        numerator, denominator = map(float, rating.split('-'))
        final_r = np.round(numerator / denominator * 10)
    
    elif re.match('[A-Z]', rating):
        if rating == 'A+':
            rating = 'A'
        elif rating == 'D-':
            rating = 'D'
        elif rating == 'F-' or rating == 'F+' or rating == 'T' # check the shape of Train/Valid/Test sets
print('X_train shape: ', X_train.shape)
print('y_train shape: ', y_train.shape)
print('X_valid shape: ', X_valid.shape)
print('y_valid shape: ', y_valid.shape)
print('X_test shape: ', X_test.shape)
print('y_test shape: ', y_test.shape)

print('Percentage of Train set: ', X_train.shape[0] / X.shape[0] * 100)
print('Percentage of Validation set: ', X_valid.shape[0] / X.shape[0] * 100)
print('Percentage of Test set: ', X_test.shape[0] / X.shape[0] * 100)or rating == 'N' or rating == 'R':
            rating = 'F'
        return rating
    
    else:
        final_r = np.round(float(rating))
    
    return rating_system[final_r]

In [15]:
# standardize the target 
data['rating'] = data['rating'].apply(lambda x: convert_grading_system(x))
data['rating'].unique()  # 11 classes

array(['B-', 'C', 'C+', 'B', 'B+', 'A', 'D+', 'C-', 'A-', 'F', 'D'],
      dtype=object)

In [17]:
# check the frequencies of the target
data['rating'].value_counts() / data['rating'].shape[0] * 100

B+    22.151559
B-    18.895390
C+    12.433930
C     12.182368
A-     8.510698
B      8.129116
A      6.984369
D+     5.200825
C-     2.786964
D      1.806156
F      0.918624
Name: rating, dtype: float64

In [18]:
# subset reviews and the target (rating)
X = data['review']
y = data['rating']

# split into Train/Valid/Test sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, test_size=0.2)
X_valid, X_test, y_valid, y_test = train_test_split(X_valid, y_valid, stratify=y_valid, test_size=0.5)

# check the shape of Train/Valid/Test sets
print('X_train shape: ', X_train.shape)
print('y_train shape: ', y_train.shape)
print('X_valid shape: ', X_valid.shape)
print('y_valid shape: ', y_valid.shape)
print('X_test shape: ', X_test.shape)
print('y_test shape: ', y_test.shape)

print('Percentage of Train set: ', X_train.shape[0] / X.shape[0] * 100)
print('Percentage of Validation set: ', X_valid.shape[0] / X.shape[0] * 100)
print('Percentage of Test set: ', X_test.shape[0] / X.shape[0] * 100)

X_train shape:  (28303,)
y_train shape:  (28303,)
X_valid shape:  (3538,)
y_valid shape:  (3538,)
X_test shape:  (3538,)
y_test shape:  (3538,)
Percentage of Train set:  79.99943469289693
Percentage of Validation set:  10.000282653551542
Percentage of Test set:  10.000282653551542


In [19]:
# check the distribution of ratings in Train/Valid/Test sets
target_distribution = pd.concat([y.value_counts() / y.shape[0], y_train.value_counts() / y_train.shape[0], y_valid.value_counts() / y_valid.shape[0] , y_test.value_counts() / y_test.shape[0]],
                                axis=1)
target_distribution.columns = ['Overall', 'Train', 'Validation', 'Test']
target_distribution

Unnamed: 0,Overall,Train,Validation,Test
B+,0.221516,0.221496,0.221594,0.221594
B-,0.188954,0.188955,0.188807,0.18909
C+,0.124339,0.124333,0.124364,0.124364
C,0.121824,0.121825,0.12182,0.12182
A-,0.085107,0.085115,0.085076,0.085076
B,0.081291,0.081299,0.081402,0.081119
A,0.069844,0.069851,0.069813,0.069813
D+,0.052008,0.052009,0.052007,0.052007
C-,0.02787,0.027877,0.027699,0.027982
D,0.018062,0.018055,0.018089,0.018089


We did a 80/10/10 Train/Valid/Test stratified split in order to have target proportions almost identical to those in the full dataset. We did this split because we have a limited amount of data.

It is clear that there is a huge class imbalance. So we cannot use accuracy as a metric to evaluate our models. In this case we can use ROC curves or PR curves. PR curves are usually used in problems when the positive class is of great importance and more interesting than the negative class/es. In our case we don't have this requirement so we will use the ROC AUC score as our metric to evaluate our models. 

In my analysis I am going to use BERT (Bidirectional Encoder Representations for Transformers), a method released by Google in late 2018 that have achieved state-of-the-art performances on 11 NLP tasks like SQuAD v1.1, GLUE etc. The major reason that I use this method is the fact that I have a limited amount of data. So I am going to use transfer learning, a method mainly used with great success on Computer vision, but now can be used in NLP. The idea is simply to use a pre-trained network on a massive dataset and fine-tune the network in my task in which I have limited amount of data. I decided not to use other methods like Word2Vec or Fasttext because they both have a key limitation that will affect the performance of my model, that is, they don't take into account the context of the sentence in which the word appears. As a result the word "play" will have the same vector in these 2 sentences "I play football", "Seeing a live dramatic play", despite the fact that the meaning is different. BERT takes into account the context in order to compute the vector that represents the meaning of the word. 

BERT learns the representations by being pre-trained on two different tasks:

1. In a sentence with some words removed, BERT is trained to predict those missing words 

2. Given 2 sentences, BERT is trying to find if the the second sentence comes after the first sentence in a text, or they are completely unrelated

Given those pre-trained network we can fine-tune the final layers on our task

First of all we have to to prepare our dataset in a format that BERT expects.

In [23]:
def prepare_examples_bert(data, target=None):
    df = pd.DataFrame()

    df['sentence'] = data
    df['alpha'] = 'a'
    df['id'] = range(data.shape[0])

    if target is not None:
        df['target'] = target
        df = df[['id', 'target', 'alpha', 'sentence']]
    else:
        df = df[['id', 'sentence']]
    return df

In [28]:
# Encode labels between the values 0 to n_classes-1
label_encoder = LabelEncoder()

label_encoder.fit(y_train)

y_train = label_encoder.transform(y_train)
y_valid = label_encoder.transform(y_valid)
y_test = label_encoder.transform(y_test)

In [34]:
label_encoder_name_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
label_encoder_name_mapping

{'A': 0,
 'A-': 1,
 'B': 2,
 'B+': 3,
 'B-': 4,
 'C': 5,
 'C+': 6,
 'C-': 7,
 'D': 8,
 'D+': 9,
 'F': 10}

In [39]:
# create datasets BERT
train = prepare_examples_bert(X_train, y_train)
dev = prepare_examples_bert(X_valid, y_valid)
test = prepare_examples_bert(X_test, None)

In [40]:
train.head()

Unnamed: 0,id,target,alpha,sentence
37129,0,1,a,Exactly the kind of slam-bang adventure that's...
7438,1,2,a,"Needless to say, this freewheeling parade of b..."
37371,2,3,a,"Almodovar adopts a playfully wicked attitude, ..."
2454,3,3,a,I have nothing against the film for what it is...
37814,4,4,a,Both the film and the book retain the echoing ...


In [41]:
dev.head()

Unnamed: 0,id,target,alpha,sentence
10664,0,4,a,Any student of rock history would have to be c...
28514,1,4,a,Two thirds is so packed with charming characte...
6855,2,5,a,a testament to Roberts' early star power and l...
6993,3,1,a,"Though it runs slow at times, Down To The Bone..."
52866,4,1,a,Bridges is another example of Eastwood's remar...


In [42]:
test.head()

Unnamed: 0,id,sentence
9306,0,"Lungulov's touch is delicate, even piercingly ..."
23743,1,This is one of those movies that leaves you fe...
15850,2,...primarily resembles one of those late-night...
28826,3,It serves as a forceful reminder of how small ...
10746,4,Those inclined to scrutinize the logic of Barr...


Train and Validation set have the same format.
1. Column **id** is the number of the row
2. Column **target** is the actual label of the text (0 - 10)
3. Column  **alpha** it's always *a* because we don't have a second sentence to find if related
4. Column **sentence** is the text that we want to classify

The test set has the **id** (number of row) and the **sentence**

In [46]:
INPUT_DATA_DIR = 'bert_model/input_data'

# save the datasets
train.to_csv(os.path.join(INPUT_DATA_DIR, 'train.tsv'), sep='\t', index=False, header=False)
valid.to_csv(os.path.join(INPUT_DATA_DIR, 'dev.tsv'), sep='\t', index=False, header=False)
test.to_csv(os.path.join(INPUT_DATA_DIR, 'test.tsv'), sep='\t', index=False, header=True)
fpr, tpr, _ = roc_curve(y_test, y_predict_probabilities)
roc_auc = auc(fpr, tpr)
np.save(os.path.join(INPUT_DATA_DIR, 'test_labels.npy'), y_test)

For fine-tuning the authors of BERT propose the following settings:
1. Dropout = 0.1
2. Learning rate 5e-5 3e-5 2e-5
3. Number of epochs 3, 4
4. Optimizer Adam
5. Batch size 32

So we use the above settings to fine-tune our model

In [190]:
%%bash
export INPUT_DATA_DIR='bert_model/input_data'
export BERT_PRETRAINED_DIR='bert_model/bert_pretrained_models/uncased_L-12_H-768_A-12'
for learning_rate in 5e-5 3e-5 2e-5
do
    mkdir -p bert_model/out/lr_$learning_rate
    
    python bert_model/run_classifier.py --task_name=cola --do_train=true --do_eval=true --data_dir=bert_model/input_data --vocab_file=$BERT_PRETRAINED_DIR/vocab.txt --bert_config_file=$BERT_PRETRAINED_DIR/bert_config.json --init_checkpoint=$BERT_PRETRAINED_DIR/bert_model.ckpt --max_seq_length=64 --train_batch_size=32 --learning_rate=$learning_rate --num_train_epochs=3.0 --output_dir=bert_model/out/lr_$learning_rate
done


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

labels_ids:  Tensor("IteratorGetNext:3", shape=(?,), dtype=int32)
predictions:  Tensor("ArgMax:0", shape=(?,), dtype=int32)
is_real_example:  Tensor("Cast:0", shape=(?,), dtype=float32)

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

labels_ids:  Tensor("IteratorGetNext:3", shape=(?,), dtype=int32)
predictions:  Tensor("ArgMax:0", shape=(?,), dtype=int32)
is_real_example:  Tensor("Cast:0", shape=(?,), dtype=float32)

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow

INFO:tensorflow:Using config: {'_model_dir': 'bert_model/out/lr_5e-5', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f91956b4588>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=No

In [191]:
# get the latest checkpoint and predict the test set
OUT_BASE_FOLDER = 'bert_model/out'

folders = os.listdir(OUT_BASE_FOLDER)
for folder in folders:
    path = os.path.join(OUT_BASE_FOLDER, folder)
    files = os.listdir(path)
    ckpt_files = set([re.search('ckpt-\d+', file).group() for file in files if 'of' in file])
    best_model = sorted(ckpt_files, reverse=True)[0]
    best_model = [file for file in files if re.search('{}.*of'.format(best_model), file)][0]
    number = best_model.split('.')[1].split('-')[1]
    for file in files:
        if number in file:
            os.rename(os.path.join(path, file), os.path.join(path, file.replace('-'+number, '')))

In [192]:
%%bash
export BERT_PRETRAINED_DIR='bert_model/bert_pretrained_models/uncased_L-12_H-768_A-12'
export TRAINED_CLASSIFIER='model.ckptother'
for learning_rate in 5e-5 3e-5 2e-5
do
    python bert_model/run_classifier.py --task_name=cola --do_predict=true --data_dir=bert_model/input_data --vocab_file=$BERT_PRETRAINED_DIR/vocab.txt --bert_config_file=$BERT_PRETRAINED_DIR/bert_config.json --init_checkpoint=bert_model/out/lr_$learning_rate/$TRAINED_CLASSIFIER --max_seq_length=64 --output_dir=bert_model/out/lr_$learning_rate
done


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



INFO:tensorflow:Using config: {'_model_dir': 'bert_model/out/lr_5e-5', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f542fb33278>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=No

In [193]:
def compute_auc(y_test, y_score):
    lb = LabelBinarizer()
    lb.fit(test_target)
    y_test = lb.transform(test_target)
    return roc_auc_score(y_true=y_test, y_score=y_score)

In [194]:
# get results in validation and test set
def get_final_metrics():
    final_results = []
    learning_rates = ['2e-5', '3e-5', '5e-5']
    for lr in learning_rates:
        # get predictions
        valid_predictions = pd.read_csv('bert_model/out/lr_{}/eval_results.tsv'.format(lr), sep='\t', header=None)
        test_predictions = pd.read_csv('bert_model/out/lr_{}/test_results.tsv'.format(lr), sep='\t', header=None)
        
        # get actuals
        valid_target = pd.read_csv('bert_model/input_data/dev.tsv', sep='\t', header=None).iloc[:, 1]
        test_target = np.load('bert_model/input_data/test_labels.npy')
        
        acc_valid = accuracy_score(y_true=valid_target, y_pred=np.argmax(valid_predictions.values, axis=1))
        acc_test = accuracy_score(y_true=test_target, y_pred=np.argmax(test_predictions.values, axis=1))
        
        roc_valid = compute_auc(valid_target, valid_predictions)
        roc_test = compute_auc(test_target, test_predictions)
        
        final_results.append([lr, acc_valid, roc_valid, acc_test, roc_test])
        
    return pd.DataFrame(data=final_results, columns=['lr', 'acc_valid', 'roc_valid', 'acc_test', 'roc_test'])

In [195]:
get_final_metrics()

Unnamed: 0,lr,acc_valid,roc_valid,acc_test,roc_test
0,2e-05,0.352176,0.499405,0.358677,0.79059
1,3e-05,0.345958,0.496548,0.360373,0.789829
2,5e-05,0.351328,0.500858,0.35472,0.792081


So we will pick the model with the highest roc_auc in the valiodation set, that is the model that was trained with learning rate = 5e-5. So our model has an accuracy=35.4% on the test period and roc_auc=79.2%

In [204]:
# get prediction
test_predictions = pd.read_csv('bert_model/out/lr_5e-5/test_results.tsv', sep='\t', header=None).values
test_predictions = np.argmax(test_predictions, axis=1)
# get actuals
test_target = np.load('bert_model/input_data/test_labels.npy')

In [205]:
confusion_matrix(y_true=test_target, y_pred=test_predictions)

array([[ 99,  50,   1,  74,  18,   1,   3,   0,   0,   1,   0],
       [ 53,  43,   0, 156,  36,   6,   7,   0,   0,   0,   0],
       [ 13,  12,   0, 147,  90,   7,  16,   0,   0,   2,   0],
       [ 63,  78,   1, 450, 141,  16,  28,   0,   0,   7,   0],
       [  4,  10,   2, 199, 275,  43, 122,   0,   0,  14,   0],
       [  1,   2,   0,  27,  76, 160, 110,   0,   0,  53,   2],
       [  4,   5,   0,  36,  99, 107, 160,   0,   1,  28,   0],
       [  0,   2,   0,   7,  10,  30,  24,   0,   1,  25,   0],
       [  8,   2,   0,   5,   9,  17,   7,   0,   0,  16,   0],
       [  1,   1,   0,   8,   9,  68,  29,   0,   0,  65,   3],
       [  0,   0,   0,   2,   2,   6,   5,   0,   0,  14,   3]])

In [207]:
print(classification_report(y_true=test_target, y_pred=test_predictions))

              precision    recall  f1-score   support

           0       0.40      0.40      0.40       247
           1       0.21      0.14      0.17       301
           2       0.00      0.00      0.00       287
           3       0.41      0.57      0.47       784
           4       0.36      0.41      0.38       669
           5       0.35      0.37      0.36       431
           6       0.31      0.36      0.34       440
           7       0.00      0.00      0.00        99
           8       0.00      0.00      0.00        64
           9       0.29      0.35      0.32       184
          10       0.38      0.09      0.15        32

   micro avg       0.35      0.35      0.35      3538
   macro avg       0.25      0.25      0.24      3538
weighted avg       0.30      0.35      0.32      3538



In [208]:
label_encoder_name_mapping

{'A': 0,
 'A-': 1,
 'B': 2,
 'B+': 3,
 'B-': 4,
 'C': 5,
 'C+': 6,
 'C-': 7,
 'D': 8,
 'D+': 9,
 'F': 10}

From the confusion matrix it is clear that when the true value of the rating is B, then if the model misclassifies the rating, it is more likely to predict rating close to B like B+ or B-. So I will reduce the number of categories to 5 (A, B, C, D, F) and see how our model performs with less categories.

In [209]:
less_categories_mapping = {0:0, 1:0, 2:1, 3:1, 4:1, 5:2, 6:2, 7:2, 8:3, 9:3, 10:4}

In [214]:
test_predictions_less = [less_categories_mapping[i] for i in test_predictions]
test_target_less = [less_categories_mapping[i] for i in test_target]

In [219]:
# distribution of the target
pd.Series(test_target_less).value_counts() / pd.Series(test_target_less).shape[0] * 100

1    49.180328
2    27.416620
0    15.488977
3     7.009610
4     0.904466
dtype: float64

In [216]:
accuracy_score(y_true=test_target_less, y_pred=test_predictions_less)

0.6288863764838892

We can see an improvement on the accuracy when we have less categories

Obviously we cannot use this model on reviews from other sites and expect to have good results, because the model is fine-tuned on the reviews from rotten tomatoes. But we can use the same approach, that is fine-tuning the pretrained BERT model on the reviews of the new source.

I trained my models on my GPU NVIDIA GTX 1080Ti