In [1]:
import pandas as pd

In [2]:
# load the IMDB reviews CSV and keep only the review text column
movie_reviews = pd.read_csv('IMDB Dataset.csv')[['review']]
reviews = movie_reviews['review'].tolist()

#### Function to remove duplicate texts

In [3]:
def remove_duplicates(list_):
    """Return the unique items of ``list_`` as a new list, in first-seen order.

    Parameters
    ----------
    list_ : iterable of hashable items
        The values to de-duplicate (strings or ``(text, label)`` tuples in
        this notebook). The input itself is not modified.

    Returns
    -------
    list
        One entry per distinct item, ordered by first occurrence.
    """
    # dict keys are unique and keep insertion order, so the result is
    # deterministic; going through a set would return the items in an
    # arbitrary order, which makes any later positional slice unrepeatable.
    return list(dict.fromkeys(list_))

In [4]:
# drop exact-duplicate review texts before any cleaning or truncation
reviews = remove_duplicates(reviews)

In [5]:
# load the movie metadata CSV and keep only the plot overview column
movie_synopses = pd.read_csv(r'movies_metadata.csv', low_memory=False)[['overview']]

# one entry per distinct overview text
synopses = remove_duplicates(movie_synopses['overview'].tolist())

#### Function to clean the reviews dataset

In [6]:
import re

def clean_reviews(text_list):
    """Return a cleaned copy of the review texts.

    First-person markers are removed because they would make the
    review class too obvious to the model, and character sequences that only
    occur in the reviews (HTML line-break residue, escaped apostrophes) are
    stripped.

    Parameters
    ----------
    text_list : list of str
        Raw review texts. The list is not modified.

    Returns
    -------
    list of str
        A new list with one cleaned string per input string, same order.
    """
    # Whole-word matches only, so words such as "It" or "army" stay intact.
    # Longer phrases are listed first so "I've" / "I have" are consumed as a
    # unit rather than being reduced to "'ve" / " have" by the bare "I".
    pronoun_pattern = re.compile(r"\b(?:I've|I have|I|my)\b")

    cleaned = []
    for text in text_list:
        # it will be too obvious when training the model if personal pronouns are included
        text = pronoun_pattern.sub('', text)

        # remove character sequences that only appear in reviews
        text = text.replace('<br', '').replace('/>', '')
        # Presumably the intent was to un-escape apostrophes stored as \' in the
        # CSV; the pattern "\'" is a plain apostrophe, so replacing it with an
        # apostrophe would change nothing.
        text = text.replace("\\'", "'")

        # strings are immutable: the cleaned value has to be collected,
        # rebinding the loop variable alone leaves the input list untouched
        cleaned.append(text.strip(' '))
    return cleaned

In [7]:
# drop missing (NaN) entries from both corpora
reviews = [text for text in reviews if not pd.isna(text)]
synopses = [text for text in synopses if not pd.isna(text)]

# run the review-specific clean-up
reviews = clean_reviews(reviews)

#### Apply labels & truncate each text to 240 characters (roughly tweet length; Twitter's limit is 280 characters. Shorter inputs also speed up training)

In [8]:
# pair each text (cut to its first 240 characters) with its class label
labeled_reviews = [(text[:240], 'Subjective') for text in reviews]
labeled_synopses = [(text[:240], 'Objective') for text in synopses]

In [9]:
# Texts that differed only after character 240 are identical once truncated,
# so de-duplicate the (text, label) tuples again.
labeled_reviews = remove_duplicates(labeled_reviews)
labeled_synopses = remove_duplicates(labeled_synopses)

In [10]:
# number of distinct labelled examples per class after truncation
len(labeled_reviews), len(labeled_synopses)

(49555, 44303)

### Reduce no. of texts to 5000 for each label

In [11]:
# Keep the first 5000 examples of each class. Which examples these are depends
# entirely on the order in which remove_duplicates returned the items.
labeled_reviews = labeled_reviews[:5000]
labeled_synopses = labeled_synopses[:5000]
# sanity check: both classes are now the same size
len(labeled_reviews), len(labeled_synopses)

(5000, 5000)

In [12]:
# merge to one list: all review tuples followed by all synopsis tuples
labeled_data = labeled_reviews + labeled_synopses
len(labeled_data)

10000

In [13]:
# Entries are (text, label) tuples, so this only removes exact repeats of the
# same text with the same label; a text present under both labels is kept twice.
labeled_data = remove_duplicates(labeled_data)
len(labeled_data)

10000

In [15]:
from random import Random, shuffle

# Shuffle in place with a dedicated, seeded generator so the permutation is
# repeatable for a given input order (seed matches the random_state used for
# the train/validation/test split) and the global random state is untouched.
Random(24).shuffle(labeled_data)

In [16]:
# each (text, label) tuple becomes one row; the columns are named 0 and 1 for now
df = pd.DataFrame(labeled_data)

In [17]:
# Reshuffle the rows with a fixed seed: an unseeded sample would hand the
# seeded train/validation/test split a different row order on every run.
df = df.sample(frac=1, random_state=24).reset_index(drop=True)
# give the positional columns descriptive names
df = df.rename(columns={0:'Text', 1:'labels'})

# Prepare Model and Split Data

#### Load fast tokenizer for distil roberta 

In [18]:
# load the fast tokenizer for the 'distilroberta-base' checkpoint
from transformers import RobertaTokenizerFast

# module-level name: IMDbObjSet.__init__ reads this `tokenizer` directly
tokenizer = RobertaTokenizerFast.from_pretrained('distilroberta-base')

In [19]:
import numpy as np
from sklearn.model_selection import train_test_split

In [20]:
# split data into train (80%), validation (10%), and test (10%) sets

# first carve off 80% for training; the remainder is held out
X_train, X_rem, y_train, y_rem = train_test_split(
    df['Text'], df['labels'], train_size=0.8, random_state=24
)

# then halve the held-out remainder into validation and test
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, random_state=24
)

In [21]:
# create torch dataset
import torch

class IMDbObjSet(torch.utils.data.Dataset):
    """Torch dataset of tokenized texts with integer class labels.

    All texts are tokenized once, at construction time, with the module-level
    ``tokenizer``. Labels are kept as given and converted to integers on access:
    0 for 'Objective', 1 for any other value.
    """

    def __init__(self, pd_series, labels):
        """Tokenize the texts in ``pd_series`` and store ``labels`` as a list."""
        texts = pd_series.tolist()
        self.encodings = tokenizer(texts, padding=True, truncation=True)
        self.labels = list(labels)

    def __len__(self):
        """Number of examples, i.e. the number of stored labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including 'labels'."""
        item = {}
        # one tensor per encoding field, taken from row `idx` of that field
        for key, feature in self.encodings.items():
            item[key] = torch.tensor(feature[idx])
        label_id = 0 if self.labels[idx] == 'Objective' else 1
        item['labels'] = torch.tensor(label_id)
        return item

In [22]:
# scratch check: how the integer label 0 is displayed once wrapped in a tensor
torch.tensor(0)

tensor(0)

In [23]:
# Wrap each split in a torch dataset. Validation must use the X_valid/y_valid
# half of the held-out data: X_rem/y_rem is the whole held-out 20% and would
# therefore contain every test row as well.
train_dataset = IMDbObjSet(X_train, y_train)
val_dataset = IMDbObjSet(X_valid, y_valid)
test_dataset = IMDbObjSet(X_test, y_test)

In [24]:
# second name for the same test dataset object (no copy is made)
test_data = test_dataset

In [25]:
# Verify the label encoding: the integer label of the first test example
# (0 if objective, 1 if subjective) should agree with the first y_test value.
test_data[0]['labels']

tensor(0)

#### So the text at test_data[0] should have the 'Objective' label (it is a synopsis). 
#### Verify below 

In [26]:
# show the raw text and string label of the first test example
print(f'TEXT: {X_test.tolist()[0]}')
print()
print(f'LABEL: {y_test.tolist()[0]}')

TEXT: The life and career of vaudevillian and silent screen horror star Lon Chaney, his contentious relationship with his neurotic wife, and his premature death.

LABEL: Objective


# Train the model

In [None]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments

# Training configuration; see the TrainingArguments documentation for the
# exact meaning of each option.
training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10  # the training-loss table below has one row per 10 steps
)

# Pretrained 'distilroberta-base' weights; the load message below reports the
# classifier weights as newly initialized.
model = RobertaForSequenceClassification.from_pretrained('distilroberta-base') 

# The Trainer receives the torch datasets built earlier in the notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 

Step,Training Loss
10,0.6948
20,0.691
30,0.6914
40,0.6815
