In [1]:
import pandas as pd

In [2]:
def remove_duplicates(list_):
    collect = set()
    for item in list_:
        if item not in collect:
            collect.add(item)
    collect = list(collect)
    return collect

### Function to clean the reviews dataset

In [104]:
import re

def clean_reviews(text_list):
    for text in text_list:
        # it will be too obvious when training the model if personal pronouns are included
        text = re.sub('I', '', text)  
        text = re.sub('my', '', text)
        text = re.sub("I've", '', text)
        text = re.sub("I have", '', text)
        
        # remove character sequences that only appear in reviews 
        text = re.sub('<br', '', text)
        text = re.sub('/>', '', text)
        text = re.sub("\'", "'", text)
        text = text.strip(' ')
    return text_list

In [4]:
movie_reviews = pd.read_csv('IMDB Dataset.csv').filter(['review'])

reviews = list(movie_reviews['review'])
reviews = remove_duplicates(reviews)

In [5]:
movie_synopses = pd.read_csv(r'movies_metadata.csv', low_memory=False).filter(['overview'])

synopses = list(movie_synopses['overview'])
synopses = remove_duplicates(synopses)

In [6]:
# remove null text
reviews = list(filter(lambda x: not pd.isna(x), reviews))
synopses = list(filter(lambda x: not pd.isna(x), synopses))

# clean reviews
reviews = clean_reviews(reviews)

In [7]:
# down sample reviews to same length as synopses
reviews = reviews[:len(synopses)]

In [8]:
len(reviews) == len(synopses), len(reviews), len(synopses)

(True, 44307, 44307)

In [9]:
# apply labels & set max char. count to 240 -- like Twitter
labeled_reviews = [(reviews[i][:240], 'Subjective') for i in range(len(reviews))]
labeled_synopses = [(synopses[i][:240], 'Objective') for i in range(len(synopses))]

In [10]:
# some text might be the same for first 240 characters, so remove duplicates again
labeled_reviews = remove_duplicates(labeled_reviews)
labeled_synopses = remove_duplicates(labeled_synopses)

In [11]:
len(labeled_reviews), len(labeled_synopses)

(44287, 44303)

In [12]:
# down sample synopses to same length as reviews
labeled_synopses = labeled_synopses[:len(labeled_reviews)]
len(labeled_reviews), len(labeled_synopses)

(44287, 44287)

In [13]:
# merge to one list
labeled_data = labeled_reviews + labeled_synopses
len(labeled_data)

88574

In [14]:
labeled_data = remove_duplicates(labeled_data)
len(labeled_data)

88574

In [15]:
from random import shuffle
shuffle(labeled_data)

In [16]:
df = pd.DataFrame(labeled_data)

In [17]:
df = df.sample(frac=1).reset_index(drop=True)
df = df.rename(columns={0:'Text', 1:'Labels'})

# Prepare Model and Split Data

In [18]:
# load tokenizer for roberta base sentiment model
from transformers import AutoTokenizer

rbs_model = f'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(rbs_model)

In [19]:
import numpy as np
from sklearn.model_selection import train_test_split

In [76]:
# split data into train, validtion, and test sets

X_train, X_rem, y_train, y_rem = train_test_split(df['Text'], 
                                                  df['Labels'], 
                                                  train_size = 0.8, 
                                                  random_state = 24
                                                 )

X_valid, X_test, y_valid, y_test = train_test_split(X_rem, 
                                                    y_rem, 
                                                    test_size = 0.5, 
                                                    random_state = 24
                                                   )

In [97]:
# create torch dataset
import torch

class IMDbObj(torch.utils.data.Dataset):
    def __init__(self, pd_series, labels):
        pd_series = list(pd_series)
        labels = list(labels)
        self.encodings = tokenizer(pd_series, truncation=True, padding=True)
        self.labels = labels
        
    def __getitem__(self, index):
        return {self.encodings[index]: self.labels[index]}

    def __len__(self):
        return len(self.labels)

In [98]:
# train_dataset = IMDbObj(train_encodings, y_train)
# val_dataset = IMDbObj(val_encodings, y_rem)
test_dataset = IMDbObj(X_test, y_test)

In [99]:
test_data = test_dataset

In [101]:
first_data = test_data[0]

In [102]:
first_data

{Encoding(num_tokens=273, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]): 'Subjective'}

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

model = AutoModelForSequenceClassification.from_pretrained(rbs_model) 

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()