In [62]:
import pandas as pd

In [63]:
def remove_duplicates(list_):
    """Return the unique items of ``list_`` as a list, in first-seen order.

    Args:
        list_: Iterable of hashable items (e.g. strings or tuples).

    Returns:
        list: Each distinct item once, ordered by its first occurrence in
        ``list_``.

    Raises:
        TypeError: If an item is unhashable.
    """
    # dict keys are unique and keep insertion order, so the result is stable
    # across runs. A set's iteration order for strings can change between
    # interpreter sessions, which makes positional slices of the result
    # non-reproducible.
    return list(dict.fromkeys(list_))

In [64]:
# Load the IMDB reviews CSV, keeping only the 'review' column.
movie_reviews = pd.read_csv('IMDB Dataset.csv').filter(items=['review'])

# Pull the column out as a plain list and drop repeated review texts.
reviews = remove_duplicates(list(movie_reviews['review']))

In [65]:
# Number of unique reviews after de-duplication.
len(reviews)

49582

In [66]:
# Load the movie metadata CSV, keeping only the 'overview' column.
movie_synopses = pd.read_csv(r'movies_metadata.csv', low_memory=False).filter(items=['overview'])

# Pull the column out as a plain list and drop repeated overview texts.
synopses = remove_duplicates(list(movie_synopses['overview']))

In [67]:
# Keep only the entries that pandas does not consider missing.
reviews = [text for text in reviews if not pd.isna(text)]
synopses = [text for text in synopses if not pd.isna(text)]

In [68]:
# Sizes of both lists after dropping missing values.
print(len(reviews), len(synopses))

49582 44307


In [69]:
# Down-sample reviews: keep at most as many reviews as there are synopses.
reviews = reviews[:len(synopses)]

In [70]:
# Sanity check: show whether both lists have the same length, plus each length.
len(reviews) == len(synopses), len(reviews), len(synopses)

(True, 44307, 44307)

In [71]:
# Concatenate both lists: reviews first, synopses second.
data = reviews + synopses
# Show the combined size, half of it, and the size of each list.
len(data), len(data)/2, len(synopses), len(reviews)

(88614, 44307.0, 44307, 44307)

In [72]:
# reduce no. characters to 240 -- like Twitter, & speeds up training
# (slicing keeps the first 240 characters; shorter texts are left unchanged)
data = [text[:240] for text in data]
len(data)

88614

In [73]:
# Pair each text (cut to its first 240 characters) with its class label.
labeled_reviews = [(review[:240], 'Subjective') for review in reviews]
labeled_synopses = [(synopsis[:240], 'Objective') for synopsis in synopses]

In [74]:
# Texts that differ only after their first 240 characters become identical
# once truncated, so de-duplicate the (text, label) pairs again.
labeled_reviews = remove_duplicates(labeled_reviews)
labeled_synopses = remove_duplicates(labeled_synopses)

In [75]:
# Number of (text, label) pairs per class after the second de-duplication.
len(labeled_reviews), len(labeled_synopses)

(44285, 44303)

In [139]:
# Re-balance the classes: keep at most as many synopses as there are reviews.
labeled_synopses = labeled_synopses[:len(labeled_reviews)]
len(labeled_reviews), len(labeled_synopses)

(44285, 44285)

In [77]:
# Merge both classes into one list of (text, label) pairs.
labeled_data = labeled_reviews + labeled_synopses
len(labeled_data)

88570

In [164]:
# De-duplicate the merged list. Pairs are compared as whole (text, label)
# tuples, so the same text under two different labels is kept twice.
labeled_data = remove_duplicates(labeled_data)
len(labeled_data)

88570

In [165]:
from random import Random, shuffle

# Shuffle in place with a dedicated, seeded generator so the permutation is
# repeatable for a given input order (an unseeded shuffle differs on every
# run). The seed matches the random_state used for the train/test split.
# `shuffle` stays imported in case other cells rely on it.
Random(24).shuffle(labeled_data)

In [166]:
# Build a two-column frame from the (text, label) pairs; the columns get the
# default integer names 0 and 1.
df = pd.DataFrame(labeled_data)

In [167]:
# Shuffle all rows and renumber the index from 0. The fixed random_state makes
# the permutation repeatable for a given frame; it is the same value the
# train_test_split calls in this notebook use.
df = df.sample(frac=1, random_state=24).reset_index(drop=True)
# Give the two positional columns descriptive names.
df = df.rename(columns={0: 'Text', 1: 'Labels'})

In [168]:
# Preview the first rows of the shuffled, renamed frame.
df.head()

Unnamed: 0,Text,Labels
0,A woman vanishes. Her husband inquires into th...,Objective
1,Portrait of the popular Dutch singer André Hazes.,Objective
2,Prepare to meet your Messiah - they call him M...,Subjective
3,"After losing their son, grieving parents stumb...",Objective
4,Fictional account of French artist Henri de To...,Objective


In [169]:
# Shape of the 'Text' column, i.e. the number of rows.
df['Text'].shape

(88570,)

In [170]:
import numpy as np
from sklearn.model_selection import train_test_split

In [171]:
# Split the data into train, validation, and test sets.
# Step 1: 80% of the rows go to training; the remaining 20% is held out.
X_train, X_rem, y_train, y_rem = train_test_split(
    df['Text'],
    df['Labels'],
    train_size=0.8,
    random_state=24,
)

# Step 2: halve the held-out rows into a validation set and a test set.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem,
    y_rem,
    test_size=0.5,
    random_state=24,
)

In [123]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

In [173]:
# Use the Twitter RoBERTa base sentiment checkpoint.
rbs_model = 'cardiffnlp/twitter-roberta-base-sentiment'

# Load the tokenizer and the sequence-classification model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(rbs_model)
model = AutoModelForSequenceClassification.from_pretrained(rbs_model)

# Write both to a local directory whose path equals the checkpoint name.
# The tokenizer is saved last so that its returned file paths are the cell output.
model.save_pretrained(rbs_model)
tokenizer.save_pretrained(rbs_model)

('cardiffnlp/twitter-roberta-base-sentiment/tokenizer_config.json',
 'cardiffnlp/twitter-roberta-base-sentiment/special_tokens_map.json',
 'cardiffnlp/twitter-roberta-base-sentiment/vocab.json',
 'cardiffnlp/twitter-roberta-base-sentiment/merges.txt',
 'cardiffnlp/twitter-roberta-base-sentiment/added_tokens.json',
 'cardiffnlp/twitter-roberta-base-sentiment/tokenizer.json')

In [206]:
def predict_sentiment(text):
    """Classify the sentiment of a single text with the notebook-level model.

    Relies on the global ``tokenizer``, ``model`` and ``labels`` objects being
    defined before the function is called.

    Args:
        text: The string to classify.

    Returns:
        tuple: ``(sentiment, polarity_scores)`` where ``sentiment`` is the
        entry of ``labels`` with the highest score and ``polarity_scores``
        maps every label to its softmax probability.
    """
    encoded_text = tokenizer(text,
                             padding=True,
                             truncation=True,
                             max_length=512,
                             return_tensors='pt')

    # Inference only: skip building the autograd graph even when the caller
    # did not wrap the call in torch.no_grad().
    with torch.no_grad():
        output = model(**encoded_text)

    # Work on the first row of the batch so the label and the scores always
    # describe the same input; argmax over the whole logits tensor would
    # return an index into the flattened tensor instead.
    logits = output.logits[0]

    # .item() yields a plain Python int, so ``labels`` is indexed with an
    # integer rather than a tensor.
    sentiment = labels[int(logits.argmax(dim=-1).item())]

    # torch.softmax keeps the function independent of imports made in later
    # cells; .cpu() makes the conversion work for tensors on any device.
    polarities = torch.softmax(logits, dim=-1).detach().cpu().numpy()
    polarity_scores = dict(zip(labels, polarities))

    return sentiment, polarity_scores

In [207]:
def predict_multi_sentiments(text_list):
    """Run ``predict_sentiment`` on every text in ``text_list``.

    Args:
        text_list: Iterable of strings to classify.

    Returns:
        list: One ``predict_sentiment`` result per input text, in input order.
    """
    # Iterate over the argument itself; looping over the notebook-level
    # ``texts`` variable would silently ignore whatever the caller passed in.
    return [predict_sentiment(text) for text in text_list]

In [208]:
from scipy.special import softmax

# Class names, looked up by the index of the highest logit in predict_sentiment.
labels = ['negative', 'neutral', 'positive']

# Try the model on the first five texts of the data frame.
texts = df['Text'].head(5).tolist()

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    predictions = predict_multi_sentiments(texts)

    # Each prediction is a (sentiment, polarity_scores) pair; keep the sentiment.
    stmnt_preds = [sentiment for sentiment, _ in predictions]

    print(stmnt_preds)

['negative', 'neutral', 'negative', 'neutral', 'neutral']
