In [62]:
import pandas as pd

In [63]:
def remove_duplicates(list_):
    """Return the distinct items of ``list_`` in first-seen order.

    Args:
        list_: Iterable of hashable items (e.g. strings or tuples).

    Returns:
        A new list holding each distinct item exactly once, ordered by the
        position of its first occurrence in ``list_``.

    Raises:
        TypeError: If an item is not hashable.
    """
    # A dict keeps insertion order, whereas a set does not: going through a
    # set made the output order arbitrary, so any later slice of the result
    # selected an arbitrary subset.
    return list(dict.fromkeys(list_))

In [64]:
# Read the IMDB dataset and keep only the 'review' column.
movie_reviews = pd.read_csv('IMDB Dataset.csv').filter(items=['review'])

# Plain Python list of review texts with exact duplicates removed.
reviews = remove_duplicates(list(movie_reviews['review']))

In [65]:
# Number of reviews left after removing exact duplicates.
len(reviews)

49582

In [66]:
# Read the movie metadata and keep only the 'overview' (synopsis) column.
movie_synopses = pd.read_csv(r'movies_metadata.csv', low_memory=False).filter(items=['overview'])

# Plain Python list of synopsis texts with exact duplicates removed.
synopses = remove_duplicates(list(movie_synopses['overview']))

In [67]:
# Drop missing values (NaN) from both text lists.
reviews = [text for text in reviews if not pd.isna(text)]
synopses = [text for text in synopses if not pd.isna(text)]

In [68]:
# Sizes of the two text lists after missing values were dropped.
print(len(reviews), len(synopses))

49582 44307


In [69]:
# Down-sample the reviews: keep only the first len(synopses) of them so the
# two classes end up the same size.
# NOTE(review): which reviews are kept depends on the order returned by
# remove_duplicates.
reviews = reviews[:len(synopses)]

In [70]:
# Sanity check: the two lists should now be the same length.
len(reviews) == len(synopses), len(reviews), len(synopses)

(True, 44307, 44307)

In [71]:
# Concatenate both classes: reviews occupy the first half, synopses the second.
data = reviews + synopses
len(data), len(data)/2, len(synopses), len(reviews)

(88614, 44307.0, 44307, 44307)

In [72]:
# Truncate every text to its first 240 characters -- roughly tweet length,
# and shorter inputs speed up training.
data = [text[:240] for text in data]
len(data)

88614

In [73]:
# Pair each text (cut to its first 240 characters) with its class label:
# reviews are 'Subjective', synopses are 'Objective'.
labeled_reviews = [(review[:240], 'Subjective') for review in reviews]
labeled_synopses = [(synopsis[:240], 'Objective') for synopsis in synopses]

In [74]:
# Truncation can make previously distinct texts identical, so de-duplicate
# each labelled list again.
labeled_reviews = remove_duplicates(labeled_reviews)
labeled_synopses = remove_duplicates(labeled_synopses)

In [75]:
# Class sizes after the second de-duplication pass.
len(labeled_reviews), len(labeled_synopses)

(44285, 44303)

In [76]:
# Trim the synopses to the number of reviews so both classes match in size.
# NOTE(review): this only balances the classes when there are at least as
# many synopses as reviews; otherwise the slice is a no-op.
labeled_synopses = labeled_synopses[:len(labeled_reviews)]

In [77]:
# Combine both classes into one list of (text, label) tuples.
labeled_data = labeled_reviews + labeled_synopses
len(labeled_data)

88570

In [18]:
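# NOTE: earlier labelling approach, left commented out; the per-class list
# comprehensions (labeled_reviews / labeled_synopses) are used instead.
# size = len(data)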
# labeled_data = []

# for i in range(size):
#     if i < size/2:
#         labeled_data.append((data[i], 'Subjective'))
#     else:
#         labeled_data.append((data[i], 'Objective'))

In [78]:
# Drop any (text, label) pair that occurs more than once in the combined list.
# Each class list was already de-duplicated and the two classes carry
# different labels, so this is not expected to remove anything.
labeled_data = remove_duplicates(labeled_data)
len(labeled_data)

88570

In [79]:
from random import seed, shuffle

# Seed the global RNG so the shuffled order is reproducible for a given input
# order; 24 matches the random_state used for the train/validation/test split.
seed(24)

# Shuffle in place so the two classes are interleaved.
shuffle(labeled_data)

In [80]:
# Build a two-column frame from the (text, label) tuples; the columns are
# auto-named 0 (text) and 1 (label).
df = pd.DataFrame(labeled_data)

In [83]:
# Preview the first rows of the shuffled data.
df.head()

Unnamed: 0,0,1
0,An investigative expose of the inner workings ...,Objective
1,Impoverished Bill befriends starving Trina and...,Objective
2,Thirteen-year-old Kerry is repeatedly sexually...,Objective
3,This is another film where the cinematography ...,Subjective
4,"MR. BONES 2: BACK FROM THE PAST, is the story ...",Objective


In [84]:
# Preview the last rows of the shuffled data.
df.tail()

Unnamed: 0,0,1
88565,"Kim and Ron start out a new school year, only ...",Objective
88566,Four old school friends reunite to attempt the...,Objective
88567,Daisy and Viola are siamese twin sisters on th...,Objective
88568,Yogi Bear and his pal Boo Boo are shipped off ...,Objective
88569,This film is what most of the industry has for...,Subjective


In [85]:
# Replace the auto-generated column names 0 and 1 with descriptive ones.
df = df.rename(columns={0:'Text', 1:'Labels'})

In [86]:
# Confirm the new column names.
df.head()

Unnamed: 0,Text,Labels
0,An investigative expose of the inner workings ...,Objective
1,Impoverished Bill befriends starving Trina and...,Objective
2,Thirteen-year-old Kerry is repeatedly sexually...,Objective
3,This is another film where the cinematography ...,Subjective
4,"MR. BONES 2: BACK FROM THE PAST, is the story ...",Objective


In [87]:
# Number of rows in the 'Text' column.
df['Text'].shape

(88570,)

In [29]:
import numpy as np
from sklearn.model_selection import train_test_split

In [30]:
# Split the data into train (80%), validation (10%) and test (10%) sets.

# First carve off 80% of the rows for training; the remaining 20% is held out.
X_train, X_rem, y_train, y_rem = train_test_split(
    df['Text'],
    df['Labels'],
    train_size=0.8,
    random_state=24,
)

# Then halve the held-out rows into the validation and test sets.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem,
    y_rem,
    test_size=0.5,
    random_state=24,
)

In [123]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

In [124]:
# Model identifier of the Twitter RoBERTa-base sentiment classifier.
rbs_model = 'cardiffnlp/twitter-roberta-base-sentiment'

# Load the tokenizer and the sequence-classification model for that identifier.
tokenizer = AutoTokenizer.from_pretrained(rbs_model)
model = AutoModelForSequenceClassification.from_pretrained(rbs_model)

# Bundle model and tokenizer into a sentiment-analysis pipeline.
Classifier = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)

In [93]:
# Reload the pretrained model, then write the model and tokenizer files to disk.
# NOTE(review): `rbs_model` and `tokenizer` are not defined in this cell; it
# relies on the model-loading cell having been run first.
# NOTE(review): the save directory is the same string as the model identifier,
# so later from_pretrained(rbs_model) calls made from this working directory
# will presumably resolve to this local copy -- confirm that is intended.
model = AutoModelForSequenceClassification.from_pretrained(rbs_model)
model.save_pretrained(rbs_model)
tokenizer.save_pretrained(rbs_model)

('cardiffnlp/twitter-roberta-base-sentiment/tokenizer_config.json',
 'cardiffnlp/twitter-roberta-base-sentiment/special_tokens_map.json',
 'cardiffnlp/twitter-roberta-base-sentiment/vocab.json',
 'cardiffnlp/twitter-roberta-base-sentiment/merges.txt',
 'cardiffnlp/twitter-roberta-base-sentiment/added_tokens.json',
 'cardiffnlp/twitter-roberta-base-sentiment/tokenizer.json')

In [138]:
from torch.nn.functional import softmax

# Class names indexed by logit position.
# NOTE(review): assumes the model emits logits in the order
# negative / neutral / positive -- confirm against the model card.
labels = ['negative', 'neutral', 'positive']

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # Tokenise the first five texts as one padded batch of PyTorch tensors.
    encoded_text_batch = tokenizer(list(df['Text'][:5]),
                                   padding=True,
                                   truncation=True,
                                   max_length=512,
                                   return_tensors='pt')

    outputs = model(**encoded_text_batch)

    # Turn the logits into per-class probabilities (each row sums to 1).
    prediction_tensors = softmax(outputs.logits, dim=1)
    print(prediction_tensors)

    # Pick the most probable class per row. This previously read an undefined
    # name `predictions` and only ran because of leftover kernel state.
    sentiments = [labels[class_idx]
                  for class_idx in prediction_tensors.argmax(dim=1).tolist()]
    print(sentiments)

tensor([[0.0818, 0.8292, 0.0890],
        [0.0436, 0.6712, 0.2852],
        [0.8320, 0.1627, 0.0053],
        [0.4055, 0.4073, 0.1873],
        [0.5407, 0.4361, 0.0232]])
['neutral', 'neutral', 'negative', 'neutral', 'negative']


In [105]:
# Show the five texts that were scored above.
list(df['Text'][:5])

['An investigative expose of the inner workings inside the commercial pet food industry, which has went largely unchallenged until now.',
 'Impoverished Bill befriends starving Trina and takes her into his shantytown cabin and looks after her, but making clear there are no strings attached and that he remains entirely free.',
 'Thirteen-year-old Kerry is repeatedly sexually abused by several adults, including at one point her mother. Her father sets her up as a prostitute. Kerry finally calls Childline and is put in a safe house, where she tries to come to terms w',
 'This is another film where the cinematography is the best thing to recommend it. That would be fine if the film were a travelogue, but as a dramatic exercise in cinematic artistry, that is not good enough. The theme of inter-species respect',
 'MR. BONES 2: BACK FROM THE PAST, is the story of Hekule, the King of Kuvukiland who is given a gemstone by the dying Kunji Balanadin. The stone is cursed and causes Hekule to becom