# Movie Rationale

Paper: https://aclanthology.org/N07-1033/
Dataset: https://huggingface.co/datasets/movie_rationales

In [1]:
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
import os
from os import path

import sys
sys.path.append("./../src")

cache_path = path.join(os.getcwd(), '..', '.cache')
dataset_path = path.join(cache_path, 'dataset')
tmp_path = path.join('.cache', '2022-08-19')
os.makedirs(tmp_path,exist_ok=True)

In [11]:
from datasets import load_dataset
import pandas as pd

if path.exists('movie_rationales.parquet'):
    df_train = pd.read_parquet('movie_rationales.parquet')
else:
    dataset = load_dataset("movie_rationales", cache_dir=path.join(cache_path, 'dataset'))
    df_train = dataset['train'].to_pandas()
    df_train.to_parquet('movie_rationales.parquet')
    
display(df_train.describe())
display(df_train.head())

Unnamed: 0,label
count,1600.0
mean,0.5
std,0.500156
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


Unnamed: 0,review,label,evidences
0,"plot : two teen couples go to a church party ,...",0,"[mind - fuck movie, the sad part is, downshift..."
1,the happy bastard 's quick movie review damn\n...,0,"[it 's pretty much a sunken ship, sutherland i..."
2,it is movies like these that make a jaded movi...,0,[the characters and acting is nothing spectacu...
3,""" quest for camelot "" is warner bros . '\nfirs...",0,"[dead on arrival, the characters stink, subpar..."
4,synopsis : a mentally unstable man undergoing ...,0,"[it is highly derivative and somewhat boring, ..."


In [12]:
df_train['evidences']

0       [mind - fuck movie, the sad part is, downshift...
1       [it 's pretty much a sunken ship, sutherland i...
2       [the characters and acting is nothing spectacu...
3       [dead on arrival, the characters stink, subpar...
4       [it is highly derivative and somewhat boring, ...
                              ...                        
1595    [emerging as the definite face of independent ...
1596    [a visual and aural treat , backed up with a g...
1597    [this movie is so infectious . and funny ! and...
1598                   [powerfully done, an eye - opener]
1599    [a thoughtful human story, hands down , the mo...
Name: evidences, Length: 1600, dtype: object

In [13]:
review = df_train.review[0]
evidences = df_train.evidences[0]

In [15]:
review.find(evidences[0])

273

In [20]:
len(evidences[0])

17

In [21]:
review[273:273+17]

'mind - fuck movie'

In [22]:
evidences[0]

'mind - fuck movie'

In [23]:
import spacy
sm = spacy.load('en_core_web_sm')

In [26]:
tokens = list(sm(review))

In [29]:
for tk in tokens[:5]:
    print(tk.text, tk.idx)

plot 0
: 5
two 7
teen 11
couples 16


In [38]:
df_train = df_train[:5]

In [39]:
docs = list(sm.pipe(df_train['review']))

In [40]:
df_train['tokens'] = docs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['tokens'] = docs


In [48]:
for i in range(len(docs)):
    df_train.loc[i, 'tokens'] = [tk.text for tk in docs[i]]

ValueError: Must have equal len keys and value when setting with an iterable

In [133]:
df_train['tokens'] = df_train['review'].apply(lambda x: [tk.text for tk in sm(x)])
df_train['token_offset'] = df_train['review'].apply(lambda x: [tk.idx for tk in sm(x)])
df_train['rationale_offset'] = df_train.apply(lambda x: [(x.review.find(evidence), len(evidence)) for evidence in x['evidences']], axis=1)
df_train['rationale_offset'] = df_train.rationale_offset.apply(lambda x: [(r[0], r[0] + r[1]) for r in x])
df_train

Unnamed: 0,review,label,evidences,tokens,token_offet,rationale_offset,check_rational_offset,token_offset
0,"plot : two teen couples go to a church party ,...",0,"[mind - fuck movie, the sad part is, downshift...","[plot, :, two, teen, couples, go, to, a, churc...","[0, 5, 7, 11, 16, 24, 27, 30, 32, 39, 45, 47, ...","[(273, 290), (1749, 1764), (907, 945), (1453, ...","[mind - fuck movie, the sad part is, downshift...","[0, 5, 7, 11, 16, 24, 27, 30, 32, 39, 45, 47, ..."
1,the happy bastard 's quick movie review damn\n...,0,"[it 's pretty much a sunken ship, sutherland i...","[the, happy, bastard, 's, quick, movie, review...","[0, 4, 10, 18, 21, 27, 33, 40, 44, 45, 50, 54,...","[(1325, 1356), (1010, 1030), (449, 471), (879,...","[it 's pretty much a sunken ship, sutherland i...","[0, 4, 10, 18, 21, 27, 33, 40, 44, 45, 50, 54,..."
2,it is movies like these that make a jaded movi...,0,[the characters and acting is nothing spectacu...,"[it, is, movies, like, these, that, make, a, j...","[0, 3, 6, 13, 18, 24, 29, 34, 36, 42, 48, 55, ...","[(1135, 1183), (1270, 1290), (761, 786), (1201...",[the characters and acting is nothing spectacu...,"[0, 3, 6, 13, 18, 24, 29, 34, 36, 42, 48, 55, ..."
3,""" quest for camelot "" is warner bros . '\nfirs...",0,"[dead on arrival, the characters stink, subpar...","["", quest, for, camelot, "", is, warner, bros, ...","[0, 2, 8, 12, 20, 22, 25, 32, 37, 39, 40, 41, ...","[(546, 561), (1910, 1930), (1698, 1787), (2861...","[dead on arrival, the characters stink, subpar...","[0, 2, 8, 12, 20, 22, 25, 32, 37, 39, 40, 41, ..."
4,synopsis : a mentally unstable man undergoing ...,0,"[it is highly derivative and somewhat boring, ...","[synopsis, :, a, mentally, unstable, man, unde...","[0, 9, 11, 13, 22, 31, 35, 46, 60, 66, 68, 72,...","[(4105, 4148), (1131, 1161), (1017, 1120), (19...","[it is highly derivative and somewhat boring, ...","[0, 9, 11, 13, 22, 31, 35, 46, 60, 66, 68, 72,..."


In [134]:
def checking(x):
    rationale = list()
    for r in x['rationale_offset']:
        rationale.append()
    return rationale

df_train['check_rational_offset'] = df_train.apply(lambda x: [x.review[r[0]:r[1]] for r in x['rationale_offset']], axis=1)
df_train

Unnamed: 0,review,label,evidences,tokens,token_offet,rationale_offset,check_rational_offset,token_offset
0,"plot : two teen couples go to a church party ,...",0,"[mind - fuck movie, the sad part is, downshift...","[plot, :, two, teen, couples, go, to, a, churc...","[0, 5, 7, 11, 16, 24, 27, 30, 32, 39, 45, 47, ...","[(273, 290), (1749, 1764), (907, 945), (1453, ...","[mind - fuck movie, the sad part is, downshift...","[0, 5, 7, 11, 16, 24, 27, 30, 32, 39, 45, 47, ..."
1,the happy bastard 's quick movie review damn\n...,0,"[it 's pretty much a sunken ship, sutherland i...","[the, happy, bastard, 's, quick, movie, review...","[0, 4, 10, 18, 21, 27, 33, 40, 44, 45, 50, 54,...","[(1325, 1356), (1010, 1030), (449, 471), (879,...","[it 's pretty much a sunken ship, sutherland i...","[0, 4, 10, 18, 21, 27, 33, 40, 44, 45, 50, 54,..."
2,it is movies like these that make a jaded movi...,0,[the characters and acting is nothing spectacu...,"[it, is, movies, like, these, that, make, a, j...","[0, 3, 6, 13, 18, 24, 29, 34, 36, 42, 48, 55, ...","[(1135, 1183), (1270, 1290), (761, 786), (1201...",[the characters and acting is nothing spectacu...,"[0, 3, 6, 13, 18, 24, 29, 34, 36, 42, 48, 55, ..."
3,""" quest for camelot "" is warner bros . '\nfirs...",0,"[dead on arrival, the characters stink, subpar...","["", quest, for, camelot, "", is, warner, bros, ...","[0, 2, 8, 12, 20, 22, 25, 32, 37, 39, 40, 41, ...","[(546, 561), (1910, 1930), (1698, 1787), (2861...","[dead on arrival, the characters stink, subpar...","[0, 2, 8, 12, 20, 22, 25, 32, 37, 39, 40, 41, ..."
4,synopsis : a mentally unstable man undergoing ...,0,"[it is highly derivative and somewhat boring, ...","[synopsis, :, a, mentally, unstable, man, unde...","[0, 9, 11, 13, 22, 31, 35, 46, 60, 66, 68, 72,...","[(4105, 4148), (1131, 1161), (1017, 1120), (19...","[it is highly derivative and somewhat boring, ...","[0, 9, 11, 13, 22, 31, 35, 46, 60, 66, 68, 72,..."


In [136]:
(df_train['evidences'] == df_train['check_rational_offset']).all()

True

In [139]:
def binarize_rationale(row):
    rationales = row.rationale_offset.copy()
    r_start, r_end = rationales.pop(0)
    rationale_mask = list()
    for token_offset in row['token_offset']:
        rationale_mask.append(r_start <= token_offset <= r_end)
        if r_end <= token_offset and len(rationales) > 0:
            r_start, r_end = rationales.pop(0)
    return rationale_mask
    
df_train['rationale'] = df_train.apply(binarize_rationale, axis=1)

In [147]:
print('check the rationale map cohenrent with the tokens')
(df_train['rationale'].str.len() == df_train['tokens'].str.len()).all()

check the rationale map cohenrent with the tokens


True

In [143]:
sum(df_train['rationale'][0])

14