# Death of a Character Analysis

Here we give each word from the corpus a risk score based on whether a charater who utters the word survives the film. This is then used to derive cumulative risk scores for sentences.

In [1]:
import re

import pandas as pd
import numpy as np

In [2]:
data_path = 'data/{}'

deaths_data = data_path.format('death_list.txt')
corpus_data = data_path.format('movie_lines.txt')


In [3]:
death_list = pd.read_csv(deaths_data)
death_list

Unnamed: 0,film_id,dead_characters
0,m148,"MARGE,TINA,ROD,GLEN"
1,m507,"CASEY,MR. PRESCOTT,TATUM,KENNY,STU,BILLY"
2,m213,"GARRY,PALMER,DR. COPPER,NORRIS,CLARK,BENNINGS,..."
3,m87,"FRANK,PRUDHOE,1ST MAN,LARRY,JULIA"
4,m421,"JONES,WILLIE,COBB,DOC,DEJESUS,BOWMAN,SIXPACK,W..."
5,m158,"LOUIS,JUD,RACHEL,PASCOW"
6,m470,"DREW,GUS,MARJORIE,RENEE"
7,m379,"ANNIE,LYNDA,BOB,SISTER"
8,m514,"JACK,HALLORAN,GRADY"
9,m17,"DAVID,ALEX,JACK,LT. VILLIERS,BRINGSLY"


In [4]:
header_columns = ['lineID', 'characterID', 'movieID', 'character name', 'TextOfTheUtterance']

corpus = pd.read_csv(
    corpus_data, 
    engine='python', 
    sep=' \+{3}\$\+{3} ', 
    header=None,
    names=header_columns
)

In [5]:
corpus.head()

Unnamed: 0,lineID,characterID,movieID,character name,TextOfTheUtterance
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [6]:
def determine_death(row, list_dead):
    if row['character name'] in list_dead:
        return 1
    return 0


def dead_characters(film):
    """ Retrieves the list of characters that died in a film
    
    :param str film: The film ID.
    :return: List[str] of characters.
    """
    query = death_list['film_id'] == film
    
    return death_list.loc[query, 'dead_characters'].iloc[0].split(',')


def grab_results(corpus, film):
        film_corpus = corpus[corpus['movieID'] == film]
        
        list_dead = dead_characters(film)
        
        film_corpus['died'] = film_corpus.apply(determine_death, args=(list_dead,), axis=1)

        film_corpus['TextOfTheUtterance'] = film_corpus['TextOfTheUtterance'].str.findall(r'\w+')
        
        longframe = pd.DataFrame([
            (row.died, word.upper())
            for row in film_corpus.itertuples()
            for word in row.TextOfTheUtterance],
            columns=['score', 'word']
        )
        
        score = longframe.groupby('word')['score'].sum()
        count = longframe['word'].value_counts()
        
        result = pd.DataFrame(
            {
                'score': score, 
                'count': count, 
                'normalized_score': score/count
            }
        )
        
        result = result.reset_index().rename(columns={'index': 'word'})
          
        return result

In [7]:
dead_characters('m470')

['DREW', 'GUS', 'MARJORIE', 'RENEE']

In [8]:
films = [
    'm148',
    'm507',
    'm213',
    'm87',
    'm421',
    'm158',
    'm470',
    'm379',
    'm514',
    'm17'
]

In [9]:
scored_films_list = []
for film in films:
    scored_films_list.append(grab_results(corpus, film))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
scored_films = pd.concat(scored_films_list)

In [11]:
score = scored_films.groupby('word')['score'].sum()
count = scored_films.groupby('word')['count'].sum()

In [12]:
result = pd.DataFrame(
    {
        'score': score,
        'count': count,
        'normalized_score': score/count,

    }
)
result = result.reset_index().rename(columns={'index': 'word'})

In [13]:
result.tail()

Unnamed: 0,word,count,normalized_score,score
3552,ZENDA,1,1.0,1
3553,ZOMBIE,1,1.0,1
3554,ZOMBIES,1,1.0,1
3555,ZOO,7,0.857143,6
3556,ZOWIE,10,0.6,6


In [14]:
parse_sentence = lambda x: re.findall(r'\w+', x.upper())

In [15]:
def score_lookup(word):
    if word in list(result['word']):
        query = result['word'] == word
    
        score = result.loc[query, 'score'].iloc[0]
        count = result.loc[query, 'count'].iloc[0]
    
        return (score, count)
    return (0, 0)

In [16]:
def evaluate(sentence):
    evaluated = [
        sum(m) 
        for m in zip(
            *[score_lookup(n) 
              for n in sentence]
        )
    ]
    
    scored, counted = evaluated
    return {'score': scored, 'count': counted, 'normalized_score': scored/float(counted)}

In [17]:
examine_string = 'Have you seen my dog?'
evaluate(parse_sentence(examine_string))

{'count': 1935, 'normalized_score': 0.54677002583979328, 'score': 1058}

In [18]:
examine_string = "Don't go in there!"
evaluate(parse_sentence(examine_string))

{'count': 1466, 'normalized_score': 0.53751705320600274, 'score': 788}

In [19]:
examine_string = "I haven't seen him!"
evaluate(parse_sentence(examine_string))

{'count': 2182, 'normalized_score': 0.56141154903758017, 'score': 1225}

In [20]:
examine_string = "Help I am lost!"
evaluate(parse_sentence(examine_string))

{'count': 1488, 'normalized_score': 0.5786290322580645, 'score': 861}

In [21]:
examine_string = "We are safe"
evaluate(parse_sentence(examine_string))

{'count': 499, 'normalized_score': 0.52505010020040077, 'score': 262}

In [22]:
examine_string = "I think we are in trouble"
evaluate(parse_sentence(examine_string))

{'count': 2422, 'normalized_score': 0.56028075970272506, 'score': 1357}

In [23]:
examine_string = "I don't think he will catch us."
evaluate(parse_sentence(examine_string))

{'count': 2854, 'normalized_score': 0.56412053258584438, 'score': 1610}

In [24]:
examine_string = "See you later"
evaluate(parse_sentence(examine_string))

{'count': 1617, 'normalized_score': 0.54854669140383427, 'score': 887}