# Cluster Characters

In [None]:
#!pip install spacy
#!python -m spacy download en_core_web_sm

In [None]:
!pip install fuzzywuzzy

In [None]:
!pip install python-Levenshtein

In [16]:
import spacy
from spacy import displacy
import en_core_web_sm

In [3]:
from collections import Counter

In [4]:
import pandas as pd
pd.set_option("max_rows", 400)

In [6]:
import networkx 
from networkx.algorithms.components.connected import connected_components, node_connected_component

In [7]:
import itertools
from fuzzywuzzy import fuzz

In [10]:
#nlp = spacy.load('en_core_web_sm')
nlp = en_core_web_sm.load()

In [12]:
filepath = "../texts/literature/Jones-Lost-in-The-City.txt"

In [14]:
text = open(filepath).read()
document = nlp(text)

# Get People (More Accurately)

Here's a common spaCy NER scenario:

You process your text with spaCy, and you find that the model has correctly tagged a person as a "PERSON." Nice! 🎉

But then, paragraphs later, you notice that spaCy has tagged the exact same person as a different entity — perhaps an organization ("ORG") or a place ("GPE"). Ugh 😫 

To get a more accurate character/people count, we're going to extract all the named entities that spaCy identified as a "PERSON" and then count *any* instance of that entitiy, regardless of its NER label.

Additionally, we're going to output this character list to a CSV file, so we can clean and edit the list by hand (if we wish).

Extract list of all named entities labeled "PERSON":

In [20]:
spacy_identified_people = []

for named_entity in document.ents:
    if named_entity.label_ == "PERSON":
        
        spacy_identified_people.append(named_entity.text)

Output list of identified people to a CSV file for manual cleaning and editing:

In [21]:
pd.DataFrame(Counter(spacy_identified_people).most_common(), columns=['character', 'count']).to_csv('spacy-identified-people.csv', index=False)

After manual editing, re-upload CSV file for accurate list of people:

In [22]:
spacy_identified_people = pd.read_csv('spacy-identified-people.csv')['character'].tolist()

Count any entity that matches a person in our cleaned list of people. Also extract the [index number](https://spacy.io/usage/linguistic-features#named-entities-101) where the person appears in the document.

In [23]:
all_people_matches = []
all_people_matches_plus_ids = []

#Get all entity matches for a previously identified person
for named_entity in document.ents:
    if named_entity.text in spacy_identified_people:
        person = named_entity.text
        
        #Remove apostrophe 's from character name
        person = person.replace("’s", "").strip()
        #Get the character index number from the text
        person_index = named_entity.start_char
        
        all_people_matches.append(person)
        all_people_matches_plus_ids.append([person, person_index])

In [110]:
people_tally = Counter(all_people_matches)
character_df = pd.DataFrame(people_tally.most_common())
character_df.columns = ['character', 'count']

character_df

Unnamed: 0,character,count
0,Cassandra,113
1,Anita,90
2,Joyce,90
3,Caesar,74
4,Madeleine,69
5,Vivian,67
6,Melanie,65
7,Rickey,62
8,Sherman,60
9,Maddie,58


# Cluster Characters By Name Similarity and Distance


spaCy doesn't know that "Betsy Ann Morgan" and "Betsy Ann" should be the same person. So we're also going to pair two character names if they're an extremely close match and they occur within 750 characters of one another.

In [111]:
aggregated_people = []

threshold_distance = 750

#Get all entity matches for a previously identified person
for person, another_person in itertools.combinations(all_people_matches_plus_ids, 2):
        distance = abs(person[1] - another_person[1])
        if distance < threshold_distance:
            
            if person[0] != another_person[0]:
                
                if fuzz.partial_ratio(person[0], another_person[0]) == 100:
                    aggregated_people.append((person[0], another_person[0]))

0:00:01.053097


In [112]:
G=networkx.Graph()
G.add_edges_from(aggregated_people)
people_clusters  = list(connected_components(G))
people_clusters = [sorted(cluster, key=len, reverse=True) for cluster in people_clusters]
people_clusters

[['Betsy Ann Morgan', 'Betsy Ann'],
 ['Miles Patterson', 'Miles'],
 ['Walter Creed', 'Creed'],
 ['Jenny Creed', 'Jenny'],
 ['Robert Morgan', 'Robert'],
 ['Janet Gordon', 'Janet'],
 ['Ralph Holley', 'Ralph'],
 ['LaDeidre Gordon', 'LaDeidre'],
 ['Thelma Holley', 'Thelma'],
 ['Darlene Greenley', 'Darlene'],
 ['Etta O’Connell', 'Etta'],
 ['Mary Keith', 'Mary'],
 ['Blondelle Harris', 'Blondelle'],
 ['Cassandra G. Lewis', 'Cassandra'],
 ['Rhonda Ferguson', 'Rhonda'],
 ['Melanie Cartwright', 'Melanie'],
 ['Anita Hughes', 'Anita'],
 ['Gladys Harper', 'Gladys', 'Glady'],
 ['Joyce Moses', 'Mama Joyce', 'Joyce'],
 ['Pearl Guthrie', 'Pearl Malone', 'Pearl'],
 ['Manny Soto', 'Manny'],
 ['Sherman Wheeler', 'Sherman'],
 ['Sandra Wallington', 'Sandra'],
 ['Angelo Billings', 'Angelo'],
 ['Burberry Carol', 'Carol'],
 ['Lonney McCrae', 'Lonney'],
 ['Brenda Roper', 'Brenda'],
 ['Mabel Smith', 'Mabel'],
 ['Penny Jenkins', 'Jenkins'],
 ['Al Jenkins', 'Al'],
 ['Joy Lambert', 'Joy'],
 ['handcuff Penny', 'Penn

In [113]:
def add_clustered_characters(row):
    character = row
    if any(character in cluster for cluster in people_clusters):
        for cluster in people_clusters:
            if character in cluster:
                return " // ".join(cluster)
    else:
        return character

In [115]:
character_df['clustered_characters'] = character_df['character'].apply(add_clustered_characters)

In [116]:
character_df

Unnamed: 0,character,count,clustered_characters
0,Cassandra,113,Cassandra G. Lewis // Cassandra
1,Anita,90,Anita Hughes // Anita
2,Joyce,90,Joyce Moses // Mama Joyce // Joyce
3,Caesar,74,Caesar
4,Madeleine,69,Madeleine
5,Vivian,67,Vivian // Vi
6,Melanie,65,Melanie Cartwright // Melanie
7,Rickey,62,Rickey Madison // Rickey
8,Sherman,60,Sherman Wheeler // Sherman
9,Maddie,58,Maddie Williams // Aunt Maddie // Maddie


Manually edit

In [38]:
#character_df.to_csv('clustered_characters_draft_Lost.csv')

In [39]:
#character_df = pd.read_csv('clustered_characters_edited.csv')

In [117]:
character_df.groupby('clustered_characters')[['count']].sum().sort_values(by='count', ascending=False).reset_index()

Unnamed: 0,clustered_characters,count
0,Cassandra G. Lewis // Cassandra,114
1,Joyce Moses // Mama Joyce // Joyce,96
2,Anita Hughes // Anita,92
3,Marie Delaveaux Wilson // Marie heard Vernelle...,77
4,Vivian // Vi,75
5,Samuel Lamont Williams // Samuel Williams // S...,75
6,Caesar,74
7,Madeleine,69
8,Melanie Cartwright // Melanie,66
9,Rickey Madison // Rickey,64


# Make a Network of Characters

In [118]:
edge_list = []

threshold_distance = 50

#Get all entity matches for a previously identified person
for person, another_person in itertools.combinations(all_people_matches_plus_ids, 2):
        distance = abs(person[1] - another_person[1])
        if distance < threshold_distance:
            
            if person[0] != another_person[0]:
                
                edge_list.append((person[0], another_person[0]))

In [119]:
character_df = pd.DataFrame(Counter(edge_list).most_common(), columns=['character_pair', 'edge_weight'])
character_df['character1']=character_df['character_pair'].str[0]
character_df['character2']=character_df['character_pair'].str[1]

In [120]:
character_network = character_df[['character1', 'character2', 'edge_weight']]
character_network[:100]

Unnamed: 0,character1,character2,edge_weight
0,Anita,Melanie,12
1,Cassandra,Anita,11
2,Cassandra,Melanie,10
3,Melanie,Cassandra,10
4,Anita,Cassandra,9
5,Marcus,Avis,9
6,Caesar,Sherman,8
7,Sam,Madeleine,8
8,Santiago,Rickey,8
9,Jenny,Robert,7


In [122]:
def add_clustered_characters(row):
    character = row
    if any(character in cluster for cluster in people_clusters):
        for cluster in people_clusters:
            if character in cluster:
                return " // ".join(cluster)
    else:
        return character

In [123]:
character_network['character1'] = character_network['character1'].apply(add_clustered_characters)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [124]:
character_network['character2'] = character_network['character2'].apply(add_clustered_characters)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [130]:
character_network.to_csv('lost-in-the-city-network.csv', index=False, encoding='utf-8')

In [128]:
character_network.groupby(['character1', 'character2'])[['edge_weight']].sum().sort_values(by='edge_weight', ascending=False).reset_index()

Unnamed: 0,character1,character2,edge_weight
0,Samuel Lamont Williams // Samuel Williams // S...,Madeleine,13
1,Anita Hughes // Anita,Melanie Cartwright // Melanie,12
2,Cassandra G. Lewis // Cassandra,Anita Hughes // Anita,11
3,Cassandra G. Lewis // Cassandra,Melanie Cartwright // Melanie,10
4,Melanie Cartwright // Melanie,Cassandra G. Lewis // Cassandra,10
5,Anita Hughes // Anita,Cassandra G. Lewis // Cassandra,10
6,Madeleine,Samuel Lamont Williams // Samuel Williams // S...,9
7,Marcus,Avis,9
8,Arnisa Isaacs // Arnisa,Samuel Lamont Williams // Samuel Williams // S...,9
9,Beatrice Atwell // Beatrice // Bea,Ida Garrett // Garrett,8


In [21]:
filepath = "../texts/literature/Little-Women.txt"

In [22]:
text = open(filepath).read()

In [4]:
chunked_text= text.split('\n')

In [7]:
chunked_text[2]

'                           PLAYING PILGRIMS.'

In [40]:
number_of_chunks = 5000
chunked_text = [text[i:i+number_of_chunks] for i in range(0, len(text), number_of_chunks)]

In [41]:
len(chunked_text)

216

In [140]:
chunked_documents = list(nlp.pipe(chunked_text))

# Get People (More Accurately)

Extract list of all named entities labeled "PERSON":

In [142]:
spacy_identified_people = []

for document in chunked_documents:
    for named_entity in document.ents:
        if named_entity.label_ == "PERSON":
            spacy_identified_people.append(named_entity.text)

Output list of identified people to a CSV file for manual cleaning and editing:

In [21]:
#pd.DataFrame(Counter(spacy_identified_people).most_common(), columns=['character', 'count']).to_csv('spacy-identified-people.csv', index=False)

After manual editing, re-upload CSV file for accurate list of people:

In [22]:
#spacy_identified_people = pd.read_csv('spacy-identified-people.csv')['character'].tolist()

In [205]:
all_people_matches = []
all_people_matches_plus_ids = []
document_length = 0

#Get all entity matches for a previously identified person
for document in chunked_documents:
    document_length += len(document.text)
    for named_entity in document.ents:
        if named_entity.text in spacy_identified_people:
            person = named_entity.text

            #Remove apostrophe 's from character name
            person = person.replace("’s", "").strip()
            #Get the character index number from the text
            
            person_index =  (document_length - named_entity.start_char)

            all_people_matches.append(person)
            all_people_matches_plus_ids.append([person, person_index])

In [207]:
people_tally = Counter(all_people_matches)
character_df = pd.DataFrame(people_tally.most_common())
character_df.columns = ['character', 'count']

character_df

Unnamed: 0,character,count
0,Jo,1296
1,Amy,631
2,Laurie,582
3,Meg,523
4,Beth,463
...,...,...
541,Michael,1
542,Gutenberg-tm,1
543,Project Gutenberg-tm's,1
544,Gregory B. Newby,1


# Make a network!

In [219]:
edge_list = []

threshold_distance = 100

#Get all entity matches for a previously identified person
for person, another_person in itertools.combinations(all_people_matches_plus_ids, 2):
        distance = abs(person[1] - another_person[1])
        if distance < threshold_distance:
            
            if person[0] != another_person[0]:
                
                edge_list.append((person[0], another_person[0]))

In [220]:
character_df = pd.DataFrame(Counter(edge_list).most_common(), columns=['character_pair', 'edge_weight'])
character_df['character1']=character_df['character_pair'].str[0]
character_df['character2']=character_df['character_pair'].str[1]

In [222]:
character_network = character_df[['character1', 'character2', 'edge_weight']]
character_network[character_network['edge_weight'] > 2]

Unnamed: 0,character1,character2,edge_weight
0,Meg,Jo,98
1,Laurie,Jo,97
2,Jo,Laurie,94
3,Jo,Amy,92
4,Jo,Beth,89
5,Amy,Jo,85
6,Beth,Jo,83
7,Jo,Meg,79
8,Beth,Amy,60
9,Amy,Laurie,52


In [223]:
character_network[character_network['edge_weight'] > 2].to_csv('Little-Women-character-network.csv')