In [1]:
import spacy

In [2]:
import re

In [3]:
from collections import Counter

In [4]:
import pandas as pd

In [5]:
import glob

In [6]:
import os

In [7]:
import networkx as nx
from networkx.algorithms import bipartite

In [8]:
# Let pandas render up to 1000 rows before truncating a displayed frame.
pd.options.display.max_rows = 1000

In [9]:
# Let pandas render up to 1000 columns before truncating a displayed frame.
pd.options.display.max_columns = 1000

Eliminate blank lines

In [10]:
# Load the spaCy pipeline registered under the name 'en'. `nlp` is the
# pipeline count_characters_by_match uses to tokenize each vignette.
# NOTE(review): 'en' looks like a shortcut-link name rather than a full
# package name -- confirm it resolves on the spaCy version in use.
nlp = spacy.load('en')

In [11]:
# Load the pipeline registered under the name 'en-mango'.
# NOTE(review): no other cell visible in this notebook references
# `nlp_mango_model` -- confirm it is still needed.
nlp_mango_model = spacy.load('en-mango')

In [12]:
# Tab-separated character table; its 'person' column is the list of names
# that count_characters_by_match looks for in each vignette.
mango2 = pd.read_csv('Mango-Characters-2.csv', sep='\t')

In [13]:
#count how often each listed character name appears in one vignette
def count_characters_by_match(filepath):
    """Count occurrences of known character names in a single vignette file.

    The text is tokenized with the module-level spaCy pipeline ``nlp`` and
    every token whose text exactly (case-sensitively) equals an entry of
    ``mango2['person']`` is counted.

    Parameters
    ----------
    filepath : str
        Path to a ``.txt`` file whose base name starts with the story number
        followed by the hyphen-separated title.

    Returns
    -------
    pandas.DataFrame or None
        One row per matched name with columns ``vignette``, ``person``,
        ``weight`` (number of matching tokens) and ``story_number`` (the
        leading digits of the file name, as a string). ``None`` when no
        token matches any listed name.

    Raises
    ------
    ValueError
        If the file name does not start with a story number.
    """
    # Use a context manager so the file handle is closed deterministically,
    # and decode explicitly instead of relying on the platform default.
    with open(filepath, encoding='utf-8') as vignette_file:
        tokens = nlp(vignette_file.read())

    #turn the file name into a label: "<title> (<story number>)"
    filename = os.path.split(filepath)[-1].replace(".txt","")
    filename = filename.replace("-"," ")
    number_match = re.match('(^[0-9]+)', filename)
    if number_match is None:
        # Without this guard the failure is an opaque AttributeError on None.
        raise ValueError(
            f'Expected the file name to start with a story number: {filepath!r}'
        )
    story_number = number_match.group()
    # NOTE(review): only the digits are removed, so whatever separated the
    # number from the title (a space, after the hyphen replacement) stays at
    # the front of the label. Left as-is so existing node names do not change.
    filename = re.sub('(^[0-9]+)', '', filename)
    filename = f'{filename} ({story_number})'

    # A set gives constant-time membership tests for every token.
    character_names = set(mango2['person'])
    people = [token.text for token in tokens if token.text in character_names]
    if not people:
        return None

    people_counts = Counter(people)
    # make datalist for pandas dataframe: one (vignette, person, weight, story_number) row per name
    datalist = [
        (filename, person, count, story_number)
        for person, count in people_counts.items()
    ]
    return pd.DataFrame(
        datalist, columns=['vignette', 'person', 'weight', 'story_number']
    )

In [14]:
# NOTE(review): absolute, machine-specific path -- the notebook only runs
# where this directory exists; consider a configurable data directory.
filepaths = sorted(glob.glob('/Users/melaniewalsh/dissertation/draft/3.Cisneros/other-materials/digital/vignettes/*.txt'))

# Collect one frame per vignette and concatenate once. DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every call.
vignette_frames = []
for filepath in filepaths:
    vignette_frame = count_characters_by_match(filepath)
    # count_characters_by_match returns None when a vignette has no matches.
    if vignette_frame is not None:
        vignette_frames.append(vignette_frame)

# pd.concat raises on an empty list, so fall back to an empty frame. Each
# vignette keeps its own 0..n-1 row index, as it did with append.
characters_df_by_match = pd.concat(vignette_frames) if vignette_frames else pd.DataFrame()

In [15]:
# Map the first-person token 'I' to the narrator's name. DataFrame.replace
# swaps cells whose whole value equals 'I', in any column.
characters_df_by_match = characters_df_by_match.replace(to_replace='I', value='Esperanza')

In [16]:
# Map the first-person token 'me' to the narrator's name. DataFrame.replace
# swaps cells whose whole value equals 'me', in any column.
characters_df_by_match = characters_df_by_match.replace(to_replace='me', value='Esperanza')

In [15]:
# Display the combined per-vignette character counts. The frame is rendered
# in full; a preview such as .head() would keep the saved output small.
characters_df_by_match

Unnamed: 0,vignette,person,weight,story_number
0,The House on Mango Street (1),Mama,5,1
1,The House on Mango Street (1),Papa,6,1
2,The House on Mango Street (1),Carlos,2,1
3,The House on Mango Street (1),Kiki,2,1
4,The House on Mango Street (1),Nenny,2,1
0,"Louie, His Cousin & His Other Cousin (10)",Louie,15,10
1,"Louie, His Cousin & His Other Cousin (10)",Marin,3,10
2,"Louie, His Cousin & His Other Cousin (10)",Maris,1,10
0,Marin (11),Marin,11,11
1,Marin (11),Louie,1,11


In [16]:
# Build the graph from the edge list: one edge per row, linking the row's
# 'person' to its 'vignette', with the 'weight' column stored as an edge attribute.
G = nx.from_pandas_edgelist(characters_df_by_match, source='person', target='vignette', edge_attr='weight')

In [17]:
# Tag every node named in the 'person' column with bimodal='character';
# the projection cells read this attribute to split the two node sets.
G.add_nodes_from(characters_df_by_match['person'], bimodal='character')

In [18]:
# Tag every node named in the 'vignette' column with bimodal='vignette'.
# A name present in both columns would end up tagged 'vignette', because
# this call sets the attribute after the character tagging.
G.add_nodes_from(characters_df_by_match['vignette'], bimodal='vignette')

In [19]:
# Drop the 'People' node (presumably a generic match rather than a
# character -- confirm). remove_nodes_from silently ignores nodes that are
# not in the graph, so re-running this cell, or running it on data without
# a 'People' match, no longer raises NetworkXError.
G.remove_nodes_from(['People'])

In [52]:
# Write the bimodal (character-vignette) graph to a GEXF file in the
# current working directory.
nx.write_gexf(G, 'No-Esperanza-2020-by-match-mango-street-character-network.gexf')

# Unimodal

In [45]:
# Split the nodes into two sets using the 'bimodal' attribute: nodes tagged
# 'character' on one side, every other node of G on the other.
top_nodes = {
    name
    for name, attrs in G.nodes(data=True)
    if attrs['bimodal'] == 'character'
}
bottom_nodes = set(G).difference(top_nodes)

In [46]:
# Project the bipartite graph onto `bottom_nodes` (the nodes not tagged
# 'character'), producing a weighted graph among those nodes only.
U = bipartite.weighted_projected_graph(G, bottom_nodes)

In [47]:
# Inspect the set of nodes that are not tagged 'character'.
bottom_nodes

{' A Rice Sandwich (18)',
 ' A Smart Cookie (36)',
 ' Alicia & I Talking on Edna’s Steps (42)',
 ' Alicia Who Sees Mice (14)',
 ' And Some More (16)',
 ' Beautiful & Cruel (35)',
 ' Born Bad (23)',
 ' Boys & Girls (3)',
 ' Bums in the Attic (34)',
 ' Cathy Queen of Cats (5)',
 ' Chanclas (19)',
 ' Darius & the Clouds (15)',
 ' Edna’s Ruthie (26)',
 ' Elenita, Cards, Palm, Water (24)',
 ' Four Skinny Trees (29)',
 ' Geraldo No Last Name (25)',
 ' Gil’s Furniture Bought & Sold (8)',
 ' Hairs (2)',
 ' Hips (20)',
 ' Laughter (7)',
 ' Linoleum Roses (40)',
 ' Louie, His Cousin & His Other Cousin (10)',
 ' Mango Says Goodbye Sometimes (44)',
 ' Marin (11)',
 ' Meme Ortiz (9)',
 ' Minerva Writes Poems (33)',
 ' My Name (4)',
 ' No Speak English (30)',
 ' Our Good Day (6)',
 ' Papa Who Wakes Up Tired in the Dark (22)',
 ' Rafaela Who Drinks Coconut & Papaya Juice on Tuesdays (31)',
 ' Red Clowns (39)',
 ' Sally (32)',
 ' Sire (28)',
 ' The Earl of Tennessee (27)',
 ' The Family of Little Feet

In [48]:
# Write the current projection `U` to a GEXF file in the current working
# directory. `U` is reassigned in a later cell, so this output depends on
# the cells being run in order.
nx.write_gexf(U, 'no-Esperanza-2020-by-match-mango-street-character-network-unimodal-vignettes.gexf')

In [49]:
# Recompute the same character / non-character split of G's nodes before
# projecting onto the character side.
top_nodes = {
    name
    for name, attrs in G.nodes(data=True)
    if attrs['bimodal'] == 'character'
}
bottom_nodes = set(G).difference(top_nodes)

In [50]:
# Project the bipartite graph onto `top_nodes` (the nodes tagged
# 'character'). This overwrites the `U` assigned by the earlier projection
# onto `bottom_nodes`.
U = bipartite.weighted_projected_graph(G, top_nodes)

In [51]:
# Write the current projection `U` to a GEXF file in the current working
# directory.
nx.write_gexf(U, 'no-Esperanza-2020-by-match-mango-street-character-network-unimodal-characters.gexf')

In [32]:
# Build a graph from only those edges of U whose weight exceeds 2. Because
# it is built from the edge list alone, a node with no such edge is absent
# from SU.
SU = nx.Graph(
    [
        (source, target, attrs)
        for source, target, attrs in U.edges(data=True)
        if attrs['weight'] > 2
    ]
)

In [22]:
#nx.set_node_attributes(G, pd.Series(nodes.story_number, index=nodes.node).to_dict(), 'story_number')