In [1]:
#pip install en_core_web_sm

In [None]:
#pip install openpyxl

In [None]:
#!python -m spacy download en_core_web_sm

In [1]:
import spacy
import os
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import community as community_louvain
from spacy.matcher import Matcher

In [2]:
ELM = spacy.load('en_core_web_sm')

In [3]:
print(ELM)

<spacy.lang.en.English object at 0x000002148E6320E0>


In [4]:
texto = open(file= 'data/jane-austen-pride-prejudice.txt').read()
doc_texto = ELM(texto)

In [12]:
data_characters = pd.read_excel('data/characters.xlsx')

In [13]:
data_characters.head()

Unnamed: 0,Characters,Firstname,Nickname,Nickname2,real_name
0,Mr. Bennet,Mr. Bennet,Bennet,,Mr. Bennet/Mrs.Bennet
1,Mrs. Bennet,Mrs. Bennet,Bennet,,Mr. Bennet/Mrs.Bennet
2,Jane Bennet,Jane,,,Jane Bennet
3,Miss Elizabeth Bennet,Elizabeth,Eliza,Lizzy,Elizabeth
4,Mary Bennet,Mary,,,Mary Bennet


In [14]:
spacy.displacy.render(doc_texto[0:150],style = "ent",jupyter = True)

## configura a busca de entidades

In [15]:
matcher = Matcher(ELM.vocab)
pattern = [{'POS':'PROPN'},{'POS':'PROPN'}]
matcher.add('regra a',[pattern])
matches = matcher(doc_texto)

Percorre as entidades de cria uma lista com as entidades localizadas + a frase em que foi localizada a entidade

In [18]:
sent_entity_df = []
for i in doc_texto.sents:
    entiti_list = [i.text for i in i.ents if i.label_ == "PERSON"]
    sent_entity_df.append({"sentence": i , "entities": entiti_list#,"matches" :match_text_list}
                          })
#Cria um dataframe com as entidades localizadas.
sent_entity_df = pd.DataFrame(sent_entity_df)

In [19]:
sent_entity_df.head(20)

Unnamed: 0,sentence,entities
0,"(﻿CHAPTER, I., \n\n)",[﻿CHAPTER I.\n\n]
1,"(It, is, a, truth, universally, acknowledged, ...",[]
2,"(However, little, known, the, feelings, or, vi...",[]
3,"("", My, dear, Mr., Bennet, ,, "", said, his, la...",[Bennet]
4,"("", \n\n, Mr., Bennet, replied, that, he, had,...",[Bennet]
5,"("", But, it, is, ,, "", returned, she, ;, "", fo...",[Long]
6,"("", \n\n, Mr., Bennet, made, no, answer, ., \n\n)",[Bennet]
7,"("", Do, not, you, want, to, know, who, has, ta...",[]
8,"("", You, want, to, tell, me, ,, and, I, have, ...",[]
9,"("", \n\n, This, was, invitation, enough, ., \n\n)",[]


In [21]:
#Filtra apenas as entidades existentes na lista de personagens
def filter_entity(ent_list,character_df):
    return[ent for ent in ent_list
           if ent in list(character_df.Characters)
           or ent in list(character_df.Firstname)
           or ent in list(character_df.Nickname)
           or ent in list(character_df.Nickname2)]

In [22]:
#Filtro para buscar o nome real do personagem
def filter_entity_2(ent_list, character_df):
    if not isinstance(character_df, pd.DataFrame):
        raise ValueError("character_df deve ser um DataFrame do pandas")
    
    real_names = []

    for index, row in character_df.iterrows():
        if any(ent in row[['Characters', 'Firstname', 'Nickname', 'Nickname2']].values for ent in ent_list):
            real_names.append(row['real_name'])
    return real_names

In [23]:
filter_entity_2(["Lady Catherine","Thu","2","Sir Lewis de Bourgh"],data_characters)

['Lewis', 'Lady Catherine']

In [24]:
#Cria uma nova coluna e adiciona seu respectivo nome real
sent_entity_df['characters_entities'] = sent_entity_df['entities'].apply(lambda x: filter_entity_2(x,data_characters))
#Mantém no dataframe apenas linhas que possuam correspondencia na coluna "entities"
sent_entity_df_filtered = sent_entity_df[sent_entity_df['characters_entities'].map(len)> 0]

In [25]:
sent_entity_df_filtered.head(15)

Unnamed: 0,sentence,entities,characters_entities
3,"("", My, dear, Mr., Bennet, ,, "", said, his, la...",[Bennet],"[Mr. Bennet/Mrs.Bennet, Mr. Bennet/Mrs.Bennet]"
4,"("", \n\n, Mr., Bennet, replied, that, he, had,...",[Bennet],"[Mr. Bennet/Mrs.Bennet, Mr. Bennet/Mrs.Bennet]"
5,"("", But, it, is, ,, "", returned, she, ;, "", fo...",[Long],[Long]
6,"("", \n\n, Mr., Bennet, made, no, answer, ., \n\n)",[Bennet],"[Mr. Bennet/Mrs.Bennet, Mr. Bennet/Mrs.Bennet]"
10,"("", Why, ,, my, dear, ,, you, must, know, ,, M...","[Long, Morris, Michaelmas]","[Long, Morris]"
16,"("", \n\n, "", My, dear, Mr., Bennet, ,, "", repl...",[Bennet],"[Mr. Bennet/Mrs.Bennet, Mr. Bennet/Mrs.Bennet]"
22,"(You, and, the, girls, may, go, ,, or, you, ma...",[Bingley],[Mr. Bingley]
27,"("", \n\n, "", But, ,, my, dear, ,, you, must, i...",[Bingley],[Mr. Bingley]
31,"(Sir, William, and, Lady, Lucas, are, determin...","[William, Lady Lucas]","[Lady Lucas, William]"
34,"(I, dare, say, Mr., Bingley, will, be, very, \...","[Bingley, Lizzy]","[Elizabeth, Mr. Bingley]"


## criando relações entre entidades

In [21]:
window_size = 5
relationships = []

for i in range(sent_entity_df_filtered.index[-1]):
    end_i = min(i+5, sent_entity_df_filtered.index[-1])
    char_list = sum((sent_entity_df_filtered.loc[i: end_i].characters_entities), [])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [22]:
relation_df = pd.DataFrame(relationships)

In [23]:
relation_df.head(5)

Unnamed: 0,source,target
0,Mr. Bennet/Mrs.Bennet,Long
1,Mr. Bennet/Mrs.Bennet,Long
2,Long,Mr. Bennet/Mrs.Bennet
3,Mr. Bennet/Mrs.Bennet,Long
4,Long,Mr. Bennet/Mrs.Bennet


In [24]:
relation_df["value"] = 1
relationship_df = relation_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [25]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,Mr. Bennet/Mrs.Bennet,Long,14
1,Long,Mr. Bennet/Mrs.Bennet,19
2,Long,Morris,6
3,Mr. Bingley,Lady Lucas,23
4,Lady Lucas,William,24
5,William,Elizabeth,44
6,Elizabeth,Mr. Bingley,269
7,Mr. Bingley,Jane Bennet,98
8,Jane Bennet,Elizabeth,641
9,Elizabeth,Mr. Bennet/Mrs.Bennet,184


In [26]:
# Create a graph from a pandas dataframe
G = nx.from_pandas_edgelist(relationship_df, 
                            source = "source", 
                            target = "target", 
                            edge_attr = "value", 
                            create_using = nx.Graph()
                            )
communities = community_louvain.best_partition(G)
nx.set_node_attributes(G,communities,'group')

In [27]:
#plt.figure(figsize=(10,10))
#pos = nx.kamada_kawai_layout(G)
#nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos)
#plt.show()

In [28]:
from pyvis.network import Network
net = Network(notebook= True, width="1920px",height="1080px",bgcolor='#222222',font_color= "white",neighborhood_highlight=True)
net.barnes_hut(gravity=-800)
net.from_nx(G)
net.show('Pride_and_prejudice.html')

Pride_and_prejudice.html


In [29]:
from pyvis.network import Network
sources = relationship_df['source']
targets = relationship_df['target']
weights = relationship_df['value']
got_net = Network(notebook= True,height="750px", width="100%", bgcolor="#222222", font_color="white")
got_net.barnes_hut()

edge_data = zip(sources, targets, weights)

for e in edge_data:
                src = e[0]
                dst = e[1]
                w = e[2]

                got_net.add_node(src, src, title=src)
                got_net.add_node(dst, dst, title=dst)
                got_net.add_edge(src, dst, value=w)

neighbor_map = got_net.get_adj_list()

# add neighbor data to node hover data
for node in got_net.nodes:
                node["title"] += " Neighbors:<br>" + "<br>".join(neighbor_map[node["id"]])
                node["value"] = len(neighbor_map[node["id"]])

net.show_buttons(filter_=['physics'])
got_net.show("pride.html")

pride.html
