In [29]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx

import matplotlib.pyplot as plt

In [30]:
# Download the small English spaCy model that spacy.load("en_core_web_sm") needs below.
# NOTE(review): `python3` on the shell PATH may not be the kernel's interpreter — confirm
# the model is installed into the environment this notebook actually runs in.
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [31]:
# Load the spaCy English language model.
# The loaded pipeline is bound to `NER` and is later called directly on the book text.
NER = spacy.load("en_core_web_sm")

## Load books

In [32]:
import os

# Get all book files in the data directory.
# - endswith() avoids matching names that merely contain ".txt" (e.g. "notes.txt.bak").
# - os.scandir() yields entries in arbitrary order, so the entries are sorted by
#   name to make positional access such as all_books[1] reproducible.
# - The context manager closes the scandir iterator as soon as the listing is built.
with os.scandir('data') as data_dir:
    all_books = sorted(
        (b for b in data_dir if b.name.endswith('.txt')),
        key=lambda entry: entry.name,
    )

In [33]:
all_books

[<DirEntry '1 The Last Wish.txt'>, <DirEntry 'test_cryptojacking.txt'>]

In [37]:
# Pick the book to analyse. Index 1 was 'test_cryptojacking.txt' in the listing
# shown above; the position depends on the order of `all_books`.
book = all_books[1]

# Read the whole file. The context manager closes the handle (a bare
# open(...).read() leaves that to the garbage collector), and the explicit
# encoding keeps non-ASCII characters such as curly quotes intact regardless of
# the platform's default encoding.
# NOTE(review): assumes the book files are UTF-8 encoded — confirm.
with open(book, encoding="utf-8") as book_file:
    book_text = book_file.read()
print(book_text)

The Problem
Amazon Web Services
Security researchers stumbled upon a long-lasting cryptojacking saga called “EleKtra-Leak.” In this cyber odyssey, mischievous culprits clone public GitHub repositories, making off with exposed AWS credentials. Specifically, key pairs are used to access the victim’s AWS accounts. The “clone and exploit” happens within minutes of an unaware victim accidentally posting a key pair. They then unleash a legion of Amazon Elastic Compute Cloud (EC2) instances to mine the cryptocurrency Monero. Researchers witnessed a staggering 474 miners controlled by what they charmingly call “potentially actor-controlled EC2 instances.”

AWS tries to help with its security practices, diligently alerting when credentials have been compromised. If AWS determines a key pair has been compromised (in this case, made public), they try to shut down the crypto criminals by applying the AWSCompromisedKeyQuarantine policy to the customer’s key pair. (Note: If you get an email from AWS

In [38]:
# Run the loaded spaCy pipeline over the full book text.
# Despite its name, `all_entities` holds the pipeline's result for the whole text;
# the cells below read its `.ents` attribute and slice it.
all_entities = NER(book_text)

In [39]:
# Entity labels that are dropped from the entity listing below.
entity_types_to_remove = ["CARDINAL", "TIME", "DATE", "ORDINAL", "QUANTITY", "PERCENT", "MONEY"]

# Collect the text of every entity whose label is not in the list above,
# keeping the order in which the entities appear in the document.
filtered_entities = []
for entity in all_entities.ents:
    if entity.label_ in entity_types_to_remove:
        continue
    filtered_entities.append(entity.text)

print(filtered_entities)

['Problem', 'Amazon Web Services\nSecurity', 'EleKtra-Leak', 'GitHub', 'Amazon Elastic Compute Cloud', 'Monero', 'AWS', 'AWS', 'Cyber', 'GitHub', 'AWS', 'API', 'CloudTrail', 'AWS', 'Monero', 'Google Drive', 'Google Drive', 'Monero', 'AWS', 'Cybersecurity', 'GitHub', 'API', 'EleKtra-Leak', 'Monero', 'GitHub', 'GitHub', 'Settings', 'Settings', 'Security', 'The Recommendation\nEnable', 'The Special Note\nNow', 'GitHub', 'GitHub', 'The Closing\nRemember', 'AI', 'AWS', 'the Next Level', 'Digital Cloud Training', 'Digital Cloud Training', 'Cloud Computing', 'Lambda', 'Boto3\nLinux File System', 'Cloud Computing', 'AMBERSQUID', 'AWS Amplify', 'Amazon SageMaker', 'Amazon Elastic Compute Cloud', 'Amazon', 'AWS', 'Sysdig', 'Sysdig', 'Linux', 'Docker Hub', 'GitHub', 'Indonesia', 'AWS', 'Docker', 'AWS', 'CodeCommit', 'SageMaker', 'Amazon Elastic Container Service', 'Amazon ECS', 'Docker', 'CodeCommit', 'Git', 'AWS', 'Git', 'AWS Amplify', 'Amazon ECS', 'Amazon', 'ECS', 'AWS Fargate', 'GB', 'RAM', '

In [41]:
# Visualize identified entities.
# Only the slice [0:2000] of the processed document is rendered, not the whole text.
displacy.render(all_entities[0:2000], style="ent", jupyter=True)

## Load character names

In [None]:
# Read characters
character_df = pd.read_csv("characters.csv")
# Preview the first 10 rows. DataFrame has no `gi` method (the previous call
# raised AttributeError); `head(10)` is the preview call.
print(character_df.head(10))

In [None]:
# Remove brackets and text within brackets, e.g. "Name (note)" -> "Name"
import re

# Raw string: "\(" inside a normal string literal is an invalid escape sequence
# and triggers a warning on recent Python versions. The pattern is unchanged.
BRACKETED_TEXT = re.compile(r"[\(].*?[\)]")

# strip() drops the whitespace left behind once the bracketed part is removed:
# "Name (note)" would otherwise become "Name " and never equal the entity text
# "Name". It also keeps a leading bracket from producing an empty first name.
character_df['character'] = character_df['character'].apply(lambda x: BRACKETED_TEXT.sub("", x).strip())
# First name = everything before the first space of the cleaned name.
character_df['character_firstname'] = character_df['character'].apply(lambda x: x.split(' ', 1)[0])

In [None]:
# Remove the row limit so the whole character table is displayed.
# Note: this changes the global pandas display option for all later cells
# until the option is reset.
pd.set_option('display.max_rows', None)
character_df

## Get named entity list per sentence

In [None]:
# Loop through sentences and store the named-entity list for each sentence.
# `book_doc` is never defined in this notebook, so the previous loop raised
# NameError on a fresh kernel; the processed document is `all_entities`
# (the result of NER(book_text)).
sentence_rows = [
    {"sentence": sent, "entities": [ent.text for ent in sent.ents]}
    for sent in all_entities.sents
]

# One row per sentence. The rows are collected under their own name so that
# `sent_entity_df` is only ever a DataFrame; the explicit columns keep the
# schema stable even when the document has no sentences.
sent_entity_df = pd.DataFrame(sentence_rows, columns=["sentence", "entities"])

In [None]:
# Flat list of every entity text in the document, in sentence order.
# `book_doc` is not defined anywhere in this notebook (NameError on a fresh
# kernel); the processed document is `all_entities` (the result of NER(book_text)).
glob_entity_list = [ent.text for sent in all_entities.sents for ent in sent.ents]

print(glob_entity_list)


In [None]:
sent_entity_df

In [None]:
# Function to filter out non-character entities
def filter_entity(ent_list, character_df):
    """Keep only the entities that name a known character.

    Parameters
    ----------
    ent_list : iterable of str
        Entity texts to check.
    character_df : pandas.DataFrame
        Must provide the columns ``character`` and ``character_firstname``.

    Returns
    -------
    list of str
        The entries of ``ent_list`` that appear in either column, in their
        original order and with duplicates preserved.
    """
    # Build the lookup once per call. The previous version rebuilt two Python
    # lists from the columns for every single entity that was tested.
    known_names = set(character_df.character) | set(character_df.character_firstname)
    return [ent for ent in ent_list if ent in known_names]

In [None]:
filter_entity(["Geralt", "Thu", "2"], character_df)

In [None]:
# Tag each sentence with the subset of its entities that are known characters.
sent_entity_df['character_entities'] = sent_entity_df['entities'].apply(lambda x: filter_entity(x, character_df))

# Filter out sentences that don't have any character entities.
# .copy() makes the result an independent frame, so assigning columns to it in
# later cells does not trigger pandas' SettingWithCopyWarning.
sent_entity_df_filtered = sent_entity_df[sent_entity_df['character_entities'].map(len) > 0].copy()
sent_entity_df_filtered.head(10)

In [None]:
# Take only first name of characters.
# `sent_entity_df_filtered` was produced by boolean-mask filtering, so writing a
# column into it in place triggers SettingWithCopyWarning. assign() builds a new
# frame instead, and the name is rebound to it.
sent_entity_df_filtered = sent_entity_df_filtered.assign(
    character_entities=sent_entity_df_filtered['character_entities'].apply(
        lambda names: [name.split()[0] for name in names]
    )
)

In [None]:
# Reset every pandas option matching '^display.' (this undoes the earlier
# display.max_rows change) before showing the filtered frame.
pd.reset_option('^display.', silent=True)
sent_entity_df_filtered

## Create relationships

In [None]:
# How many sentence positions past the start of a window are still included.
window_size = 5
relationships = []

# Guard: index[-1] raises IndexError on an empty frame; with no character
# sentences there are simply no relationships.
if not sent_entity_df_filtered.empty:
    last_index = sent_entity_df_filtered.index[-1]

    for i in range(last_index):
        # Use window_size rather than a second hard-coded 5, so changing the
        # setting above actually changes the window.
        end_i = min(i + window_size, last_index)

        # .loc slices by index label and includes both ends of the range.
        window = sent_entity_df_filtered.loc[i:end_i].character_entities
        # Flatten the per-sentence lists (sum(lists, []) copies the
        # accumulated list on every addition).
        char_list = [char for chars in window for char in chars]

        # Remove duplicated characters that are next to each other
        char_unique = [char for pos, char in enumerate(char_list)
                       if pos == 0 or char != char_list[pos - 1]]

        # Link every character to the one that follows it in the window.
        for a, b in zip(char_unique, char_unique[1:]):
            relationships.append({"source": a, "target": b})

In [None]:
# Explicit columns keep the "source"/"target" schema even when no relationships
# were found; an empty list would otherwise give a frame without those columns
# and the later groupby(["source", "target"]) would raise KeyError.
relationship_df = pd.DataFrame(relationships, columns=["source", "target"])

In [None]:
# Remove the row limit so the whole relationship table is displayed.
# Note: this changes the global pandas display option for all later cells.
pd.set_option('display.max_rows', None)
relationship_df

In [None]:
# Treat a->b and b->a as the same pair: sort the values within each row so both
# directions end up with identical (source, target) entries.
sorted_pairs = np.sort(relationship_df.to_numpy(), axis=1)
relationship_df = pd.DataFrame(sorted_pairs, columns=relationship_df.columns)
relationship_df

In [None]:
# Give every co-occurrence a weight of 1, then add the weights up per
# (source, target) pair; sort=False keeps the pairs in first-seen order.
relationship_df = (
    relationship_df
    .assign(value=1)
    .groupby(["source", "target"], sort=False, as_index=False)
    .sum()
)

In [None]:
relationship_df.head(10)

## Graph analysis and visualization

In [None]:
# Build an undirected graph from the relationship table: one edge per row,
# with the row's "value" stored as an edge attribute.
G = nx.from_pandas_edgelist(
    relationship_df,
    source="source",
    target="target",
    edge_attr="value",
    create_using=nx.Graph(),
)

#### Graph visualization - Networkx

In [None]:
# Draw the graph with node labels, positioning nodes with the Kamada-Kawai layout.
plt.figure(figsize=(10,10))
pos = nx.kamada_kawai_layout(G)
# NOTE(review): edge_cmap is passed without an edge_color argument — confirm it
# has any visible effect on the drawing.
nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

#### Graph visualization - Pyvis

In [None]:
# Interactive rendering of the same graph with pyvis, written to witcher.html.
from pyvis.network import Network
net = Network(notebook = True, width="1000px", height="700px", bgcolor='#222222', font_color='white')

# Degree (number of connections) of every node, keyed by node.
node_degree = dict(G.degree)

# Store each node's degree on the graph as the node attribute 'size'.
# NOTE(review): presumably pyvis reads 'size' to scale the drawn nodes — confirm.
nx.set_node_attributes(G, node_degree, 'size')

net.from_nx(G)
net.show("witcher.html")

### The most important characters in The Witcher

In [None]:
# Degree centrality of every node in G.
# The result is keyed by node; the next cell turns it into a DataFrame with
# orient='index', i.e. one row per node.
degree_dict = nx.degree_centrality(G)
degree_dict

In [None]:
# One row per node, with its degree centrality in the 'centrality' column.
degree_df = pd.DataFrame.from_dict(degree_dict, orient='index', columns=['centrality'])
# Plot top 10 nodes. The previous slice [0:9] kept only 9 rows; head(10) keeps 10.
degree_df.sort_values('centrality', ascending=False).head(10).plot(kind="bar")

In [None]:
# Betweenness centrality
betweenness_dict = nx.betweenness_centrality(G)
# One row per node, with its betweenness centrality in the 'centrality' column.
betweenness_df = pd.DataFrame.from_dict(betweenness_dict, orient='index', columns=['centrality'])
# Plot top 10 nodes. The previous slice [0:9] kept only 9 rows; head(10) keeps 10.
betweenness_df.sort_values('centrality', ascending=False).head(10).plot(kind="bar")

In [None]:
# Closeness centrality
closeness_dict = nx.closeness_centrality(G)
# One row per node, with its closeness centrality in the 'centrality' column.
closeness_df = pd.DataFrame.from_dict(closeness_dict, orient='index', columns=['centrality'])
# Plot top 10 nodes. The previous slice [0:9] kept only 9 rows; head(10) keeps 10.
closeness_df.sort_values('centrality', ascending=False).head(10).plot(kind="bar")

In [None]:
# Save centrality measures: attach each of the three centrality scores to the
# nodes of G as a separately named node attribute.
nx.set_node_attributes(G, degree_dict, 'degree_centrality')
nx.set_node_attributes(G, betweenness_dict, 'betweenness_centrality')
nx.set_node_attributes(G, closeness_dict, 'closeness_centrality')

### Community detection

In [None]:
import community as community_louvain

In [None]:
# Louvain community detection is randomised; a fixed random_state makes the
# partition (and everything derived from it) reproducible across runs.
# NOTE(review): the graph stores edge weights under "value", while
# best_partition reads the "weight" attribute by default — confirm whether
# weight="value" is intended here.
communities = community_louvain.best_partition(G, random_state=42)

In [None]:
communities

In [None]:
# Store each node's community id on the graph as the node attribute 'group'.
# NOTE(review): presumably pyvis colours nodes by 'group' — confirm.
nx.set_node_attributes(G, communities, 'group')

In [None]:
# Render the graph, now carrying the 'group' node attribute, with pyvis and
# write it to witcher_communities.html.
com_net = Network(notebook = True, width="1000px", height="700px", bgcolor='#222222', font_color='white')
com_net.from_nx(G)
com_net.show("witcher_communities.html")

### Evolution of characters' importance

In [None]:
from lib.utils.functions import *

In [None]:
# Initialize empty list for graphs from books
books_graph = []

# Collect the book files, sorted by file name so the books are processed in a
# stable order. endswith() avoids matching names that merely contain ".txt"
# (e.g. "notes.txt.bak"); the context manager closes the scandir iterator.
with os.scandir('data') as data_dir:
    all_books = sorted(
        (b for b in data_dir if b.name.endswith('.txt')),
        key=lambda x: x.name,
    )

# Load spacy English language model
nlp = spacy.load("en_core_web_sm")

In [None]:
# Loop through book list and create one graph per book
for book in all_books:
    # NOTE(review): `ner` comes from lib.utils.functions; presumably it returns
    # the processed document for the file rather than raw text — confirm.
    book_text = ner(book)

    # Get list of entities per sentences
    sent_entity_df = get_ne_list_per_sentence(book_text)

    # Select only character entities
    sent_entity_df['character_entities'] = sent_entity_df['entities'].apply(lambda x: filter_entity(x, character_df))

    # Filter out sentences that don't have any character entities.
    # .copy() makes the filtered frame independent, so the column assignment
    # below does not trigger pandas' SettingWithCopyWarning.
    sent_entity_df_filtered = sent_entity_df[sent_entity_df['character_entities'].map(len) > 0].copy()

    # Take only first name of characters
    sent_entity_df_filtered['character_entities'] = sent_entity_df_filtered['character_entities'].apply(
        lambda x: [item.split()[0] for item in x]
    )

    # Create relationship df
    relationship_df = create_relationships(df = sent_entity_df_filtered, window_size = 5)

    # Create a graph from a pandas dataframe
    G = nx.from_pandas_edgelist(relationship_df,
                                source = "source",
                                target = "target",
                                edge_attr = "value",
                                create_using = nx.Graph())

    # Keep the graphs in the same order as the books
    books_graph.append(G)

In [None]:
# Degree centrality of every node, computed separately for each book graph
evol = list(map(nx.degree_centrality, books_graph))

# One record per book graph; the node names become the columns
degree_evol_df = pd.DataFrame.from_records(evol)

# Plot how the degree centrality of the 5 main characters evolves across the books
degree_evol_df.loc[:, ["Geralt", "Ciri", "Yennefer", "Dandelion", "Vesemir"]].plot()