# 1. Import libraries

In [5]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [6]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

# 2. Load "Key events" page

In [9]:
# Load the book

with open('Key_events.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

In [10]:
events = NER(data)

# 3. Get named entity list per sentence

In [16]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in events.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [17]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century - WikipediaJump, Navigation\..."
1,"(Overlord1.4.6Final, days1.4.7The, war, in, th...","[Holocaust1.4.12The Nuclear Age begins1.5The, ..."
2,"(The, World, Wars, sparked, tension, between, ...","[The World Wars, the Cold War, the Space Race,..."
3,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
4,"(Historic, events, in, the, 20th, century[edit...","[the 20th, Edwardian, the 20th century]"
5,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"
6,"(1914, saw, the, completion, of, the, Panama, ...","[1914, the Panama Canal]"
7,"(The, Scramble, for, Africa, continued, in, th...","[Scramble, Africa, the 1900s]"
8,"(The, atrocities, in, the, Congo, Free, State,...",[the Congo Free State]
9,"(From, 1914, to, 1918, ,, the, First, World, W...","[1914 to 1918, the First World War]"


# 4. Load country names

In [20]:
# Import characters

character_df = pd.read_csv("Countries.csv", index_col = 0)

In [21]:
character_df.head()

Unnamed: 0,Country
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola


# 5. Filtering entities from the page

In [22]:
# Function to filter out entities not of interest

def filter_entity(ent_list, character_df):
    return [ent for ent in ent_list 
            if ent in list(character_df['Country'])]

In [23]:
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, character_df))

In [24]:
df_sentences['country_entities'].head(20)

0                    []
1                    []
2                    []
3                    []
4                    []
5                    []
6                    []
7                    []
8                    []
9                    []
10                   []
11                   []
12                   []
13     [France, Russia]
14    [Germany, Russia]
15            [Germany]
16            [Germany]
17                   []
18                   []
19                   []
Name: country_entities, dtype: object

In [25]:
# Filter out sentences that don't have any character entities

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [26]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
921,"("", Indian, Independence, Day, :, everything, ...","[Indian Independence Day, Partition between In...",[Pakistan]
928,"("", The, Philippines, ,, 1898–1946, |, US, Hou...","[Philippines, 1898–1946, US House of Represent...",[Philippines]
953,"("", Colonial, Cartographies, ,, Postcolonial, ...","[Colonial Cartographies, Postcolonial Borders,...",[Afghanistan]
983,"(The, Moldovans, :, Romania, ,, Russia, ,, and...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
993,"(Now, ,, North, Korea, may, be, the, one, true...","[North Korea, one]",[North Korea]
1027,"("", Selling, ', Operation, Passage, to, Freedo...","[Thomas Dooley, the Religious Overtones of Ear...",[Vietnam]
1050,"("", Stuck, in, Endless, Preliminaries, :, Viet...","[Stuck in Endless Preliminaries, Vietnam, the ...",[Vietnam]
1262,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American Behavior, the Middle East, a Fi...",[Lebanon]
1266,"(The, Rise, of, China, and, India, :, A, New, ...","[India, New Asian]",[India]
1267,"(Singapore, :, World, Scientific, ., doi:10.11...","[Singapore, World Scientific]",[Singapore]


# 6. Creating relationships

In [28]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [29]:
relationship_df = pd.DataFrame(relationships)

In [30]:
relationship_df

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Russia,Germany
3,Germany,Russia
4,France,Russia
...,...,...
675,India,Singapore
676,India,Singapore
677,India,Singapore
678,India,Singapore


In [31]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Germany,Russia
3,Germany,Russia
4,France,Russia


In [32]:
# Summarize the interactions

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [33]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,France,Russia,12
1,Germany,Russia,21
2,Germany,Italy,26
3,Austria,Germany,11
4,Germany,Spain,3
5,France,Spain,1
6,France,Poland,15
7,France,Germany,19
8,Germany,Poland,24
9,Poland,Soviet Union,8


# 7. Exporting the file

In [34]:
relationship_df.to_csv('country_relationship.csv')