## Install Libraries and Data

In [337]:
# Install Libraries
import pandas as pd
import numpy as np 
import spacy 
from spacy import displacy
NER = spacy.load('en_core_web_sm')
import matplotlib.pyplot as plt
import scipy 
import re

In [404]:
# Import Files
with open('../Data/key_events_20th.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '') 

In [406]:
countries = pd.read_csv(r'../Data/countries.csv', index_col=False)

### Create NER Object

In [408]:
# Create NER obj
country = NER(data)

In [342]:
# Visualise sample entity
displacy.render(country[200:400], style='ent', jupyter=True)

### Split Sentence Entities

In [409]:
# Create list with sentences
sentence = [] # Create empty list

# Loop sentences
for sent in country.sents:
    entity_list = [ent.text for ent in sent.ents]
    sentence.append({'sentence':sent, 'entity':entity_list})

In [411]:
# Create dataframe
sentence = pd.DataFrame(sentence)

In [349]:
# Check dataframe
sentence.head(10)

Unnamed: 0,sentence,entity
0,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century - WikipediaJump, Contribute,..."
1,"(depression1.2.2The, rise, of, dictatorship1.3...","[World War II, Pacific1.3.7.1Background1.3.8Ja..."
2,"(begins1.4The, post, -, war, world1.4.1The, en...","[Cold War, 1947–1991)1.4.3War]"
3,"(race1.4.5The, end, of, the, Cold, War1.4.6Inf...","[the Cold War1.4.6Information, 20th, pageGet, ..."
4,"(The, World, Wars, sparked, tension, between, ...","[the Cold War, the Space Race]"
5,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
6,"(Historic, events, in, the, 20th, century[edit...",[the 20th]
7,"(Edwardian, eraThe, new, beginning, of, the, 2...","[Edwardian, the 20th century]"
8,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"
9,"(From, 1914, to, 1918, ,, the, First, World, W...","[1914 to 1918, the First World War]"


In [414]:
# Filter using country list
def filter_entity(ent_list, countries):
    return[ent for ent in ent_list
          if ent in list(countries['country_name'])]

In [416]:
# Apply filter
sentence['c_entities'] = sentence['entity'].apply(lambda x: filter_entity(x, countries))

In [418]:
# Filter out sentences without countries
sentence_filter = sentence[sentence['c_entities'].map(len)>0]

In [369]:
sentence_filter.head(10)

Unnamed: 0,sentence,entity,c_entities
14,"(Interwoven, alliances, ,, an, increasing, arm...","[Europe, Allies, The Triple Entente, British E...","[France, Russia]"
15,"(Germany, ,, Austria, -, Hungary, ,, Bulgaria,...","[Germany, Austria, Hungary, Bulgaria, the Otto...","[Germany, Austria, Hungary, Bulgaria, Russia]"
16,"(The, Bolsheviks, negotiated, the, Treaty, of,...","[the Treaty of Brest-Litovsk, Germany, Russia]","[Germany, Russia]"
17,"(In, the, treaty, ,, Bolshevik, Russia, ceded,...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[Germany]
18,"(It, also, recognized, the, independence, of, ...","[Germany, Allied, American]",[Germany]
24,"(Many, Germans, felt, these, reparations, were...","[Germans, Germany, Allied, Kaiser, Europe]",[Germany]
41,"(Germany, ,, 1933Fascism, first, appeared, in,...","[Germany, first, Italy, Benito Mussolini]","[Germany, Italy]"
42,"(The, ideology, was, supported, by, a, large, ...","[Adolf Hitler, Germany, 1933, Nazism, Germany,...","[Germany, Germany]"
43,"(The, Nazi, Party, in, Germany, was, dedicated...","[The Nazi Party, Germany, German, German, Cent...","[Germany, Germany]"
44,"(Antisemitism, during, the, Great, Depression,...","[the Great Depression, Jews, Austria, Anschlus...","[Austria, Austria, Germany]"


### Create Relationships

In [420]:
# Define window size and initialize relationships list
window_size = 5  # This defines how many sentences will be looked at simultaneously
relationships = []  # Create an empty list to store relationships

for i in range(sentence_filter.index[-1]):
    # Calculate the end index based on the window size
    end_i = min(i + window_size, sentence_filter.index[-1])
    
    # Create a combined list of character entities within the window range
    char_list = sum((sentence_filter.loc[i:end_i, 'c_entities']), [])

    # Remove duplicated characters that are next to each other
    char_unique = [char_list[j] for j in range(len(char_list)) if (j == 0) or char_list[j] != char_list[j - 1]]

    # Append relationships if there is more than one unique character
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [422]:
# Create dataframe
relationships = pd.DataFrame(relationships)

In [424]:
# Sort the cases with a- >b and b- >a
relationships = pd.DataFrame(np.sort(relationships.values, axis = 1), columns = relationships.columns)

In [426]:
# Check relationships dataframe
relationships.head(5)

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Germany,Russia
3,Austria,Germany
4,Austria,Hungary


In [428]:
# Add value col.
relationships["value"] = 1
relationships = relationships.groupby(["source","target"], sort=False, as_index=False).sum()

In [430]:
# Check relationships database with new col. 
relationships.head(5)

Unnamed: 0,source,target,value
0,France,Russia,12
1,Germany,Russia,26
2,Austria,Germany,17
3,Austria,Hungary,6
4,Bulgaria,Hungary,6


## Export File

In [433]:
# Export dataframe
relationships.to_pickle(r'../Data/relationships.pkl')