# This script contains the following:
## 1. Import libraries and data
## 2. Wrangling
## 3. Create NER object and split sentences
## 4. Filter entities
## 5. Build relationships

### 1. Import libraries and data

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re
from collections import Counter

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

print("Libraries imported and spaCy model loaded successfully!")


  import pkg_resources


Libraries imported and spaCy model loaded successfully!


In [3]:
# Check versions to ensure correct setup in the terminal
print("spaCy version:", spacy.__version__)
print("NetworkX version:", nx.__version__)
import matplotlib
print("Matplotlib version:", matplotlib.__version__)
import scipy
print("SciPy version:", scipy.__version__)


spaCy version: 3.4.3
NetworkX version: 2.8.8
Matplotlib version: 3.6.2
SciPy version: 1.9.3


In [4]:
# Load the 20th-century text file

with open("key_events_20th_century.txt", "r", encoding="utf-8") as f:
    text = f.read()


In [5]:
# Quick check
print("Characters in text:", len(text))
print("\n--- Preview ---\n")
print(text[:800])

Characters in text: 108157

--- Preview ---

The
20th century
changed the world in unprecedented ways. The
World Wars
sparked tension between countries and led to the creation of
atomic bombs
, the
Cold War
led to the
Space Race
and the creation of space-based rockets, and the
World Wide Web
was created. These advancements have played a significant role in citizens' lives and shaped the
21st century
into what it is today.
Historic events in the 20th century
World at the beginning of the century
Main article:
Edwardian era
The new beginning of the 20th century marked significant changes. The 1900s saw the decade herald a series of inventions, including the
automobile
,
airplane
and
radio broadcasting
. 1914 saw the completion of the
Panama Canal
.
The
Scramble for Africa
continued in the 1900s and resulted in wars and genocide across 


In [6]:
# Create path
path = r'/Users/ianfleming/20th-century'

In [8]:
# Import cleaned countries dataframe
df_countries = pd.read_csv(os.path.join(path, 'country_mentions.csv'), index_col=0)


In [9]:
# Check import
df_countries.head()

Unnamed: 0_level_0,Count
Country,Unnamed: 1_level_1
germany,38
japan,31
united states,22
france,17
poland,13


### 2. Wrangling

In [10]:
# Clean key_events_20th_century
text_clean = (
    text.replace('\n', ' ')           
         .replace('\r', ' ')         
         .strip()                   
)


In [11]:
# Quick check
print("Characters in cleaned text:", len(text_clean))
print("\nPreview:\n", text_clean[:500])

Characters in cleaned text: 108157

Preview:
 The 20th century changed the world in unprecedented ways. The World Wars sparked tension between countries and led to the creation of atomic bombs , the Cold War led to the Space Race and the creation of space-based rockets, and the World Wide Web was created. These advancements have played a significant role in citizens' lives and shaped the 21st century into what it is today. Historic events in the 20th century World at the beginning of the century Main article: Edwardian era The new beginning


The text didn’t need much cleaning. I just removed a few extra line breaks and spaces and made sure everything was in lowercase to match my country list. There weren’t any special characters or strange symbols. After that, I saved the cleaned version as a .txt file to use for the next steps

### 3. Create NER object and split sentences

In [12]:
# Create NER object
doc = nlp(text_clean)

In [13]:
# Split into sentences
sentences = list(doc.sents)

In [14]:
# Check

print("Sentences detected:", len(sentences))
print("\nPreview of first sentence:\n")
print(sentences[0].text[:300])

Sentences detected: 1659

Preview of first sentence:

The 20th century changed the world in unprecedented ways.


In [15]:
# Build a one row per sentence dataframe
df_sentences = []
for s in sentences:
    entity_list = [ent.text for ent in s.ents]  
    df_sentences.append({"sentence": s.text, "entities": entity_list})

In [16]:
# Create dataframe
df_sentences = pd.DataFrame(df_sentences)


In [17]:
# Check
df_sentences.head(5)

Unnamed: 0,sentence,entities
0,The 20th century changed the world in unpreced...,[The 20th century]
1,The World Wars sparked tension between countri...,"[the Cold War, the Space Race]"
2,These advancements have played a significant r...,"[the 21st century, today]"
3,Historic events in the 20th century World at t...,"[the 20th century, the beginning of the centur..."
4,Edwardian era The new beginning of the 20th ce...,"[Edwardian, the 20th century]"


### 4. Filter Entities

In [18]:
def norm(s: str) -> str:
    return re.sub(r"\s+", " ", s.strip().lower())

In [19]:
# Build lookup dict from countries
country_lookup = set(df_countries["Country"].map(norm))

KeyError: 'Country'

In [20]:
print(df_countries.columns)
print(df_countries.head())


Index(['Count'], dtype='object')
               Count
Country             
germany           38
japan             31
united states     22
france            17
poland            13


I can see that "Country" is not a column so I need to set it

In [21]:
# Reset index
df_countries = df_countries.reset_index().rename(columns={"index": "Country"})
print(df_countries.columns)
print(df_countries.head())

Index(['Country', 'Count'], dtype='object')
         Country  Count
0        germany     38
1          japan     31
2  united states     22
3         france     17
4         poland     13


In [22]:
# Build dict
country_lookup = set(df_countries["Country"].map(norm))


In [23]:
def keep_countries(ents):
    return [e for e in ents if norm(e) in country_lookup]

In [24]:
df_sentences["country_entities"] = df_sentences["entities"].apply(keep_countries)

In [25]:
df_sentences_filtered = df_sentences[df_sentences["country_entities"].map(len) > 0].reset_index(drop=True)


In [26]:
print("Sentences with ≥1 country:", len(df_sentences_filtered))

Sentences with ≥1 country: 144


In [27]:
# Check dataframe
df_sentences_filtered.head()

Unnamed: 0,sentence,entities,country_entities
0,After a period of diplomatic and military esca...,"[the July Crisis, the end of July 1914, the Br...","[France, Austria, Hungary]"
1,"In 1917, Russia ended hostile actions against ...","[1917, Russia, the Central Powers, Tsar]",[Russia]
2,The Bolsheviks negotiated the Treaty of Brest-...,"[the Treaty of Brest-Litovsk, Germany, Russia]","[Germany, Russia]"
3,"In the treaty, Bolshevik Russia ceded the Balt...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[Germany]
4,It also recognized the independence of Ukraine .,[Ukraine],[Ukraine]


In [28]:
# Normalise 
df_sentences_filtered["country_entities"] = (
    df_sentences_filtered["country_entities"]
    .apply(lambda xs: sorted({norm(x) for x in xs}))
)


In [29]:
df_sentences_filtered.head(5)

Unnamed: 0,sentence,entities,country_entities
0,After a period of diplomatic and military esca...,"[the July Crisis, the end of July 1914, the Br...","[austria, france, hungary]"
1,"In 1917, Russia ended hostile actions against ...","[1917, Russia, the Central Powers, Tsar]",[russia]
2,The Bolsheviks negotiated the Treaty of Brest-...,"[the Treaty of Brest-Litovsk, Germany, Russia]","[germany, russia]"
3,"In the treaty, Bolshevik Russia ceded the Balt...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[germany]
4,It also recognized the independence of Ukraine .,[Ukraine],[ukraine]


### 5. Build relationships

In [30]:
# Import library
from itertools import combinations

In [45]:
relationship = []
for _, r in df_sentences_filtered.iterrows():
    cs = r["country_entities"]
    for a, b in combinations(sorted(cs), 2):
        relationship.append((a, b))

df_relationship = (
    pd.DataFrame(relationship, columns=["source","target"])
      .value_counts()
      .reset_index(name="value")
      .sort_values("value", ascending=False)
      .reset_index(drop=True)
)

print("relationship:", len(df_edges))

relationship: 91


In [47]:
# Check 
df_relationship.head()

Unnamed: 0,source,target,value
0,france,germany,4
1,germany,poland,4
2,india,pakistan,3
3,china,japan,3
4,germany,italy,3


In [50]:
# Export 
df_relationship.to_csv(os.path.join(path, 'relationships.csv'))

In [51]:
print("relationships.csv saved successfully")

relationships.csv saved successfully
