# Importing Libraries and Data

In [1]:
import numpy as np
import pandas as pd
import scipy
import spacy
from spacy import displacy
import re

In [2]:
# Loading Spacy's english module
NER = spacy.load('en_core_web_sm')

## Loading 20th Century article

In [3]:
with open('20th_century_article.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')
data[:500]

"The th century changed the world in unprecedented ways. The World Wars sparked tension between countries and led to the creation of atomic bombs , the Cold War led to the Space Race and the creation of space-based rockets, and the World Wide Web was created. These advancements have played a significant role in citizens' lives and shaped the st century into what it is today. The new beginning of the th century marked significant changes. The s saw the decade herald a series of inventions, includi"

## Loading Country List

In [4]:
with open('20th_century_countries.txt', 'r', encoding='utf8', errors='ignore') as file:
    countryData = file.read()
countryData[:500]

"Abkhazia – Republic of Abkhazia\nAfghanistan – Islamic Emirate of Afghanistan\nAlbania – Republic of Albania\nAlgeria – People's Democratic Republic of Algeria\nAndorra – Principality of Andorra\nAngola – Republic of Angola\nAntigua and Barbuda\nArgentina – Argentine Republic [ i ]\nArmenia – Republic of Armenia\nAustralia – Commonwealth of Australia\nAustria – Republic of Austria\nAzerbaijan – Republic of Azerbaijan [ k ]\nBahamas, The – Commonwealth of The Bahamas [ 12 ]\nBahrain – Kingdom of Bahrain\nBangl"

The country data scraped requires the removal of long names that appear after the dash, comma and text within square brackets.

# Cleaning Countries Data

In [5]:
# Removing long names and text witing square brackets
cleaned_countries = re.sub(r' – [^\n]*|,[\s][^\n]*| \[.[^\n]+', '', countryData)
# cleaned_countries = '\n'.join(line.split(' – ')[0] for line in countryData.split('\n'))
cleaned_countries

'Abkhazia\nAfghanistan\nAlbania\nAlgeria\nAndorra\nAngola\nAntigua and Barbuda\nArgentina\nArmenia\nAustralia\nAustria\nAzerbaijan\nBahamas\nBahrain\nBangladesh\nBarbados\nBelarus\nBelgium\nBelize\nBenin\nBhutan\nBolivia\nBosnia and Herzegovina\nBotswana\nBrazil\nBrunei\nBulgaria\nBurkina Faso\nBurundi\nCambodia\nCameroon\nCanada\nCape Verde\nCentral African Republic\nChad\nChile\nChina\nColombia\nComoros\nCongo\nCongo\nCook Islands\nCosta Rica\nCroatia\nCuba\nCyprus\nCzech Republic\nDenmark\nDjibouti\nDominica\nDominican Republic\nEcuador\nEgypt\nEl Salvador\nEquatorial Guinea\nEritrea\nEstonia\nEswatini\nEthiopia\nFiji\nFinland\nFrance\nGabon\nGambia\nGeorgia\nGermany\nGhana\nGreece\nGrenada\nGuatemala\nGuinea\nGuinea-Bissau\nGuyana\nHaiti\nHonduras\nHungary\nIceland\nIndia\nIndonesia\nIran\nIraq\nIreland\nIsrael\nItaly\nIvory Coast\nJamaica\nJapan\nJordan\nKazakhstan\nKenya\nKiribati\nKosovo\nKuwait\nKyrgyzstan\nLaos\nLatvia\nLebanon\nLesotho\nLiberia\nLibya\nLiechtenstein\nLithuani

In [6]:
# Converting to a dataframe
df_countries = pd.DataFrame(cleaned_countries.strip().split('\n'), columns=['Country'])
df_countries.head()

Unnamed: 0,Country
0,Abkhazia
1,Afghanistan
2,Albania
3,Algeria
4,Andorra


# Analyzing Entities Identified by Spacy

In [7]:
# Visualizing entities identified by Spacy
article = NER(data)
displacy.render(article[:800], style='ent', jupyter=True)

Spacy is doing a good job at identifying geopolitical entities (GPE) but is identifying more than just countries. It would be best to create an entity list based on the country list previously scraped to define country relationships.

# Creating an Entity List

In [8]:
# Generating entity list for each sentence
df_sentences = []

for sent in article.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({'Sentence': sent, 'Entities': entity_list})

df_sentences = pd.DataFrame(df_sentences)
df_sentences.head(50)

Unnamed: 0,Sentence,Entities
0,"(The, th, century, changed, the, world, in, un...",[]
1,"(The, World, Wars, sparked, tension, between, ...","[The World Wars, the Cold War, the Space Race,..."
2,"(These, advancements, have, played, a, signifi...","[the st century, today]"
3,"(The, new, beginning, of, the, th, century, ma...",[]
4,"(The, s, saw, the, decade, herald, a, series, ...",[the decade]
5,"(saw, the, completion, of, the, Panama, Canal, .)",[the Panama Canal]
6,"(The, Scramble, for, Africa, continued, in, th...","[Scramble, Africa]"
7,"(The, atrocities, in, the, Congo, Free, State,...",[the Congo Free State]
8,"(From, to, ,, the, First, World, War, ,, and, ...",[the First World War]
9,"(The, First, World, War, (, or, simply, WWI, )...","[The First World War, WWI, The Great War, July..."


# Filtering Entities Based on Country List Scraped

In [9]:
# Funtion to remove irrelevant entities
def entity_filter(entities, countries):
    return [ent for ent in entities if ent in list(countries)]


# Testing Function
entity_filter(['France', 'Sweden', 'walk'], df_countries['Country'])

['France', 'Sweden']

In [10]:
# Applying function to entities in df_sentences
df_sentences['Article_Entities'] = df_sentences['Entities'].apply(lambda x: entity_filter(x, df_countries['Country']))
df_sentences['Article_Entities'].head(100)

0           []
1           []
2           []
3           []
4           []
        ...   
95          []
96    [France]
97    [France]
98     [Italy]
99    [Greece]
Name: Article_Entities, Length: 100, dtype: object

In [11]:
# Removing empty lists
df_sent_filtered = df_sentences[df_sentences['Article_Entities'].map(len) > 0]
df_sent_filtered.tail(10)

Unnamed: 0,Sentence,Entities,Article_Entities
1212,"("", Indian, Independence, Day, :, everything, ...","[Indian Independence Day, Partition between In...",[Pakistan]
1221,"(^, "", The, Philippines, ,, â€, “, |, US, Hous...","[Philippines, US House of Representatives: His...",[Philippines]
1253,"("", Colonial, Cartographies, ,, Postcolonial, ...","[Colonial Cartographies, Postcolonial Borders,...",[Afghanistan]
1294,"(The, Moldovans, :, Romania, ,, Russia, ,, and...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
1307,"(Now, ,, North, Korea, may, be, the, one, true...","[North Korea, one]",[North Korea]
1355,"("", Selling, ', Operation, Passage, to, Freedo...","[Thomas Dooley, the Religious Overtones of Ear...",[Vietnam]
1387,"("", Stuck, in, Endless, Preliminaries, :, Viet...","[Stuck in Endless Preliminaries, Vietnam, the ...",[Vietnam]
1670,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American Behavior, the Middle East, a Fi...",[Lebanon]
1677,"(The, Rise, of, China, and, India, :, A, New, ...","[India, New Asian]",[India]
1678,"(Singapore, :, World, Scientific, .)","[Singapore, World Scientific]",[Singapore]


# Defining Country Relationships

In [12]:
# Creating relationship list with function limiting simultaneous iteration to 5
relationships = []

for i in range(df_sent_filtered.index[-1]):
    end_i = min(i+5, df_sent_filtered.index[-1])
    entity_list = sum((df_sent_filtered.loc[i: end_i].Article_Entities), [])

    # Removing duplicated entities that are next to each other
    uniqueEntity = [entity_list[i] for i in range(len(entity_list)) if (i == 0) or entity_list[i] != entity_list[i-1]]

    if len(uniqueEntity) > 1:
        for idx, a in enumerate(uniqueEntity[:-1]):
            b = uniqueEntity[idx + 1]
            relationships.append({'source': a, 'target': b})

In [13]:
# Converting identified relationships to a data frame
relationship_df = pd.DataFrame(relationships)
relationship_df

Unnamed: 0,source,target
0,France,Austria
1,Austria,Hungary
2,France,Austria
3,Austria,Hungary
4,Hungary,Russia
...,...,...
687,India,Singapore
688,India,Singapore
689,India,Singapore
690,India,Singapore


In [14]:
# Sorting cases with a-b and b-a
df_relationships = pd.DataFrame(np.sort(relationship_df.values, axis=1), columns=relationship_df.columns)
df_relationships

Unnamed: 0,source,target
0,Austria,France
1,Austria,Hungary
2,Austria,France
3,Austria,Hungary
4,Hungary,Russia
...,...,...
687,India,Singapore
688,India,Singapore
689,India,Singapore
690,India,Singapore


In [15]:
# Summarizing country relationships
df_relationships['Interactions'] = 1
df_relationships = df_relationships.groupby(['source', 'target'], sort=False, as_index=False).sum()
df_relationships.head(10)

Unnamed: 0,source,target,Interactions
0,Austria,France,6
1,Austria,Hungary,6
2,Hungary,Russia,5
3,Germany,Russia,21
4,Germany,Ukraine,10
5,Germany,Italy,20
6,Austria,Germany,10
7,France,Spain,1
8,France,Poland,14
9,France,Germany,19


# Exporting Dataframes as Pickle Files

In [17]:
df_countries.to_pickle(r'D:\Data_Analysis\13-11-2025_Network_Visualization\03.Scripts\20th-century\country_list.pkl')
df_relationships.to_pickle(
    r'D:\Data_Analysis\13-11-2025_Network_Visualization\03.Scripts\20th-century\country_relationships.pkl')