In [1]:
#Importing Libraries
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import matplotlib.pyplot as plt
import re
import os

In [2]:
NER = spacy.load("en_core_web_sm")

In [6]:
#Loading text file 
with open('20th_century_events.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', ' ')

In [7]:
book = NER(data)

In [8]:
#Step 4. Evaluatinf if the text needs wrangling
# Visualize identified entities

displacy.render(book[273:20000], style = "ent", jupyter = True)

In [None]:
# Step 4. Wrangling : Remove unwanted characters
data = re.sub(r'[^A-Za-z0-9\s.,]', '', data)

In [10]:
#Step 5. Apply Named Entity  Recognition (NER)
doc = NER(data)

In [11]:
# Visualizing a small portion 
displacy.render(doc[:5000], style="ent", jupyter=True)

In [12]:
#Step 6: Split sentences and extract entities
df_sentences = []

for sent in doc.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent.text, "entities": entity_list})

df_sentences = pd.DataFrame(df_sentences)
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,Key events of the 20th century Wikipedia Jump...,"[the 20th century, Wikipedia Jump, Navigation ..."
1,The rise of dictatorship 1.4 Global war World ...,"[1.4, Global war World War II, 19391945 1.4.1]"
2,The war in Europe 1.4.2 Blitzkrieg 1.4.3 Opera...,"[Europe, 1.4.2, Blitzkrieg 1.4.3 Operation Bar..."
3,Turning tides 1.4.5 Operation Overlord 1.4.6,[Operation Overlord 1.4.6]
4,Final days 1.4.7 The war in the Pacific 1.4.7....,"[days 1.4.7, the Pacific 1.4.7.1 Background, 1..."
5,Allied offensive 1.4.10 Final days 1.4.11,"[Allied, days 1.4.11]"
6,The Holocaust 1.4.12 The Nuclear Age begins 1.5,"[The Nuclear Age, 1.5]"
7,The postwar world 1.5.1,[1.5.1]
8,The end of empires decolonization 1.5.2,[1.5.2]
9,The Cold War 19471991 1.5.3 War by proxy 1.5.4...,[The Cold War]


In [14]:
# Step 7: Filter for your countries list

In [None]:
# Importing csv countries file.
# after fixing importing problems or csv file reading.
# Read the semicolon-delimited file and skip the first row if it contains duplicate headers
df_countries_lookup = pd.read_csv("countries_list_20th_century_1.5.csv", sep=";", skiprows=1, names=["id","country_name"])

# Clean up whitespace
df_countries_lookup["country_name"] = df_countries_lookup["country_name"].str.strip()

# Preview
print(df_countries_lookup.head())

   id country_name
0   1  Afghanistan
1   2      Albania
2   3      Algeria
3   4      Andorra
4   5       Angola


In [24]:
# Filtering countries by the list
# Define filter function
def filter_entity(ent_list, countries_df):
    return [ent for ent in ent_list if ent in list(countries_df['country_name'])]

# Apply filter to your sentences DataFrame
df_sentences['country_entities'] = df_sentences['entities'].apply(
    lambda x: filter_entity(x, df_countries_lookup)
)

# Remove sentences that have no country entities
df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

# Preview
df_sentences_filtered.head()

Unnamed: 0,sentence,entities,country_entities
28,After a period of diplomatic and military esca...,"[the July Crisis, the end of July 1914, the Br...",[France]
29,"1 2 In 1917, Russia ended hostile actions a...","[1, 1917, Russia, the Central Powers, Tsar]",[Russia]
30,The Bolsheviks negotiated the Treaty of BrestL...,"[Bolsheviks, Treaty, BrestLitovsk, Germany, Ru...","[Germany, Russia]"
31,"In the treaty, Bolshevik Russia ceded the Balt...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[Germany]
32,It also recognized the independence of Ukraine...,[Ukraine],[Ukraine]


In [20]:
countries_df.head()
print(countries_df.columns)

Index(['1', ' Afghanistan '], dtype='object')


In [25]:
#Step 8. Creating a relationship dataframe
relationships = []
window_size = 5  # number of consecutive sentences to look at

for i in range(len(df_sentences_filtered)):
    end_i = min(i + window_size, len(df_sentences_filtered))
    country_list = sum(df_sentences_filtered.loc[i:end_i, 'country_entities'].tolist(), [])
    
    # Remove consecutive duplicates
    country_unique = [country_list[i] for i in range(len(country_list)) if (i==0) or country_list[i] != country_list[i-1]]
    
    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})

relationship_df = pd.DataFrame(relationships)
relationship_df.head()

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Russia,Germany
3,Germany,Russia
4,France,Russia


In [26]:
# Summarizing interactions

#  Sort each pair alphabetically to avoid duplicates (A->B vs B->A)
relationship_df_sorted = pd.DataFrame(np.sort(relationship_df.values, axis=1), columns=relationship_df.columns)

# Add a counter for aggregation
relationship_df_sorted["value"] = 1

# Group by source-target pairs and sum occurrences
relationship_summary = relationship_df_sorted.groupby(["source", "target"], as_index=False).sum()

relationship_summary.head(10)

Unnamed: 0,source,target,value
0,Austria,Germany,10
1,Belgium,France,5
2,Belgium,Luxembourg,3
3,Denmark,Norway,12
4,Denmark,Poland,5
5,Denmark,Sweden,5
6,Estonia,Germany,5
7,Estonia,Latvia,6
8,Finland,Germany,4
9,Finland,Lithuania,5


In [None]:
# Exporting Realtionships Data Frame
# Save the summarized relationships to CSV
relationship_summary.to_csv("20th_century_country_relationships.csv", index=False)

# Optional: Save the filtered sentences as well
df_sentences_filtered.to_csv("20th_century_sentences_filtered.csv", index=False)