# 1.6 Intro to NLP and Network Analysis - 20th Century Data

In [2]:
#Import Libraries
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [4]:
# Download English module

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------  12.6/12.8 MB 65.9 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 65.9 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 65.9 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 65.9 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 13.6 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

## Load 20th Century Data

#### Data Wrangling - Cleaning

In [17]:
# Load the text file
with open(r'C:\Users\north\OneDrive\Dokumente\Career Foundry\Data Visualization 1\20th-Century\Data\key_events_20th_century.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [37]:
# Check for Special Characters in the Text
special_characters = re.findall(r'[^\x00-\x7F]', text)  # Non-ASCII characters
unique_special_characters = set(special_characters)
print("Non-standard characters found:", unique_special_characters)

Non-standard characters found: {'／', 'í', 'ö', '県', 'ã', '–', '縄', '’', '°', 'é', '沖', '\xa0', '—'}


In [58]:
def clean_non_standard_chars(text):
    # Define replacements for the identified non-standard characters
    replacements = {
        '／': '/',     # Full-width slash to regular slash
        'í': 'i',     # Accented "i" to plain "i"
        'ö': 'o',     # Accented "o" to "oe"
        '県': '',      # Remove if irrelevant (Kanji character for "prefecture")
        'ã': 'a',     # Accented "a" to "ae"
        '–': '-',     # En-dash to hyphen
        '—': '-',     # Em-dash to hyphen
        '縄': '',      # Remove if irrelevant (Kanji character for "rope")
        '’': "'",     # Curly apostrophe to straight apostrophe
        '°': ' degrees',  # Degree symbol
        'é': 'e',     # Accented "e" to plain "e"
        '沖': '',      # Remove if irrelevant (Kanji character for "Okinawa")
        '\xa0': ' ',  # Non-breaking space to regular space
    }
    
    # Apply replacements
    for key, value in replacements.items():
        text = text.replace(key, value)

   # Remove non-Latin characters (if irrelevant)
    text = re.sub(r'[^\x00-\x7F]+', '', cleaned_text)  # Remove non-ASCII characters

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Ensure spacing around punctuation
    text = re.sub(r'\s([.,!?;:])', r'\1', text)  # Remove space before punctuation
    text = re.sub(r'([.,!?;:])([^\s])', r'\1 \2', text)  # Ensure one space after punctuation
    
    # Return the cleaned text
    return text

In [50]:
# Apply cleaning to the loaded text
cleaned_text = clean_non_standard_chars(text)

In [60]:
# Save the cleaned text to a new file
output_file_path = r'C:\Users\north\OneDrive\Dokumente\Career Foundry\Data Visualization 1\20th-Century\Data\cleaned_key_events_20th_century.txt'
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

In [64]:
# Load the cleaned text file
with open(r'C:\Users\north\OneDrive\Dokumente\Career Foundry\Data Visualization 1\20th-Century\Data\cleaned_key_events_20th_century.txt', 'r', encoding='utf-8') as file:
    cleaned_text = file.read()


In [90]:
from spacy import displacy
from spacy.tokens import Span

# Process the text with SpaCy
doc = NER(cleaned_text)

# Filter for GPE entities
gpe_entities = [ent for ent in doc.ents if ent.label_ == "GPE"]

# Create a new Doc object with only GPE entities
doc_gpe = doc.copy()  # Create a copy to preserve the original Doc
doc_gpe.ents = gpe_entities  # Replace the entities with only GPE entities

# Visualize GPE entities
displacy.render(doc_gpe, style="ent", jupyter=True)

In [92]:
# Load the country list from CSV
countries = pd.read_csv(r'C:\Users\north\OneDrive\Dokumente\Career Foundry\Data Visualization 1\20th-Century\Data\countries_list_20th_century_1.5.csv')
country_list = countries['country_name'].str.strip().str.lower().tolist()

In [144]:
# Step 1: Normalize country list
normalized_country_list = [country.strip().lower() for country in country_list]

# Step 2: Extract and normalize GPE entities
normalized_gpe_entities = [
    normalization_mapping.get(gpe.strip(), gpe.strip()).lower() for gpe in gpe_entities
]

# Step 3: Filter normalized GPE entities to include only valid countries
filtered_countries = [
    gpe for gpe in normalized_gpe_entities if gpe in normalized_country_list
]

# Step 4: Identify non-country GPE entities
non_country_gpe = [
    gpe for gpe in normalized_gpe_entities if gpe not in normalized_country_list
]

# Step 5: Print results
print("Filtered Countries:", set(filtered_countries))
print("Non-country GPE Entities:", set(non_country_gpe))


Filtered Countries: {'belarus', 'canada', 'india', 'norway', 'spain', 'iran', 'mongolia', 'latvia', 'luxembourg', 'albania', 'cuba', 'estonia', 'poland', 'solomon islands', 'iraq', 'ghana', 'libya', 'morocco', 'romania', 'kenya', 'ukraine', 'japan', 'angola', 'finland', 'greece', 'hungary', 'sweden', 'bulgaria', 'philippines', 'germany', 'afghanistan', 'russia', 'papua new guinea', 'australia', 'lebanon', 'france', 'cape verde', 'singapore', 'cambodia', 'vietnam', 'italy', 'bangladesh', 'united kingdom', 'united states', 'pakistan', 'belgium', 'laos', 'israel', 'lithuania', 'netherlands', 'austria', 'thailand', 'south africa', 'egypt', "china, people's republic of", 'denmark', 'algeria'}
Non-country GPE Entities: {'chongqing', 'signed', 'koreas', 'estrada', 'new york: psychology press', 'the soviet union', 'stalingrad', 'ghooi', 'poole', 'rome', 'sofia', 'tokyo', 'san francisco', 'crimea', 'kellaway', 'prague', 'new guinea', 'british empire', 'kagaayi', 'south korea', 'saigon', 'nazi g

In [146]:
for gpe in normalized_gpe_entities:
    if gpe not in normalized_country_list:
        print(f"Mismatch: {gpe}")

Mismatch: sarajevo
Mismatch: british empire
Mismatch: the ottoman empire
Mismatch: tsar
Mismatch: the ottoman empire
Mismatch: yugoslavia
Mismatch: czechoslovakia
Mismatch: the soviet union
Mismatch: the soviet union's
Mismatch: czechoslovakia
Mismatch: the soviet union
Mismatch: moscow
Mismatch: czechoslovakia
Mismatch: soviet union
Mismatch: london
Mismatch: the soviet union
Mismatch: ussr
Mismatch: the soviet union
Mismatch: the low countries
Mismatch: paris
Mismatch: berlin
Mismatch: the soviet union
Mismatch: yugoslavia
Mismatch: yugoslavia
Mismatch: ussr
Mismatch: the soviet union
Mismatch: leningrad
Mismatch: saint petersburg
Mismatch: moscow
Mismatch: moscow
Mismatch: moscow
Mismatch: north africa
Mismatch: tobruk
Mismatch: the soviet union
Mismatch: persia
Mismatch: stalingrad
Mismatch: stalingrad
Mismatch: hawaii
Mismatch: north africa
Mismatch: the soviet union
Mismatch: rome
Mismatch: omaha beach
Mismatch: paris
Mismatch: paris
Mismatch: crimea
Mismatch: the soviet union
Mi

In [156]:
# Define the excluded_terms set
excluded_terms = set()

# Add the provided terms to excluded_terms
excluded_terms.update({
    'chongqing', 'signed', 'koreas', 'estrada', 'new york: psychology press', 'the soviet union', 
    'stalingrad', 'ghooi', 'poole', 'rome', 'sofia', 'tokyo', 'san francisco', 'crimea', 
    'kellaway', 'prague', 'new guinea', 'british empire', 'kagaayi', 'south korea', 'saigon', 
    'nazi germany', "the people's republic of china", 'deutschland', 'czechoslovakia', 
    'the u. s. s. r. "', 'virginia', 'henderson', 'north and south', 'brosnan', 'schultz', 
    'nagasaki', 'tsar', 'the low countries', 'goldstein', 'potsdam', 'between germany', 
    'sharp', 'keegan', 'daniel', 'persia', 'kinzer', 'murphy', 'omaha beach', 'iwo jima', 
    'north africa', 'assensoh', 'berlin', 'einsatzgruppen', 'stalin', 'burma', 'wilde', 
    'michaels', 'antonia', 'carruthers', 'fraser', 'colin', 'boston', 'indochina', 'stack', 
    'timor', 'pp', 'winston', 'egorov', 'north korea', 'freedom', 'nordregio', 
    'the ottoman empire', 'manchuria', 'east berlin', 'new york', 'lean', 'saint petersburg', 
    'parker', 'nanjing', 'serwadda', 'bucharest', 'malaya', 'oklahoma city', 'moscow', 
    'underdog', 'becky', 'saunders', 'yugoslavia', 'charlottesville', 'portuguese africa', 
    'west germany', 'vienna', 'catherwood', 'buchenwald', "north korea's", 'blumberg', 'tobruk', 
    'shanghai', 'hiroshima', 'joseph', 'maguire', "the soviet union's", 'toronto', 'belgrade', 
    'new orleans', 'churchill', 'hasic', 'zelizer', 'middleton', 'marks', 'sandford', 'warsaw', 
    'century ireland', 'leningrad', 'hong kong', 'east germany', 'findingdulcinea', 'inscribed', 
    'hongkiat', 'linge', 'korea', 'wannsee', 'gravel', 'budapest', 'ussr', 'sarajevo', 
    'north africa campaign', 'los alamos', 'the american states during', 'wylie', 
    'battle of britain', 'new mexico', 'paris', 'quest for empire', 'bali', 'london', 
    'guangxi', 'cumings', 'hawaii', 'darling', 'ferreira', 'borneo', 'changsha', 
    'soviet union', 'neville', 'guam', 'alfredo'
})

# Verify the updated excluded_terms set
print("Excluded Terms:", excluded_terms)


Excluded Terms: {'chongqing', 'signed', 'koreas', 'estrada', 'new york: psychology press', 'the soviet union', 'stalingrad', 'ghooi', 'poole', 'rome', 'sofia', 'tokyo', 'san francisco', 'crimea', 'kellaway', 'prague', 'new guinea', 'british empire', 'kagaayi', 'south korea', 'saigon', 'nazi germany', "the people's republic of china", 'deutschland', 'czechoslovakia', 'the u. s. s. r. "', 'virginia', 'henderson', 'north and south', 'brosnan', 'schultz', 'nagasaki', 'tsar', 'the low countries', 'goldstein', 'potsdam', 'between germany', 'sharp', 'keegan', 'daniel', 'kinzer', 'persia', 'murphy', 'omaha beach', 'iwo jima', 'north africa', 'assensoh', 'berlin', 'einsatzgruppen', 'stalin', 'burma', 'wilde', 'michaels', 'antonia', 'carruthers', 'fraser', 'colin', 'boston', 'indochina', 'stack', 'timor', 'pp', 'winston', 'egorov', 'north korea', 'freedom', 'nordregio', 'the ottoman empire', 'manchuria', 'east berlin', 'new york', 'lean', 'saint petersburg', 'parker', 'nanjing', 'serwadda', 'buc

In [186]:
    normalization_mapping.update({
    "the people's republic of bangladesh": "bangladesh",
    "us": "united states",
    "china": "china, people's republic of",
    "the people's republic of china": "china, people's republic of",
    "great britain": "united kingdom",
    "america": "united states",
    "usa": "united states",
    "the netherlands": "netherlands",
    "the united states": "united states",
    "the united kingdom": "united kingdom",
    "britain": "united kingdom",
    "united kingdom": "united kingdom",
    "uk": "united kingdom",
    "deutschland": "germany",
    "United Kingdom": "united kingdom"
})


# Normalize GPE entities
normalized_gpe_entities = [
    normalization_mapping.get(gpe.strip().lower(), gpe.strip().lower()) for gpe in gpe_entities
]

# Filter valid countries
filtered_countries = [gpe for gpe in normalized_gpe_entities if gpe in normalized_country_list]

# Identify non-country GPE entities
non_country_gpe = [
    gpe for gpe in normalized_gpe_entities if gpe not in normalized_country_list and gpe not in excluded_terms
]

# Print results
print("Filtered Countries:", set(filtered_countries))
print# Step 1: Split the text into sentences
sentences = list(ner_doc.sents)

# Step 2: Filter entities for each sentence and check against the country list
sentence_entities = []
for sentence in sentences:
    # Extract GPE entities in each sentence
    entities = [ent.text for ent in sentence.ents if ent.label_ == "GPE"]
    # Filter entities to include only valid countries
    valid_countries = [country for country in entities if country.lower().strip() in normalized_country_list]
    if valid_countries:
        sentence_entities.append((sentence.text, valid_countries))

# Display the first 5 sentences with their matched valid countries
for i, (sentence, countries) in enumerate(sentence_entities[:5], 1):
    print(f"Sentence {i}: {sentence}")
    print(f"Valid Countries: {countries}")
    print("-" * 80)
("Non-country GPE Entities:", set(non_country_gpe))


Filtered Countries: {'belarus', 'canada', 'india', 'norway', 'spain', 'iran', 'mongolia', 'latvia', 'luxembourg', 'albania', 'cuba', 'estonia', 'poland', 'solomon islands', 'iraq', 'ghana', 'libya', 'morocco', 'romania', 'kenya', 'ukraine', 'japan', 'angola', 'finland', 'greece', 'hungary', 'sweden', 'bulgaria', 'philippines', 'germany', 'afghanistan', 'russia', 'papua new guinea', 'australia', 'lebanon', 'france', 'cape verde', 'singapore', 'cambodia', 'vietnam', 'italy', 'bangladesh', 'united kingdom', 'united states', 'pakistan', 'belgium', 'laos', 'israel', 'lithuania', 'netherlands', 'austria', 'thailand', 'south africa', 'egypt', "china, people's republic of", 'denmark', 'algeria'}
Non-country GPE Entities: set()


In [189]:
# Define the path and filename for the cleaned text
output_file_path = "cleaned_key_events_20th_century.txt"

# Save the cleaned text to the file
with open(output_file_path, "w", encoding="utf-8") as file:
    file.write(cleaned_text)

print(f"Cleaned file successfully saved as: {output_file_path}")


Cleaned file successfully saved as: cleaned_key_events_20th_century.txt


### Observations and Cleaning Steps

1. **Special Characters**:
   - The text contained non-standard characters such as `—`, `‘`, `’`, `°`, and others.
   - These were replaced or normalized to standard equivalents (e.g., `—` to `-` and `‘` to `'`).

2. **Country Names**:
   - Some country names in the text did not match the `country_list` directly due to formatting, historical names, or alternate spellings (e.g., `Great Britain`, `United States`).
   - A normalization mapping was created to map alternate names to their modern equivalents (e.g., `the people's republic of china": "china, people's republic of`).

3. **Mismatched GPE Flags**:
   - Some entities detected as `GPE` (Geopolitical Entities) were not actual countries,  (e.g., `North Africa`, `Berlin`, `Tobruk`).
   - A list of exclusions was created to filter out these non-country GPEs.

4. **Actions Taken**:
   - Special characters were replaced or removed.
   - Country names were normalized using a mapping.
   - Non-country GPE entities were filtered out based on the exclusion list.
   - The cleaned text was saved as a `cleaned_key_events_20th_century.txt` file for further use.

---


## Create NER Object

In [201]:
# Path to the cleaned text file
file_path = "cleaned_key_events_20th_century.txt"

# Load the cleaned text from the file
with open(file_path, "r", encoding="utf-8") as file:
    cleaned_text = file.read()

# Create the NER object by processing the text using the correct variable (NER)
ner_doc = NER(cleaned_text)

# Visualize identified entities

displacy.render(ner_doc[273:20000], style = "ent", jupyter = True)

## Get named entity list per sentence

In [314]:
# Create an empty list to store sentence and entities
df_sentences = []

# Extract sentences and their GPE entities
for sent in ner_doc.sents:  # Replace ner_doc with your actual NER object
    entities = [ent.text for ent in sent.ents]  # Extract all entities in the sentence
    df_sentences.append({"sentence": sent.text, "entities": entities})

# Convert to DataFrame
import pandas as pd
df_sentences = pd.DataFrame(df_sentences)

# Display the structure of the updated DataFrame
print(df_sentences.head())



                                            sentence  \
0  The 20th century changed the world in unpreced...   
1  The World Wars sparked tension between countri...   
2  These advancements have played a significant r...   
3  Historic events in the 20th century [ edit ] W...   
4  Edwardian era The new beginning of the 20th ce...   

                                           entities  
0                                [The 20th century]  
1                    [the Cold War, the Space Race]  
2                         [the 21st century, today]  
3  [the 20th century, the beginning of the century]  
4                     [Edwardian, the 20th century]  


## Filtering entities - country list

In [316]:
# Define the filter function
def filter_countries(entity_list, country_list):
    return [entity for entity in entity_list if entity.lower().strip() in country_list]

# Apply the filter function to create a new column for valid countries
df_sentences['valid_countries'] = df_sentences['entities'].apply(
    lambda x: filter_countries(x, normalized_country_list)
)

# Display the updated DataFrame
print(df_sentences.head())



                                            sentence  \
0  The 20th century changed the world in unpreced...   
1  The World Wars sparked tension between countri...   
2  These advancements have played a significant r...   
3  Historic events in the 20th century [ edit ] W...   
4  Edwardian era The new beginning of the 20th ce...   

                                           entities valid_countries  
0                                [The 20th century]              []  
1                    [the Cold War, the Space Race]              []  
2                         [the 21st century, today]              []  
3  [the 20th century, the beginning of the century]              []  
4                     [Edwardian, the 20th century]              []  


In [318]:
# Filter rows where valid_countries is not empty
df_sentences_filtered = df_sentences[df_sentences['valid_countries'].map(len) > 0]

# Display the filtered DataFrame
print(df_sentences_filtered.head())


                                             sentence  \
15  [ 2 ] The Allies, known initially as "The Trip...   
16  Germany, Austria-Hungary, Bulgaria, and later ...   
17  [ 3 ] [ 4 ] In 1917, Russia ended hostile acti...   
18  The Bolsheviks negotiated the Treaty of Brest-...   
19  In the treaty, Bolshevik Russia ceded the Balt...   

                                             entities  \
15  [2, The Triple Entente, British Empire, France...   
16  [Germany, Austria, Hungary, Bulgaria, the Otto...   
17     [3, 4, 1917, Russia, the Central Powers, Tsar]   
18     [the Treaty of Brest-Litovsk, Germany, Russia]   
19  [Bolshevik Russia, Baltic, Germany, Kars Oblas...   

                          valid_countries  
15                       [France, Russia]  
16  [Germany, Austria, Hungary, Bulgaria]  
17                               [Russia]  
18                      [Germany, Russia]  
19                              [Germany]  


## Create Relationships - Dataframe

In [331]:
# Define a window size to look at a group of sentences
window_size = 5  # This defines how many sentences will be looked at simultaneously
relationships = []  # Create an empty list to store relationships

for i in df_sentences_filtered.index:
    # Define the range of sentences to look at
    end_i = min(i + window_size, df_sentences_filtered.index[-1] + 1)
    
    # Get the list of valid countries in the current window of sentences
    country_list = sum(df_sentences_filtered.loc[i:end_i, "valid_countries"], [])
    
    # Remove consecutive duplicate countries
    country_unique = [country_list[j] for j in range(len(country_list)) 
                      if (j == 0) or country_list[j] != country_list[j - 1]]
    
    # If there is more than one country in the window, create relationships
    if len(country_unique) > 1:
        for idx, source in enumerate(country_unique[:-1]):
            target = country_unique[idx + 1]
            relationships.append({"source": source, "target": target})

# Convert the relationships list to a DataFrame
df_relationships = pd.DataFrame(relationships)


In [335]:
df_relationships

Unnamed: 0,source,target
0,France,Russia
1,Russia,Germany
2,Germany,Austria
3,Austria,Hungary
4,Hungary,Bulgaria
...,...,...
324,India,Pakistan
325,Romania,Russia
326,Lebanon,India
327,India,Singapore


In [329]:
# Sort relationships to make them direction-agnostic
df_relationships = pd.DataFrame(np.sort(df_relationships.values, axis=1), columns=df_relationships.columns)

# Add a 'value' column to count occurrences of each relationship
df_relationships["value"] = 1

# Group by source and target to aggregate duplicate relationships
df_relationships = df_relationships.groupby(["source", "target"], sort=False, as_index=False).sum()

# Display the cleaned and aggregated DataFrame
print(df_relationships.head(5))

# Save the summarized relationships DataFrame
df_relationships.to_csv("country_relationships_summarized.csv", index=False)
print("Summarized country relationships saved as country_relationships_summarized.csv")


     source   target  value
0    France   Russia      2
1   Germany   Russia     14
2   Austria  Germany      7
3   Austria  Hungary      2
4  Bulgaria  Hungary      2
Summarized country relationships saved as country_relationships_summarized.csv
