# 1.6 Intro to NLP and Network Analysis

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re


In [2]:
# Download English module

!{sys.executable} -m spacy download en_core_web_sm

zsh:1: parse error near `-m'


In [3]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

In [45]:

import pycountry

# We get a list of country names
country_list = [country.name for country in pycountry.countries]

print("Loaded", len(country_list), "countries")
print(country_list[:10])  # Check first 10

Loaded 249 countries
['Aruba', 'Afghanistan', 'Angola', 'Anguilla', 'Åland Islands', 'Albania', 'Andorra', 'United Arab Emirates', 'Argentina', 'Armenia']


## Load 20 century events 

In [14]:
# Load the book

with open('20Cent copy.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

### Text wrangling 

In [None]:
import re

# Preperation
# with open("your_file.txt", "r", encoding="utf-8") as f:
#     data = f.read()

# 1️⃣ Detect special characters
special_chars = re.findall(r"[^\w\s]", data)
unique_special = sorted(set(special_chars))

print("🔹 Special characters found:")
print(f"- Total special characters: {len(special_chars)}")
print(f"- Unique special characters: {unique_special}")



#The text in the lower case
data_lower = data.lower()

#Let's find the countries that are partially mentioned
found_countries = []
for c in country_list:
    if c.lower() in data_lower:
        found_countries.append(c)

missing_countries = sorted(set(country_list) - set(found_countries))

print("\n🔹 Country analysis:")
print(f"- Countries found ({len(found_countries)}): {found_countries}")
print(f"- Missing countries ({len(missing_countries)}): {missing_countries}")
print(f"- Coverage: {len(found_countries)}/{len(country_list)} countries "
      f"({(len(found_countries)/len(country_list))*100:.1f}%)")

🔹 Special characters found:
- Total special characters: 2432
- Unique special characters: ['"', '&', "'", '(', ')', ',', '-', '.', '/', ':', ';', '?', '[', ']', '|', '®', '–', '—']

🔹 Country analysis:
- Countries found (35): ['Afghanistan', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Bulgaria', 'Bosnia and Herzegovina', 'Canada', 'China', 'Cuba', 'Germany', 'Egypt', 'Spain', 'France', 'United Kingdom', 'Hong Kong', 'Hungary', 'India', 'Ireland', 'Israel', 'Italy', 'Jordan', 'Japan', 'Mali', 'Montenegro', 'New Zealand', 'Oman', 'Pakistan', 'Poland', 'Romania', 'Rwanda', 'Slovakia', 'Ukraine', 'United States', 'South Africa']
- Missing countries (214): ['Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Aruba', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia, Plurinational State of', 'Bonaire, Sint Eustatius and Saba', 'Botswana', 'Bouvet 

In [16]:
# Clean text and save 
if len(unique_special) > 0 or len(missing_countries) > 0:
    cleaned_text = re.sub(r'[^\w\s]', '', data)
    try:
        with open('cleaned_data.txt', 'w', encoding='utf-8') as f:
            f.write(cleaned_text)
    except IOError as e:
        print(f"Error writing to file: {e}")


In [17]:
book = NER(data)

In [None]:
# Visualize identified entities

displacy.render(book, style = "ent", jupyter = True)

## Get named entity list per sentence

In [46]:
# Empty list for results
df_sentences = []

# Go through all the sentences
for sent in book.sents:
    # Collect the list of entities in the sentence
    entity_list = [ent.text for ent in sent.ents]
    
    # Add a dictionary to the list
    df_sentences.append({
        "sentence": sent.text,
        "entities": entity_list
    })

# Convert a list to a DataFrame
df_sentences = pd.DataFrame(df_sentences)

In [47]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,,[]
1,WikipediaThe,[]
2,Free EncyclopediaSearchDonateCreate accountLog...,"[the end of the 20th, beta)AutomaticLightDarkF..."
3,It was the first photograph taken of Earth fro...,"[first, Earth]"
4,Millennia2nd millenniumCenturies19th century 2...,"[Millennia2nd millenniumCenturies19th, century..."
5,centuryState leaders19th century 20th century ...,"[leaders19th century 20th century, 1910s 1920s..."
6,"Population growth was also unprecedented,[3] a...","[the century, around 1.6 billion, 20th century..."
7,Unprecedented advances in science and technolo...,[the century]
8,"The Earth's sixth mass extinction event, the H...","[Earth, sixth, Holocene]"
9,Major themes of the century included decoloniz...,[the century]


## Country filter 

In [48]:
countries = found_countries
def filter_entities(entity_list, allowed_entities):
    allowed_lower = [c.lower() for c in allowed_entities]
    filtered_entities = [
        ent for ent in entity_list
        if any(c in ent.lower() for c in allowed_lower)
    ]
    print("Filtered entities:", filtered_entities)
    return filtered_entities

In [49]:

df_sentences["filtered_entities"] = df_sentences["entities"].apply(
    lambda ents: filter_entities(ents, countries)
)
# filtration 
df_sentences_filtered = df_sentences[df_sentences["filtered_entities"].map(len) > 0]


Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: []
Filtered entities: ['France', 'the United States', 'Italy', 'Romania', 'Germany', 'Austria', 'Hungary', 'the Ottoman Empire', 'Bulgaria']
Filtered entities: ['Germany']
Filtered entities: ['Ottoman']
Filtered entities: []
Filtered entities: []
Filtered entities: ['Italy', 'Germany', 'Spain', "Nazi Germany's"]
Filtered entities: ['Japan', 'Germany', 'Italy']
Filtered entities: ['Japan', 'the United States', 'Germany', 'Poland', 'the United States', 'the United Kingdom', 'Canada', 'France']
Filtered entities: ['Japan']
Filtered entiti

In [50]:
# check results 
print("Sentences with filtered entities:")
print(df_sentences_filtered)

Sentences with filtered entities:
                                              sentence  \
21   After more than four years of trench warfare i...   
22   In addition to annexing many of the colonial p...   
23   The Austro-Hungarian and Ottoman empires were ...   
26   Fascism, a movement which grew out of post-war...   
27   Meanwhile, Japan had rapidly transformed itsel...   
..                                                 ...   
368                       United States Census Bureau.   
413  May 11, 2003 "China's great famine: 40 years l...   
415                               "The Indochina Wars:   
502  [Understanding the Armenian genocide: 1915 to ...   
505  "They Can Live in the Desert but Nowhere Else"...   

                                              entities  \
21   [more than four years, Western Europe, up to, ...   
22                       [the Triple Entente, Germany]   
23                                           [Ottoman]   
26   [the Great Depression of the 193

## Create relationships

In [53]:
# Parameter: size of the sentence window
window_size = 5

# Empty list for connections
relationships = []

# Check if there are any strings at all
if not df_sentences_filtered.empty:

    # Go through the row indices
    for i in range(df_sentences_filtered.index[-1]):
        end_i = min(i + window_size, df_sentences_filtered.index[-1])

        # Collect all countries in this window
        country_list = sum(
            (df_sentences_filtered.loc[i:end_i, "filtered_entities"]),
            []
        )

        # Remove consecutive repetitions
        country_unique = [
            country_list[j]
            for j in range(len(country_list))
            if j == 0 or country_list[j] != country_list[j-1]
        ]

        # Creating pairs
        if len(country_unique) > 1:
            for idx, a in enumerate(country_unique[:-1]):
                b = country_unique[idx + 1]
                relationships.append({"source": a, "target": b})

else:
    print("❌ DataFrame порожній — немає речень із країнами.")

# Convert a list to a DataFrame
relationships_df = pd.DataFrame(relationships)

# Check the result
print("✅ created relationships_df:")
print(relationships_df.head())

✅ created relationships_df:
              source             target
0             France  the United States
1  the United States              Italy
2              Italy            Romania
3            Romania            Germany
4            Germany            Austria


In [43]:
relationship_df = pd.DataFrame(relationships) 
relationship_df = pd.DataFrame(
    np.sort(relationship_df.values, axis=1),
    columns=relationship_df.columns
)
relationship_df["value"] = 1

relationship_df = relationship_df.groupby(
    ["source", "target"], as_index=False
).sum()
relationship_df.head(20)

Unnamed: 0,source,target,value
0,1945.[22]Russo-Japanese War,the Empire of Japan,6
1,Afghanistan,Cuba,6
2,Afghanistan,Egypt,5
3,Afghanistan,Poland,6
4,Argentina,the United States,5
5,Armenian,Armenians,6
6,Armenians,the Ottoman Empire,6
7,Australia,Canada,6
8,Australia,New Zealand,6
9,Austria,Germany,6


In [44]:
relationship_df.to_csv('20Cent.csv')