# Advanced Data mining NLP NER

In [2]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [4]:
import sys
print(sys.executable)

C:\Users\analy\anaconda3\envs\century_env\python.exe


In [5]:
!C:\Users\analy\anaconda3\envs\century_env\python.exe -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 6.3 MB/s eta 0:00:02
     --------- ------------------------------ 3.1/12.8 MB 8.0 MB/s eta 0:00:02
     ----------------- ---------------------- 5.5/12.8 MB 9.3 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 11.3 MB/s eta 0:00:01
     ------------------------------------ -- 12.1/12.8 MB 12.0 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 11.5 MB/s  0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


spaCy itself is just the NLP engine.
To actually process text — tokenize, tag parts of speech, recognize entities — it needs a language model.

en_core_web_sm is the small English model.

In [7]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

# Load 20th century key events txt

Here we will use NER spaCy NLP modern method that 
tokenizes

lemmatizes

identifies named entities (PERSON, ORG, GPE, etc.)

splits sentences

extracts relationships

Note: both TextBlob(manuel method togerther with other manuel cleaning) and NER spaCy need the txt file as input after it is being scraped and converted from html to txt.

In [8]:
# Load the scraped text

with open('key_events_20th_century.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

In [11]:
data_NER = NER(data) # Extracts named entities (PERSON, ORG, GPE, DATE, etc.)

In [10]:
# Visualize identified entities

displacy.render(data_NER[273:20000], style = "ent", jupyter = True)

after scraping, some data wrangeling and cleaning were performed and then the txt file was saved to be ready for any NLP like NER.

lowercase to be compatible with the scraped list of countries later should be done for the Entities later and not for the original text before NER.

# Get named entity list per sentence

In [13]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in data_NER.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [14]:
df_sentences

Unnamed: 0,sentence,entities
0,"(Jump, to, contentMain, menuMain, menumove, to...",[Jump]
1,"(Color(beta)AutomaticLightDarkThis, page, is, ...",[]
2,"(DonateCreate, accountLog, inPersonal, toolsDo...","[sidebarhide(Top)1Historic, 20th, Historic, th..."
3,"(Overlord1.4.6Final, days1.4.7The, war, in, th...","[Holocaust1.4.12The Nuclear Age begins1.5The, ..."
4,"(Edit, linksArticleTalkEnglishReadEditView, hi...","[URLDownload QR, PDFPrintable, CommonsWikidata..."
...,...,...
900,"(p., Â, , 600.ISBN978, -, 0, -, 415, -, 09311...","[600.ISBN978, links[edit]Wikimedia Commons, th..."
901,"(The, 20th, Century, Research, Project(archive...","[20th, 26, February, February, the 20th Centur..."
902,"(By, using, this, site, ,, you, agree, to, the...",[]
903,"(WikipediaÂ, ®, is, a, registered, trademark, ...","[WikipediaÂ®, theWikimedia Foundation, Inc.]"


## Load countries scraped list.txt

In [15]:
# Import the Scraped countires list txt file from task1.4

with open("Scraped List of Countries.txt", "r", encoding="utf-8") as f:
    countries = [line.strip() for line in f.readlines() if line.strip()]


In [27]:
# IMPORTANT!! here i am extending the countires list manually to include the old countries names
# this have to match the 20th century names for the lookup step later
countries.extend([
    "soviet union",
    "ussr",
    "yugoslavia",
    "czechoslovakia",
    "east germany",
    "west germany",
    "persia",
    "ottoman empire"
])


In [28]:
# Step 2: Convert list to DataFrame
countries_df = pd.DataFrame(countries, columns=["Country"])
countries_df

Unnamed: 0,Country
0,aruba
1,afghanistan
2,angola
3,anguilla
4,åland islands
...,...
252,czechoslovakia
253,east germany
254,west germany
255,persia


# Filtering entities from the data_NER

In [29]:
# lowercase the Entities first to match the lowercase in countries_df
df_sentences["entities"] = df_sentences["entities"].apply(
    lambda ents: [e.lower() for e in ents]
)

In [30]:
# Defining a function to filter out entities not of interest
# similar to xlookup in excel. ent_list will be the extracted enteties for each sentence

def filter_entity(ent_list, countries_df):
    return [ent for ent in ent_list 
            if ent in list(countries_df['Country'])]

In [31]:
# here the function will be applied
df_sentences['countries_entities'] = df_sentences['entities'].apply(
    lambda x: filter_entity(x, countries_df)
)

In [32]:
df_sentences.head(10)

Unnamed: 0,sentence,entities,countries_entities
0,"(Jump, to, contentMain, menuMain, menumove, to...",[jump],[]
1,"(Color(beta)AutomaticLightDarkThis, page, is, ...",[],[]
2,"(DonateCreate, accountLog, inPersonal, toolsDo...","[sidebarhide(top)1historic, 20th, historic, th...",[]
3,"(Overlord1.4.6Final, days1.4.7The, war, in, th...","[holocaust1.4.12the nuclear age begins1.5the, ...",[]
4,"(Edit, linksArticleTalkEnglishReadEditView, hi...","[urldownload qr, pdfprintable, commonswikidata...",[]
5,"(TheWorld, Warssparked, tension, between, coun...","[theworld warssparked, theworld wide webwas]",[]
6,"(These, advancements, have, played, a, signifi...",[today],[]
7,"(Historic, events, in, the, 20th, century[edit...","[the 20th, edwardian, 1914the, the 20th century]",[]
8,"(The, 1900s, saw, the, decade, herald, a, seri...","[the 1900s, the decade]",[]
9,"(1914, saw, the, completion, of, thePanama, Ca...","[1914, thepanama canal]",[]


In [33]:
# Filter out rows where the countries_entities is 0 or empty with naming filtered df

df_sentences_filtered = df_sentences[df_sentences['countries_entities'].map(len) > 0]

df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,countries_entities
548,"(Glass, (, 14, August, 2018).""Hirohito, accept...","[glass, 14 august, 2018).""hirohito, japan, aug...",[japan]
577,"(Perspectives, in, Clinical, Research.2(2):72â...",[israel],[israel]
639,"(Past, &, Present(210):103â€“120.doi:10.1093, ...",[past & present(210):103â€“120.doi:10.1093/pas...,[germany]
653,"(Retrieved20, December2018.^McQuade, ,, Joseph...","[joseph, november, india]",[india]
656,"(December2018.^Henderson, ,, Barney, (, 15, Au...","[barney, 15, august, partition between india, ...",[pakistan]
662,"(Retrieved18, December2018.^""The, Philippines,...","[philippines, 1898â€“1946, us house of represe...",[philippines]
678,"("", Colonial, Cartographies, ,, Postcolonial, ...","[colonial cartographies, postcolonial borders,...","[afghanistan, pakistan]"
699,"(Retrieved20, December2018.^King, ,, Charles, ...","[charles, 2000).the, moldovans, romania, russi...",[romania]
881,"(The, Journal, of, Politics.78(1):311â€“325.do...",[the journal of politics.78(1):311â€“325.doi:1...,[india]
882,"(Singapore, :, World, Scientific.doi:10.1142/7...","[singapore, south korean, meet""]",[singapore]


no furthere wrangeling is needed

# Create Relationships

In [53]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    country_list = sum((df_sentences_filtered.loc[i: end_i].countries_entities), [])
    
    # Remove duplicated characters that are next to each other
    country_unique = [country_list[i] for i in range(len(country_list)) 
                   if (i==0) or country_list[i] != country_list[i-1]]
    
    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [54]:
relationship_df = pd.DataFrame(relationships)

In [55]:
relationship_df

Unnamed: 0,source,target
0,france,germany
1,france,germany
2,france,germany
3,france,germany
4,germany,italy
...,...,...
433,india,singapore
434,india,singapore
435,india,singapore
436,india,singapore


In [56]:
# replace the names "ussr" with "soviet union" to have uniform name
relationship_df = relationship_df.replace("ussr", "soviet union")

In [57]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,france,germany
1,france,germany
2,france,germany
3,france,germany
4,germany,italy


In [58]:
# Summarize the interactions

relationship_df["value"] = 1
relationship_df_grouped = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [59]:
relationship_df_grouped

Unnamed: 0,source,target,value
0,france,germany,25
1,germany,italy,27
2,austria,germany,11
3,czechoslovakia,france,6
4,france,poland,15
5,germany,poland,18
6,poland,soviet union,15
7,germany,soviet union,6
8,germany,latvia,5
9,finland,latvia,6


In [60]:
relationship_df_grouped.to_csv('20th_century_countries_relationship.csv')