### This script contains the following:
1. Importing Libraries
2. Importing Data
3. Creating Named Entity Recognition Object
4. Splitting the Sentence Entities
5. Filtering the Entities Using the Country List
6. Creating a Relationship Dataframe
7. Exporting the Data

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re
import nltk
from nltk.tokenize import word_tokenize

In [3]:
!C:\Users\hp\anaconda3\envs\mining_env\python.exe -m spacy download en_core_web_sm


Traceback (most recent call last):
  File "C:\Users\hp\anaconda3\envs\mining_env\lib\runpy.py", line 187, in _run_module_as_main
    mod_name, mod_spec, code = _get_module_details(mod_name, _Error)
  File "C:\Users\hp\anaconda3\envs\mining_env\lib\runpy.py", line 146, in _get_module_details
    return _get_module_details(pkg_main_name, error)
  File "C:\Users\hp\anaconda3\envs\mining_env\lib\runpy.py", line 110, in _get_module_details
    __import__(pkg_name)
  File "C:\Users\hp\anaconda3\envs\mining_env\lib\site-packages\spacy\__init__.py", line 6, in <module>
  File "C:\Users\hp\anaconda3\envs\mining_env\lib\site-packages\spacy\errors.py", line 2, in <module>
    from .compat import Literal
  File "C:\Users\hp\anaconda3\envs\mining_env\lib\site-packages\spacy\compat.py", line 38, in <module>
    from thinc.api import Optimizer  # noqa: F401
  File "C:\Users\hp\anaconda3\envs\mining_env\lib\site-packages\thinc\api.py", line 1, in <module>
    from .backends import (
  File "C:\Users\h

In [5]:
# Load spacy English module
NER = spacy.load("en_core_web_sm")

## 2. Importing Data


In [8]:
# Load the article
path = os.path.join('..', 'Data', '20th Century Events_sans_punc.txt')
with open(path, 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

In [10]:
# Import the list of countries as a dataframe
path2 = os.path.join('..', 'Data')
countries = pd.read_csv(os.path.join(path2, 'cleaned_countries_list.csv'), index_col = 0)

In [12]:
countries.shape

(214, 3)

In [14]:
countries.head()

Unnamed: 0,country_name,country_alias,clean_country_alias
0,Afghanistan,Afghanistan,Afghanistan
1,Albania,Albania,Albania
2,Algeria,Algeria,Algeria
3,Andorra,Andorra,Andorra
4,Angola,Angola,Angola


## 3. Creating NER Object

In [17]:
# Set the NER object
article = NER(data)

In [19]:
%%capture

# Visualize identified entities
displacy.render(article[273:20000], style = "ent", jupyter = True)

## 4. Splitting the Sentence Entities

In [22]:
# Create an empty shell to store results
df_sentences = [] 

# Loop through sentences, to get entity list for each sentence
for sent in article.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
# Convert the list into a dataframe
df_sentences = pd.DataFrame(df_sentences)

In [24]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,( ),[]
1,"(Key, , events, , of, , th, , centu...",[Key ]
2,"(World, , War, )",[World War ]
3,"(I, , â€, , “, , .1.2Russi...",[]
4,"(depression1.2.2The, , rise, , of, , ...",[Pacific1.3.7.1Background1.3.8Japanese]
5,( ),[]
6,"(Nuclear, , Age, , begins1.4The, , po...",[]
7,"(proxy1.4.4The, , space, )",[]
8,"(race1.4.5The, , end, , of, , Cold, ...",[]
9,"(What, , links, , hereRelated, , chan...",[URLDownload]


## 5. Filter the Entities Using the Country List


In [27]:
# A function to filter out entities not on the cleaned countries list
def filter_entity(ent_list, countries):
    return [ent for ent in ent_list 
            if ent in list(countries['clean_country_alias'])]

In [29]:
# Apply the function and store the results in a new column
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, countries))

In [31]:
df_sentences.head(10)


Unnamed: 0,sentence,entities,country_entities
0,( ),[],[]
1,"(Key, , events, , of, , th, , centu...",[Key ],[]
2,"(World, , War, )",[World War ],[]
3,"(I, , â€, , “, , .1.2Russi...",[],[]
4,"(depression1.2.2The, , rise, , of, , ...",[Pacific1.3.7.1Background1.3.8Japanese],[]
5,( ),[],[]
6,"(Nuclear, , Age, , begins1.4The, , po...",[],[]
7,"(proxy1.4.4The, , space, )",[],[]
8,"(race1.4.5The, , end, , of, , Cold, ...",[],[]
9,"(What, , links, , hereRelated, , chan...",[URLDownload],[]


In [35]:
# Filter out sentences that don't have any character entities
df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]
df_sentences_filtered.head(10)

Unnamed: 0,sentence,entities,country_entities
28,"(After, , a, , period, , of, , dip...","[two, Austria]",[Austria]
29,"(In, , Russia, , ended, , hos...",[Russia],[Russia]
30,"(Bolsheviks, , negotiated, , Treaty, ...",[Russia],[Russia]
31,"(In, , treaty, , Bolshevik, , Ru...","[Russia, Baltic]",[Russia]
42,"(Division, , of, , Austria, -, Hungary, ...","[Austria, World War ]",[Austria]
106,"(Spain, , also, , became, , a, , d...",[Spain],[Spain]
125,"(As, , Hitler, , "", s, "", , forces, ...",[the Soviet Union],[the Soviet Union]
144,"(debacle, , in, , France, , also, ...",[France],[France]
175,"(However, , a, , pro, -, allied, ...","[Yugoslavia, Yugoslavia]","[Yugoslavia, Yugoslavia]"
189,"(Mussolini, , had, , launched, , an, ...","[Italian, British, Egypt]",[Egypt]


## 6. Create a Relationship Dataframe

In [38]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    country_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])
    
    # Remove duplicated characters that are next to each other
    country_unique = [country_list[i] for i in range(len(country_list)) 
                   if (i==0) or country_list[i] != country_list[i-1]]
    
    if len(country_unique) > 1:
        for idx, a in enumerate(country_unique[:-1]):
            b = country_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [40]:
# Convert the list into a dataframe
relationship_df = pd.DataFrame(relationships)

In [42]:
# Sort the cases with a->b and b->a
relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)

In [44]:
# Summarize the interactions by giving a value for every interaction captured, then group the interactions
relationship_df["value"] = 1
relationship_df_grouped = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

# the output
relationship_df_grouped.head(10)

Unnamed: 0,source,target,value
0,Austria,Russia,5
1,Germany,the United States,5
2,Russia,the United States,7
3,China,Japan,9
4,Australia,Japan,4
5,Japan,the United States,14
6,China,the United States,5
7,Germany,Japan,3
8,Cuba,the United States,2
9,the Soviet Union,the United States,5


## 7. Export the Data


In [47]:
# Save the dataframe as a csv file
relationship_df_grouped.to_csv(os.path.join(path2, 'country_relationships.csv'))