### This script contains the following:
1. Importing Libraries
2. Importing Data
3. Creating Named Entity Recognition Object
4. Splitting the Sentence Entities
5. Filtering the Entities Using the Country List
6. Creating a Relationship Dataframe
7. Exporting the Data

In [2]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re
import nltk
from nltk.tokenize import word_tokenize

In [3]:
!C:\Users\hp\anaconda3\envs\mining_env\python.exe -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 4.8 MB/s eta 0:00:03
     ----- ---------------------------------- 1.8/12.8 MB 4.6 MB/s eta 0:00:03
     -------- ------------------------------- 2.6/12.8 MB 4.7 MB/s eta 0:00:03
     ----------- ---------------------------- 3.7/12.8 MB 4.7 MB/s eta 0:00:02
     -------------- ------------------------- 4.7/12.8 MB 4.7 MB/s eta 0:00:02
     ------------------ --------------------- 5.8/12.8 MB 4.7 MB/s eta 0:00:02
     --------------------- ------------------ 6.8/12.8 MB 4.7 MB/s eta 0:00:02
     ----------------------- ---------------- 7.6/12.8 MB 4.7 MB/s eta 0:00:02
     --------------------------- ------------ 8.7/12.8 MB 4.5 MB/s eta 0:00:01
     ------------------------------ -----

In [30]:
# Load spaCy's small English pipeline. The model must already be installed
# (see the download command above); the resulting object is called on the
# article text further down.
NER = spacy.load("en_core_web_sm")

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

## 2. Importing Data


In [24]:
# Read the whole article into one string with every line break removed.
# NOTE(review): no encoding is passed, so the platform default is used and
# undecodable bytes are silently dropped (errors='ignore').
path = os.path.join('..', 'Data', '20th Century Events_sans_punc.txt')
with open(path, 'r', errors='ignore') as file:
    lines = file.read().split('\n')
data = ''.join(lines)

In [26]:
# Read the cleaned country list into a dataframe, using the first CSV column
# as the index
path2 = os.path.join('..', 'Data')
countries_file = os.path.join(path2, 'cleaned_countries_list.csv')
countries = pd.read_csv(countries_file, index_col=0)

In [None]:
# Check the dimensions (rows, columns) of the countries table
countries.shape

In [None]:
# Preview the first rows of the countries table
countries.head()

## 3. Creating Named Entity Recognition (NER) Object

In [28]:
# Run the loaded pipeline over the full article text. `article` is the
# processed document whose sentences and entities are read in the sections below.
article = NER(data)

NameError: name 'NER' is not defined

In [None]:
%%capture

# Visualize identified entities for a slice of the document (positions 273 up
# to 20000) rather than the whole article.
# NOTE(review): %%capture suppresses this cell's output, so the rendering is
# presumably not shown in the notebook — confirm that is intended.
displacy.render(article[273:20000], style = "ent", jupyter = True)

## 4. Splitting the Sentence Entities

In [None]:
# Build one record per sentence: the sentence itself plus the text of every
# entity spaCy found inside it
sentence_rows = [
    {"sentence": sentence, "entities": [entity.text for entity in sentence.ents]}
    for sentence in article.sents
]

# Turn the records into a dataframe with 'sentence' and 'entities' columns
df_sentences = pd.DataFrame(sentence_rows)

In [None]:
# Preview the first 10 sentences together with their extracted entities
df_sentences.head(10)

## 5. Filtering the Entities Using the Country List


In [None]:
# A function to filter out entities not on the cleaned countries list
def filter_entity(ent_list, countries):
    """Keep only the entities that appear on the cleaned countries list.

    Args:
        ent_list: Iterable of entity strings found in one sentence.
        countries: DataFrame with a 'clean_country_alias' column holding the
            accepted country names.

    Returns:
        A list with the entities from ent_list that match an alias, in their
        original order (repeated mentions are kept).
    """
    # Build the lookup once per call instead of rebuilding a list for every
    # entity that is tested.
    known_aliases = set(countries['clean_country_alias'])
    return [ent for ent in ent_list if ent in known_aliases]

In [None]:
# Run filter_entity on each sentence's entity list and keep the matches in a
# new 'country_entities' column
df_sentences['country_entities'] = df_sentences['entities'].apply(filter_entity, args=(countries,))

In [None]:
# Preview the first 10 rows, now including the country_entities column
df_sentences.head(10)


In [None]:
# Keep only the sentences that mention at least one country
has_country = df_sentences['country_entities'].map(len) > 0
df_sentences_filtered = df_sentences[has_country]
df_sentences_filtered.head(10)

## 6. Creating a Relationship Dataframe

In [None]:
# Defining relationships

# Two countries are treated as related when they are mentioned close together.
# A window slides over the sentence index labels: it starts at label i and ends
# at label i + window_size. `.loc` slicing is label-based and includes both
# endpoints, so sentences labelled i through i + window_size are looked at
# simultaneously.
window_size = 5
relationships = []  # one {"source", "target"} dict per adjacent pair of countries

# Guard against an empty dataframe, where `.index[-1]` would raise IndexError;
# with no sentences the loop simply does not run.
last_index = df_sentences_filtered.index[-1] if len(df_sentences_filtered) > 0 else 0

for i in range(last_index):
    end_i = min(i + window_size, last_index)
    window = df_sentences_filtered.loc[i:end_i]

    # Flatten the per-sentence lists into a single list of mentions, in order
    country_list = [country for ents in window.country_entities for country in ents]

    # Remove duplicated countries that are next to each other
    country_unique = [country for pos, country in enumerate(country_list)
                      if pos == 0 or country != country_list[pos - 1]]

    # Link every country to the one mentioned right after it; fewer than two
    # entries produce no pairs
    for a, b in zip(country_unique, country_unique[1:]):
        relationships.append({"source": a, "target": b})

In [None]:
# Convert the list of {"source", "target"} dicts into a dataframe,
# one row per recorded pair
relationship_df = pd.DataFrame(relationships)

In [None]:
# Order the two names within each row so that a->b and b->a end up as the
# same (source, target) pair
sorted_pairs = np.sort(relationship_df.values, axis=1)
relationship_df = pd.DataFrame(sorted_pairs, columns=relationship_df.columns)

In [None]:
# Give every recorded interaction a weight of 1, then add the weights up per
# (source, target) pair to get the number of interactions for each pair
relationship_df["value"] = 1
pair_groups = relationship_df.groupby(["source", "target"], sort=False, as_index=False)
relationship_df_grouped = pair_groups.sum()

# Preview the first 10 grouped pairs
relationship_df_grouped.head(10)

## 7. Exporting the Data


In [None]:
# Write the grouped relationships to a CSV file in the Data folder
output_file = os.path.join(path2, 'country_relationships.csv')
relationship_df_grouped.to_csv(output_file)

In [None]:
# Check the dimensions (rows, columns) of the grouped relationships table
relationship_df_grouped.shape