In [18]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

NER = spacy.load("en_core_web_sm")



In [19]:
# Load file
with open('key_event_20th_century.txt', 'r', errors='ignore') as file: 
   data = file.read().replace( '\n', ' ')

Evaluate whether the text needs wrangling—are there any special characters used? Are the names of the countries in your list the same as the names in the text? Write down your observations in a markdown cell and take the necessary steps to correct any issues you’ve found. If anything does need correcting, make sure you save your file as a .txt.

There minimal special characters throughout the text and they don't affect the country names. Also, the country names were the same. 

In [20]:
# Used the text file to create a NER object.
wikiPage = NER(data)

In [21]:
df_sentences = []

# Loop through sentences, get entity list for each sentence
for sent in wikiPage.sents:
       entity_list = [ent.text for ent in sent.ents]
       df_sentences.append({"sentence": sent, "entities": entity_list})

# Create df of the sentences
df_sentences = pd.DataFrame(df_sentences)

df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"( , Key, events, of, the, 20th, century, -,...","[the 20th century -, Navigation \t ]"
1,"(Main, pageContentsCurrent)",[]
2,"(eventsRandom, articleAbout, WikipediaContact,...",[Contribute]
3,"(Create, account, , Log, in, , Person...","[Log, the 20th century, the 20th century, the ..."
4,"(The, Russian, Revolution, and, Communism, ...",[The Russian Revolution and Communism ...
5,"( , 1.3.1, The, war, in, Europe, ...","[Europe, 1.3.2, Operation Barbarossa 1..."
6,"(Turning, tides, , 1.3.5, Operation)",[]
7,"(Overlord, , 1.3.6)",[]
8,"(Final, days, , 1.3.7)",[]
9,"(The, war, in, the, Pacific, , 1.3.7.1, ...",[]


In [22]:
# create df of countries to turn into list of country
df_countries = pd.read_csv("countries_list_20th_century_1.5.csv")
# use lambda function to shorten country name to alias
df_countries['country_alias'] = df_countries['country_name'].apply(lambda x: x.rsplit(',',1)[0])
# remove spaces
df_countries['country_alias'] = df_countries['country_alias'].apply(lambda x: x.replace(" ",""))

In [23]:
# Function to filter out entities not of interest
def filter_entity(ent_list, df):
       return [ent for ent in ent_list if ent in list(df['country_alias'])] # change to correct 

# Use function on df via lambda
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, df_countries))

In [24]:
# Filter out sentences that don’t have any character entities
# map() method returns a new series (or column in a dataframe) containing the length of each element in the “character_entities” column.
df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
1130,"("", The, forgotten, violence, that, helped, In...",[India],[India]
1134,"("", Indian, Independence, Day, :, everything, ...","[Indian Independence Day, India, Pakistan, 70 ...","[India, Pakistan]"
1142,"( , ^, "", The, Philippines, ,, 1898â€“1946, |,...","[Philippines, 1898â€“1946, House of Representa...",[Philippines]
1172,"("", Colonial, Cartographies, ,, Postcolonial, ...","[Colonial Cartographies, Enduring Failures of ...",[Afghanistan]
1207,"(The, Moldovans, :, Romania, ,, Russia, ,, and...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
1265,"("", Selling, "", Operation, Passage, to, Freedo...","[Selling ""Operation Passage to Freedom, Thomas...",[Vietnam]
1296,"("", Stuck, in, Endless, Preliminaries, :, Viet...","[Vietnam, the Battle of the Paris Peace Table,...",[Vietnam]
1535,"("", Anti, -, American, Behavior, in, the, Midd...","[Anti-American, the Middle East, Lebanon]",[Lebanon]
1540,"(The, Rise, of, China, and, India, :, A, New, ...",[India],[India]
1541,"(Singapore, :, World, Scientific, ., doi:10.11...","[Singapore, World Scientific]",[Singapore]


In [25]:
# Defining relationships

relationships = []

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])

    char_unique = []
    for i in range(len(char_list)):
        if (i==0) or char_list[i] != char_list[i-1]:
            char_unique.append(char_list[i])

    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [54]:
# Sort numpy sort to sort df 
relationships_df_unsort = pd.DataFrame(relationships)
relationships_df = pd.DataFrame(np.sort(relationships_df_unsort.values, axis = 1), columns = relationships_df_unsort.columns)

In [55]:
relationships_df["value"] = 1
# overwrote the df when grouping 
relationships_df = relationships_df.groupby(["source","target"], sort=False, as_index=False).sum()
relationships_df.head(10)

Unnamed: 0,source,target,value
0,France,Russia,12
1,Germany,Russia,26
2,Austria,Germany,17
3,Austria,Hungary,6
4,Bulgaria,Hungary,6
5,Bulgaria,Russia,5
6,Germany,Italy,29
7,France,Poland,11
8,France,Germany,28
9,Germany,Poland,27


In [53]:
# export relationship df for next task
relationships_df.to_csv('country_relationship.csv')