### Import the libraries

In [4]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
from IPython.display import display, HTML
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [8]:
# Download English module

!python -m spacy download en_core_web_sm

C:\Users\nasim\anaconda3\python.exe: No module named spacy


In [9]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

### 1.	Load the twentieth-century text file.

In [10]:
# Load the book
with open('20th_century_wiki.txt', 'r', errors='ignore') as file:data = file.read().replace('\n', '')

In [11]:
# create NER object
book = NER(data)

In [15]:
# Check what the applied NER algorithm discovered in the book as an initial output:

from spacy import displacy
from IPython.display import display, HTML

html = displacy.render(book[273:20000], style="ent", jupyter=False)
display(HTML(html))

### 2. Get named entity list per sentence

#### Put together a separate list of the various different entities you might find in a sentence.

In [16]:
df_sentences = [] 

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [17]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century - WikipediaJump, Navigation\..."
1,"(World, War, II, (, 1939â€“1945)1.4.1The, war,...","[World War II, Holocaust1.4.12The Nuclear Age ..."
2,"(The, World, Wars, sparked, tension, between, ...","[The World Wars, the Cold War, the Space Race,..."
3,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
4,"(Historic, events, in, the, 20th, century[edit...","[the 20th, Edwardian, the 20th century]"
5,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"
6,"(1914, saw, the, completion, of, the, Panama, ...","[1914, the Panama Canal]"
7,"(The, Scramble, for, Africa, continued, in, th...","[Scramble, Africa, the 1900s]"
8,"(The, atrocities, in, the, Congo, Free, State,...",[the Congo Free State]
9,"(From, 1914, to, 1918, ,, the, First, World, W...","[1914 to 1918, the First World War]"


### 3. Load country names

##### Use the scraped characters list as a lookup dictionary to remove parts of the book where the main characters come into play

In [20]:
# Import characters
country_df = pd.read_csv("country_list.csv", index_col = 0)
country_df.head()

Unnamed: 0,Category,Country
0,A,Afghanistan
1,A,Albania
2,A,Algeria
3,A,Andorra
4,A,Angola


### Filtering entities from the book

#### To pass the sentence entities and the country dataframe into a filter, which will return only the entities of interest:

In [21]:
# Function to filter out entities not of interest

def filter_entity(ent_list, country_df):
    return [ent for ent in ent_list 
            if ent in list(country_df['Country'])]

In [26]:
# Use the newly defined function to create a column in the book’s sentence dataframe so that it contains one or more of the entities (country name)

df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, country_df))

In [27]:
df_sentences['country_entities'].head(40)

0                     []
1                     []
2                     []
3                     []
4                     []
5                     []
6                     []
7                     []
8                     []
9                     []
10                    []
11                    []
12                    []
13      [France, Russia]
14     [Germany, Russia]
15             [Germany]
16             [Germany]
17                    []
18                    []
19                    []
20                    []
21             [Germany]
22                    []
23                    []
24                    []
25                    []
26                    []
27                    []
28                    []
29                    []
30                    []
31                    []
32                    []
33                    []
34                    []
35                    []
36                    []
37                    []
38      [Germany, Italy]
39    [Germany, Germany]


In [28]:
# Filter out sentences that don't have any country entities

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [30]:
df_sentences_filtered.head(10)

Unnamed: 0,sentence,entities,country_entities
13,"(After, a, period, of, diplomatic, and, milita...","[the July Crisis, the end of July 1914, the Br...","[France, Russia]"
14,"(The, Bolsheviks, negotiated, the, Treaty, of,...","[Bolsheviks, Germany, Russia]","[Germany, Russia]"
15,"(In, the, treaty, ,, Bolshevik, Russia, ceded,...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[Germany]
16,"(It, also, recognized, the, independence, of, ...","[Germany, Allied, American]",[Germany]
21,"(Many, Germans, felt, these, reparations, were...","[Germans, Germany, Allied, Kaiser, Europe]",[Germany]
38,"(Germany, ,, 1933Fascism, first, appeared, in,...","[Germany, first, Italy, Benito Mussolini, 1922...","[Germany, Italy]"
39,"(The, ideology, was, supported, by, a, large, ...","[Adolf Hitler, Germany, 1933, Nazism, Germany,...","[Germany, Germany]"
40,"(The, Nazi, Party, in, Germany, was, dedicated...","[The Nazi Party, Germany, German, German, Cent...","[Germany, Germany]"
41,"(Antisemitism, during, the, Great, Depression,...","[the Great Depression, Jews, Austria, Austria,...","[Austria, Austria, Germany]"
44,"(Almost, all, of, the, new, democracies, in, t...","[Eastern Europe, Spain, Francisco Franco, the ...",[Spain]


Only the rows containing the country name entities are included


### Create relationships

##### In order to conduct a relationships analysis, you need to find out how frequently different countries' names appear together throughout the text. The underlying idea here is that if two entities appear in close proximity to each other in a couple of sentences in a row (multiple times), then they must have a close connection.

In [32]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] 

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [33]:
# Create a relationship dataframe
relationship_df = pd.DataFrame(relationships)

In [34]:
relationship_df

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Russia,Germany
3,Germany,Russia
4,France,Russia
...,...,...
655,India,Singapore
656,India,Singapore
657,India,Singapore
658,India,Singapore


##### The encounters between characters should be summarized to understand how often different country names interact with each other, rather than only focusing on repeated interactions of the same pair:

In [35]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(10)

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Germany,Russia
3,Germany,Russia
4,France,Russia
5,Germany,Russia
6,Germany,Russia
7,Germany,Russia
8,France,Russia
9,Germany,Russia


In [36]:
# Use the groupby() function to get all the different frequencies of the source

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [38]:
relationship_df.head(20)

Unnamed: 0,source,target,value
0,France,Russia,12
1,Germany,Russia,21
2,Germany,Italy,26
3,Austria,Germany,11
4,Germany,Spain,3
5,France,Spain,1
6,France,Poland,15
7,France,Germany,19
8,Germany,Poland,24
9,Estonia,Germany,5


In [40]:
relationship_df.to_csv('country_relationship.csv')