# MSikos Exercise 1.6 Task - Intro to NLP and Network Analysis

Link to Github repository: https://github.com/MSikos/20th-Century

## 1. Import Libraries

In [61]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import scipy
import re
import matplotlib.pyplot as plt

In [62]:
# set path for data files
path = r'C:\Users\Stony\OneDrive\CareerFoundry\Data Specialiaztion Course\Achievement 1\Repositories\20th-Century'

In [63]:
%%capture
# Download English module
!python -m spacy download en_core_web_sm

In [64]:
# Download English module
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [65]:
# Load spacy English module
NER = spacy.load("en_core_web_sm")

## 2. Load 20th Century text file

In [66]:
# Load the text file

with open(os.path.join(path, '02 Data', 'Prepared Data', '20thCentury_Events_Updated.txt'), 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

In [67]:
print(data)



In [68]:
# Clean and Wrangle text file by removing citation numbers in brackets from body of text
Updated_20thCentury_Events = re.sub(r'\[\d+\]', '', data)  

In [69]:
# Write the cleaned text back to a new file
with open('Updated_20thCentury_Events.txt', 'w') as file:
    file.write(Updated_20thCentury_Events)

In [70]:
# Load the cleaned text file

with open('Updated_20thCentury_Events.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

In [71]:
# Clean and Wrangle text file by removing the [edit] text by every header
Cleaned_20thCentury_Events = re.sub(r'\[edit]', '', data)

In [72]:
# Write the cleaned text back to a new file
with open('Cleaned_20thCentury_Events.txt', 'w') as file:
    file.write(Cleaned_20thCentury_Events)

In [73]:
print(Cleaned_20thCentury_Events[:1000])

From Wikipedia, the free encyclopediaThe 20th century changed the world in unprecedented ways. The World Wars sparked tension between countries and led to the creation of atomic bombs, the Cold War led to the Space Race and the creation of space-based rockets, and the World Wide Web was created. These advancements have played a significant role in citizens' lives and shaped the 21st century into what it is today.Historic events in the 20th centuryWorld at the beginning of the centuryMain article: Edwardian eraThe new beginning of the 20th century marked significant changes. The 1900s saw the decade herald a series of inventions, including the automobile, airplane and radio broadcasting.From 1914 to 1918, the First World War, and its aftermath, caused major changes in the power balance of the world, destroying or transforming some of the most powerful empires."The war to end all wars": World War I (1914–1918)Main article: World War IArrest of a suspect in Sarajevo following the Assassin

In [74]:
book = NER(data)

In [75]:
# Visualize identified entities
displacy.render(book[273:373], style = "ent", jupyter = True)

## 3. Get named entity list per sentence

In [76]:
df_sentences = [] # empty shell to store results

# Loop through sentences, get entity list for each sentence
for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})
    
df_sentences = pd.DataFrame(df_sentences)

In [77]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,"(From, Wikipedia, ,, the, free, encyclopediaTh...","[Wikipedia, encyclopediaThe 20th century]"
1,"(The, World, Wars, sparked, tension, between, ...","[The World Wars, the Cold War, the Space Race,..."
2,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
3,"(Historic, events, in, the, 20th, century[edit...","[the 20th, Edwardian, the 20th century]"
4,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"
5,"(From, 1914, to, 1918, ,, the, First, World, W...","[1914 to 1918, the First World War]"
6,"(""The)",[]
7,"(war, to, end, all, wars, "", :, World, War, I,...","[World War I, World War IArrest, Sarajevo, Arc..."
8,"(The, war, and, by, extension, the, century, a...","[the century, Sarajevo, the Austro-Hungarian E..."
9,"(This, was, similar, to, how, the, 9/11, was, ...","[9/11, the 21st century]"


## 4. Load Country names

In [78]:
# Import countries
countries_df = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', "List of countries.csv"), index_col = 0)

In [79]:
countries_df.head()

Unnamed: 0,country_name
1,Afghanistan
2,Albania
3,Algeria
4,Andorra
5,Angola


## 5. Filtering countries from the text file

In [80]:
# Function to filter out entities not of interest

def filter_entity(ent_list, countries_df):
    return [ent for ent in ent_list 
            if ent in list(countries_df['country_name'])]

In [81]:
df_sentences['character_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, countries_df))

In [82]:
df_sentences['character_entities'].head(20)

0                                        []
1                                        []
2                                        []
3                                        []
4                                        []
5                                        []
6                                        []
7                                        []
8                                        []
9                                        []
10                                       []
11                                       []
12                         [France, Russia]
13    [Germany, Austria, Hungary, Bulgaria]
14                                 [Russia]
15                        [Germany, Russia]
16                                [Germany]
17                                [Ukraine]
18                                [Germany]
19                                       []
Name: character_entities, dtype: object

In [83]:
# Filter out sentences that don't have any character entities

df_sentences_filtered = df_sentences[df_sentences['character_entities'].map(len) > 0]

In [84]:
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,character_entities
333,"(After, the, French, withdrawal, from, its, fo...","[French, 21 July 1954, Vietnam, two, Korea, th...",[Vietnam]
339,"(The, plan, went, awry, ,, with, Nixon, , del...","[Nixon, Cambodia, South Vietnamese]",[Cambodia]
342,"(Saigon, was, captured, on, 30, April, 1975, ,...","[Saigon, 30 April 1975, Vietnam, Communist, a ...",[Vietnam]
344,"(Cuba, ,, under, Fidel, Castro, 's, socialist,...","[Cuba, Fidel Castro's, the Soviet Union]",[Cuba]
345,"(This, was, obviously, disquieting, to, the, U...","[the United States, Cuba]",[Cuba]
349,"(Main, article, :, Space, RaceWith, Cold, War,...","[Space RaceWith Cold War, the Soviet Union, Un...",[United States]
358,"(In, the, 1990s, ,, work, on, the, Internation...","[the 1990s, the International Space Station, t...","[Russia, Japan, Canada]"
363,"(Mikhail, Gorbachev, ,, its, last, leader, ,, ...","[Mikhail Gorbachev, Solidarity, the Berlin Wal...",[Lithuania]
364,"(Boris, Yeltsin, ,, president, of, Russia, ,, ...","[Boris Yeltsin, Russia]",[Russia]
417,"(The, influence, of, China, and, India, was, a...","[China, India, West]",[India]


## 6. Create relationships

In [85]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    char_list = sum((df_sentences_filtered.loc[i: end_i].character_entities), [])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [86]:
relationship_df = pd.DataFrame(relationships)

In [87]:
relationship_df

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Russia,Germany
3,Germany,Austria
4,Austria,Hungary
...,...,...
588,Lithuania,Russia
589,Lithuania,Russia
590,Lithuania,Russia
591,Lithuania,Russia


In [88]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Germany,Russia
3,Austria,Germany
4,Austria,Hungary


In [89]:
# Summarize the interactions

relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [90]:
relationship_df.head(10)

Unnamed: 0,source,target,value
0,France,Russia,12
1,Germany,Russia,26
2,Austria,Germany,16
3,Austria,Hungary,6
4,Bulgaria,Hungary,6
5,Bulgaria,Russia,5
6,Germany,Ukraine,10
7,Germany,Italy,26
8,France,Poland,14
9,France,Germany,24


In [91]:
relationship_df.to_csv(os.path.join(path, '02 Data', 'Prepared Data' "20thCentury_relationship2.csv"))