### This script contains the following:
1. Importing Libraries
2. Importing Data
3. Data Wrangling
4. Exporting the Data

## 1. Importing Libraries

In [14]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re
import nltk
from nltk.tokenize import word_tokenize

## 2. Importing Data

In [16]:
# Load the article as one continuous string
# NOTE(review): the path is absolute and machine-specific; adjust it when running elsewhere
path = os.path.join(r'C:\Users\hp\20th-century\Data', '20th Century Events 1.4.txt')

# No encoding is passed, so the platform default applies and bytes that cannot be
# decoded are dropped silently (errors='ignore')
with open(path, 'r', errors='ignore') as file:
    # Replace each line break with a space rather than an empty string, so the last
    # word of one line is not fused with the first word of the next before tokenization
    data = file.read().replace('\n', ' ')

In [17]:
# Read the list of countries into a dataframe, using the first CSV column as the index
path2 = r'C:\Users\hp\20th-century\Data'
countries_csv_path = os.path.join(path2, 'countries_list_20th_century_1.5.csv')

countries = pd.read_csv(countries_csv_path, index_col=0)

## 3. Data Wrangling

### 3.1. The article 

In [20]:
# Tokenize the article text with NLTK (English is the default language, spelled out here)
tokenized_word = word_tokenize(data, language='english')

In [21]:
# Keep every token, in order, except those that are exactly 'the' or 'The'
stop_word = ['the', 'The']
filtered_words = [token for token in tokenized_word if token not in stop_word]

In [22]:
# Rebuild the text from the tokens with single spaces between them. Joining the
# tokens (instead of taking str() of the list) avoids the list repr's quotes, commas
# and brackets, which left tokens four spaces apart and so prevented multi-word
# names such as 'Great Britain' from matching in later replacements.
joined_text = ' '.join(filtered_words)

# Substitute apostrophes, commas, parentheses and square brackets with a space
sans_punctuation = re.sub(r"[',([)\]]",
                          " ",
                          joined_text)

# Remove each run of digits that follows a space (or starts the text),
# together with that space
digits_removed = re.sub(r"(?:^| )\d+",
                        "",
                        sans_punctuation)

# Collapse the whitespace left behind by the removals into single spaces
sans_numbers = re.sub(r"\s+", " ", digits_removed).strip()

In [23]:
# Replacing shorthand country names in the txt file
adding_ussr = sans_numbers.replace('USSR', 'Soviet Union')

# Expand the dotted and undotted abbreviations. 'US'/'USA' are matched as whole
# words only, so other capitalised words that merely contain 'US' are left intact.
dotted_expanded = re.sub(r'U\.S\.(?:A\b\.?)?', 'United States', adding_ussr)
abbreviations_expanded = re.sub(r'\bUSA?\b', 'United States', dotted_expanded)

# Shorten the full official name first so the next step cannot turn it into
# 'United States of United States'
official_name_shortened = re.sub(r'United\s+States\s+of\s+America\b',
                                 'United States',
                                 abbreviations_expanded)

# Replace 'America' only as a whole word (so 'American' and 'Americas' are untouched)
# and only when it is not part of 'North/South/Central/Latin America'
adding_us = re.sub(r'\b((?:North|South|Central|Latin)\s+)?America\b',
                   lambda m: m.group(0) if m.group(1) else 'United States',
                   official_name_shortened)

# 'Great Britain' (with any amount of whitespace between the words) and plain
# 'Britain' both become 'United Kingdom'
changing_gbr = re.sub(r'\b(?:Great\s+)?Britain\b', 'United Kingdom', adding_us)


In [24]:
# Put 'the' in front of the three country names that are written with the article
# (the stop-word step removed every 'the'/'The' token). The lookbehind skips names
# that already carry 'the', so re-running this cell does not produce 'the the ...';
# \s+ lets a name match even when its two words are separated by several spaces,
# and the replacement writes the name back with a single space.
changing_gbr = re.sub(
    r'(?<!\bthe )\b(Soviet\s+Union|United\s+States|United\s+Kingdom)\b',
    lambda m: 'the ' + ' '.join(m.group(1).split()),
    changing_gbr,
)

### 3.2. Countries csv: 

In [26]:
# Show the dimensions of the countries dataframe as (rows, columns)
countries.shape


(209, 1)

In [27]:
# Preview the first rows of the countries dataframe
countries.head()

Unnamed: 0_level_0,country_name
Column1,Unnamed: 1_level_1
1.0,Afghanistan
2.0,Albania
3.0,Algeria
4.0,Andorra
5.0,Angola


In [28]:
# Append country names that were not on the imported list (for example states that
# no longer exist) as extra rows, renumbering the index from 0
new_countries = [[name] for name in ('Soviet Union', 'Yugoslavia', 'Czechoslovakia', 'Burma', 'Guam')]
df_new_countries = pd.DataFrame(new_countries, columns=['country_name'])

frames_to_stack = (countries, df_new_countries)
combined_countries = pd.concat(frames_to_stack, ignore_index=True)
combined_countries

Unnamed: 0,country_name
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola
...,...
209,Soviet Union
210,Yugoslavia
211,Czechoslovakia
212,Burma


In [29]:
# Start the alias column as a plain copy of the country names
combined_countries = combined_countries.assign(
    country_alias=combined_countries['country_name']
)
combined_countries

Unnamed: 0,country_name,country_alias
0,Afghanistan,Afghanistan
1,Albania,Albania
2,Algeria,Algeria
3,Andorra,Andorra
4,Angola,Angola
...,...,...
209,Soviet Union,Soviet Union
210,Yugoslavia,Yugoslavia
211,Czechoslovakia,Czechoslovakia
212,Burma,Burma


In [30]:
# Create aliases for countries to match how they are referred to in the article.
# Keys are matched against whole cell values, so they include the exact
# surrounding spaces of the stored names.
alias_by_name = {
    "  China, People's Republic of ": "China",
    "   Micronesia, Federated States of ": "Micronesia",
    "   North Macedonia ": "Macedonia",
    "  Bosnia and Herzegovina ": "Bosnia",
    "  East Timor ": "Timor",
    "   São Tomé and Príncipe ": "Príncipe",
    "   Korea, North ": "North Korea",
    "   Korea, South ": "South Korea",
    "Soviet Union": "the Soviet Union",
    "   United States ": "the United States",
    "   United Kingdom ": "the United Kingdom",
}
combined_countries['country_alias'] = combined_countries['country_alias'].replace(alias_by_name)

In [31]:
# Pull the alias column out of the dataframe as a plain Python list
alias_column = combined_countries['country_alias']
country_list = alias_column.tolist()


In [32]:
# For every item in the list, take out the surrounding whitespace and add it to a new list.
# Each item goes through str() first, so a non-string entry is stored as its stripped
# text form (the former isinstance check had two identical branches and was removed).
cleaned_country_list = [str(item).strip() for item in country_list]

In [33]:
# Build the output dataframe: the combined countries plus a column holding the
# stripped aliases. assign() returns a new dataframe, so combined_countries itself
# is not modified (a plain assignment would only create a second name for it).
df_cleaned_countries = combined_countries.assign(clean_country_alias=cleaned_country_list)

In [34]:
# Check the output: display the dataframe with the new clean_country_alias column
df_cleaned_countries

Unnamed: 0,country_name,country_alias,clean_country_alias
0,Afghanistan,Afghanistan,Afghanistan
1,Albania,Albania,Albania
2,Algeria,Algeria,Algeria
3,Andorra,Andorra,Andorra
4,Angola,Angola,Angola
...,...,...,...
209,Soviet Union,the Soviet Union,the Soviet Union
210,Yugoslavia,Yugoslavia,Yugoslavia
211,Czechoslovakia,Czechoslovakia,Czechoslovakia
212,Burma,Burma,Burma


## 4. Exporting the Data

In [36]:
# Folder that receives the processed article text
dir_path = r'C:\Users\hp\20th-century\Data'
# Full path of the output file inside that folder
path3 = os.path.join(dir_path, '20th Century Events_sans_punc.txt')
# Save the processed text; opening in 'w' mode overwrites any existing file
with open(path3, mode='w') as out_file:
    out_file.write(changing_gbr)

In [60]:
# Write the cleaned countries dataframe to a CSV file in the Data folder
cleaned_countries_csv = os.path.join(path2, 'cleaned_countries_list.csv')
df_cleaned_countries.to_csv(cleaned_countries_csv)