In [1]:
from bs4 import BeautifulSoup as bs
import lxml
import nltk
import os
import string

nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jason\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Placeholder stored when a correspondent has no ``persname`` element or that
# element carries no ``key`` attribute.
_NO_BIO = "None Available"


def _party_name(action):
    """Return the text of the action's ``persname`` child, else of its ``orgname`` child, else None."""
    for tag_name in ("persname", "orgname"):
        tag = action.find(tag_name)
        if tag is not None:
            return tag.text
    return None


def _party_bio(action):
    """Return the ``key`` attribute of the action's ``persname`` child, or the placeholder string."""
    person = action.find("persname")
    if person is None:
        return _NO_BIO
    return person.attrs.get("key", _NO_BIO)


def extract_info(filepath):
    """Parse one letter file and collect its correspondence metadata and body text.

    The file is read as UTF-8 and parsed with BeautifulSoup using the "lxml"
    parser. Every ``correspaction`` element is inspected; its ``type``
    attribute ("sent" or "received") decides which keys are filled in.

    Parameters
    ----------
    filepath : str
        Path of the letter file to read.

    Returns
    -------
    dict
        Always contains ``"body"``: the text of the first ``<p>`` inside the
        first ``<div type="transcription">``, lower-cased, with ASCII
        punctuation removed and a fixed set of special characters replaced by
        ``"_"``. It is the empty string when that div or paragraph is missing.
        When a "sent" action exists: ``"unique_id"``, ``"sender_bio"`` and,
        if available, ``"sender"`` and ``"date"``.
        When a "received" action exists: ``"reciever_bio"`` and, if
        available, ``"reciever"`` (key spelling kept as-is for existing
        consumers).

    Raises
    ------
    AttributeError
        If the document has no ``tei`` element.
    KeyError
        If the ``tei`` element has no ``xml:id`` attribute.
    """
    dictionary_of_interest = {}

    # The context manager closes the file; no explicit close() is needed.
    with open(filepath, "r", encoding="utf8") as file:
        bs_content = bs(file.read(), "lxml")

    unique_id = bs_content.find("tei").attrs["xml:id"]

    for deets in bs_content.find_all("correspaction"):
        action_type = deets.attrs.get("type")

        if action_type == "sent":
            # unique_id is only recorded together with a "sent" action.
            dictionary_of_interest["unique_id"] = unique_id

            # A sender that is an organisation is stored under "sender",
            # not under the receiver key.
            sender = _party_name(deets)
            if sender is not None:
                dictionary_of_interest["sender"] = sender

            date_tag = deets.find("date")
            if date_tag is not None and "when" in date_tag.attrs:
                dictionary_of_interest["date"] = date_tag.attrs["when"]

            dictionary_of_interest["sender_bio"] = _party_bio(deets)

        elif action_type == "received":
            receiver = _party_name(deets)
            if receiver is not None:
                dictionary_of_interest["reciever"] = receiver

            dictionary_of_interest["reciever_bio"] = _party_bio(deets)

    try:
        free_text = bs_content.find_all("div", {"type": "transcription"})[0].p.text
    except (AttributeError, IndexError):
        # IndexError: no transcription div at all; AttributeError: div without <p>.
        free_text = ""

    # cleaning of the data: lower-case, strip ASCII punctuation, then mask
    # the remaining special characters with an underscore.
    free_text = free_text.lower().translate(str.maketrans('', '', string.punctuation))
    go_away_chars = ['’', '“', '‘', '〈', '〉', '–', '♂', '…', '♀', '〈', '〉', '☿', '§', '⊙', '▵', '∴', '„', '✓']
    for char in go_away_chars:
        free_text = free_text.replace(char, "_")
    dictionary_of_interest["body"] = free_text

    return dictionary_of_interest

def generate_feature_data(free_text,feature_set):
    """Build a 0/1 presence vector for ``feature_set`` against ``free_text``.

    Each entry of ``feature_set`` is tested with the ``in`` operator, so when
    ``free_text`` is a string this is a substring test, and when it is a
    list/set it is a membership test.

    Returns a list of ints (1 = present, 0 = absent) in the iteration order
    of ``feature_set``.
    """
    return [int(candidate in free_text) for candidate in feature_set]

In [3]:
# Directory whose entries are each passed to extract_info.
path = "dcp-data/letters/"
files = os.listdir(path)
# Drops the first and the last directory entry.
# to be commented out depending on who is running the code (lol)
files = files[1:]
files = files[:len(files)-1]

# Tokens from letters whose sender is "Darwin, C. R." vs. all other letters.
words_darwin = []
words_not_darwin = []

# Maximum number of files to process; 0 means "process every file".
i = 0
cap = 0

if cap == 0:
    cap = len(files)

# Slicing enforces the cap exactly: at most `cap` files are processed.
for i, file_target in enumerate(files[:cap], start=1):
    dict_cur = extract_info(os.path.join(path, file_target))

    cur_words = words_not_darwin
    if dict_cur.get("sender") == "Darwin, C. R.":
        cur_words = words_darwin

    text_tokens = word_tokenize(dict_cur["body"])

    # Keep every multi-character token; of the one-character tokens keep
    # only the words "i" and "a".
    cur_words.extend(
        word for word in text_tokens
        if len(word) != 1 or word in ("i", "a")
    )

    # Progress in percent; the carriage return is dropped on the final line
    # so the last value stays visible.
    print(round((i/cap)*100,2),end="\r"*(i!=cap))

print("")
print("Analysis Finished")
print(f"When all words are extracted, we have got a dataset of {len(words_not_darwin)} words in letters TO Darwin")
print(f"When all words are extracted, we have got a dataset of {len(words_darwin)} words in letters FROM Darwin")

100.0
Analysis Finished
When all words are extracted, we have got a dataset of 593445 words in letters TO Darwin
When all words are extracted, we have got a dataset of 695012 words in letters FROM Darwin


In [4]:
# Number of most frequent words kept per group.
N_FEATURE_WORDS = 4000
# Progress is refreshed once per this many words instead of once per word.
PROGRESS_EVERY = 10000

unique_both = {"darwin" : words_darwin, "not_darwin" : words_not_darwin}
for key in list(unique_both):
    words = unique_both[key]
    total_words = len(words)
    print(total_words)
    print("")

    # Count occurrences of every distinct word.
    unique_words = {}
    for counter, word in enumerate(words, start=1):
        unique_words[word] = unique_words.get(word, 0) + 1
        if counter % PROGRESS_EVERY == 0 or counter == total_words:
            print(round((counter/total_words)*100,2),end="\r")

    # Most frequent first; sorted() is stable, so ties keep first-seen order.
    sorted_unique_words = dict(
        sorted(unique_words.items(), key=lambda item: item[1], reverse=True)
    )

    feature_words = list(sorted_unique_words)[:N_FEATURE_WORDS]

    # One word per line; the context manager closes the file.
    with open("dump_" + key + ".txt","w",encoding="utf8") as output:
        for word in feature_words:
            try:
                output.write(word +"\n")
            except UnicodeEncodeError:
                # Word cannot be encoded as UTF-8: report it and carry on.
                print(word)

    # From here on unique_both maps each group to its feature-word list.
    unique_both[key] = feature_words
# print(dict_test)

695012

593445

100.0

In [5]:
# Write each group's feature words to its dump file, then read the file back,
# so feature_words_all holds the lists as stored on disk
# (index 0 = "darwin", index 1 = "not_darwin", following the dict order below).
filenames = {"darwin": "dump_darwin.txt", "not_darwin": "dump_not_darwin.txt"}
feature_words_all = []
for key in filenames.keys():
    file = filenames[key]

    # One word per line; the context manager closes the file.
    with open(file, "w", encoding="utf8") as output:
        for word in unique_both[key]:
            try:
                output.write(word +"\n")
            except UnicodeEncodeError:
                # Word cannot be encoded as UTF-8: report it and carry on.
                print(word)

    with open(file, encoding="utf8") as f:
        # Strip only the line terminator, so a final line without a newline
        # does not lose its last character.
        lines_cleaned = [line.rstrip("\n") for line in f]

    feature_words = lines_cleaned
    feature_words_all.append(feature_words)

# print(sum(list(sorted_unique_words.values())[:4000]))
# print(sum(list(sorted_unique_words.values())[4000:]))

In [6]:
# should check the sender before choosing which feature_words to use (feature_words_all[0] = darwin, feature_words_all[1] = not darwin)
#test = generate_feature_data(dict_test["body"],feature_words)
#print(dict_test["sender"])
#print(dict_test["reciever"])
#print(test)