In [1]:
from bs4 import BeautifulSoup as bs
import lxml
import nltk
import os
import string

nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import pandas as pd

df = pd.read_json("people.json")

import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
np.random.seed(400)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jason\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def extract_info(filepath):
    dictionary_of_interest = {}

    with open(filepath,"r",encoding="utf8") as file:
            
        content = file.readlines()
        content = "".join(content)
        
        bs_content = bs(content, "lxml")

        unique_id = bs_content.find("tei").attrs["xml:id"]

        letter_details = bs_content.find_all("correspaction")

        for deets in letter_details:

            if deets.attrs["type"] == "sent":

                try:
                    dictionary_of_interest["unique_id"] = unique_id
                    dictionary_of_interest["sender"] = deets.persname.text

                except AttributeError:
                    dictionary_of_interest["reciever"] = deets.orgname.text

                if "when" in list(deets.date.attrs.keys()):
                    dictionary_of_interest["date"] = deets.date.attrs["when"]

                try:
                    dictionary_of_interest["sender_bio"] = deets.persname.attrs["key"]
                except AttributeError:
                    dictionary_of_interest["sender_bio"] = "None Available"
                except KeyError:
                    dictionary_of_interest["sender_bio"] = "None Available"
                    
            if deets.attrs["type"] == "received":
                
                try:
                    dictionary_of_interest["reciever"] = deets.persname.text
                except AttributeError:
                    dictionary_of_interest["reciever"] = deets.orgname.text
                    
                try:
                    dictionary_of_interest["reciever_bio"] = deets.persname.attrs["key"]
                except AttributeError:
                    dictionary_of_interest["reciever_bio"] = "None Available"
                except KeyError:
                    dictionary_of_interest["reciever_bio"] = "None Available"
                    
        try:
            free_text = bs_content.find_all("div",{"type":"transcription"})[0].p.text
        except AttributeError:
#             print(bs_content) 
            free_text = ""

        # cleaning of the data
        free_text = free_text.lower().translate(str.maketrans('','',string.punctuation))
        go_away_chars = ['’', '“', '‘', '〈', '〉', '–', '♂', '…', '♀', '〈', '〉', '☿', '§', '⊙', '▵', '∴', '„', '✓']
        for char in go_away_chars:
            free_text = str.replace(free_text, char, "_")
        dictionary_of_interest["body"] = free_text

        file.close()

    return dictionary_of_interest

def generate_feature_data(free_text,feature_set):
    
    feature_bools = []
    
    for word in feature_set:
        feature_bools.append(1*(word in free_text))
        
    return feature_bools

def convert_dictionary_to_dataset_for_gender(data_set,feature):
    
    complete_data = []
    targets = []
    for dictionary in data_set:
        free_text = dictionary["body"]
        try:
            reciever_id = dictionary["reciever_bio"]
        except:
            print(dictionary)
            continue
        number_key = reciever_id[21:-4]

        boolean_set = generate_feature_data(free_text,feature_words)
        try:
            gender = df[df["id"]=="DCP-IDENT-"+str(number_key)]["sex"].iloc[0]
            if gender == "":
                gender = "NotAvailable"
    #             print(number_key)
        except IndexError:
            continue
    #         gender = "NotAvailable"
    #         print(number_key)
    #         print(reciever_id)
    #         print(df[df["id"]=="DCP-IDENT-"+str(number_key)])

        complete_data.append(boolean_set)
        targets.append(gender)
        
    return complete_data,targets

In [3]:
path = "dcp-data/letters/"
files = os.listdir(path)
# to be commented out depending on who is running the code (lol) 
files = files[1:]
files = files[:len(files)-1]
words_darwin = []
words_not_darwin = []

i = 0

if cap == 0:
    cap = len(files)

for file_target in files:
    dict_cur = extract_info(path+file_target)
    cur_words = words_not_darwin
    if "sender" in dict_cur.keys() and dict_cur["sender"] == "Darwin, C. R.":
        cur_words = words_darwin
    text_tokens = word_tokenize(dict_cur["body"])

    for word in text_tokens:
        if len(word) == 1 and not(word in ["i","a"]):
            continue
        else:
            cur_words.append(word)

    if i == cap:
        break
    elif i < cap:
        i += 1
    else:
        print("Failed loop")
        break
    print(round((i/cap)*100,2),end="\r"*(i!=cap))

print("")
print("Analysis Finished")
print(f"When all words are extracted, we have got a dataset of {len(words_not_darwin)} words in letters TO Darwin")
print(f"When all words are extracted, we have got a dataset of {len(words_darwin)} words in letters FROM Darwin")

100.0
Analysis Finished
When all words are extracted, we have got a dataset of 593445 words in letters TO Darwin
When all words are extracted, we have got a dataset of 695012 words in letters FROM Darwin


In [6]:

if "dump.txt" not in os.listdir("."):
    unique_words = {}
    for counter, word in enumerate(words):
        try:
            unique_words[word] += 1
        except KeyError:
            unique_words[word] = 1
        print(round(((counter+1)/len(words))*100,2),end="\r")

    sorted_unique_words = {key: value for key, value in sorted(unique_words.items(), key=lambda item: item[1],reverse=True)}

#     print(list(sorted_unique_words.keys())[:1000])
    feature_words_unclean = list(sorted_unique_words.keys())[:5000]
    feature_words = []
    for word in feature_words_unclean:
        if word not in stopwords.words():
            feature_words.append(word)
            
    with open("dump.txt","w",encoding="utf8") as output:
        for word in feature_words:
            try:
                output.write(word +"\n")
            except:
                print(word)

    output.close()

with open('dump.txt', encoding="utf8") as f:
    lines = f.readlines()
    
lines_cleaned = []
for word in lines:
    word = word[:-1]
    lines_cleaned.append(word)
    
feature_words = lines_cleaned
# print(sum(list(sorted_unique_words.values())[:4000]))
# print(sum(list(sorted_unique_words.values())[4000:]))

In [17]:
test = generate_feature_data(data_set[0]["body"],feature_words)
complete_data = []
targets = []
for dictionary in data_set:
    free_text = dictionary["body"]
    try:
        reciever_id = dictionary["reciever_bio"]
    except:
        print(dictionary)
        continue
    number_key = reciever_id[21:-4]
    
    boolean_set = generate_feature_data(free_text,feature_words)
    try:
        dft = df[df["id"]=="DCP-IDENT-"+str(number_key)]
        gender = dft["sex"].iloc[0]
        if dft["name"].iloc[0] == "John Jenner Weir":
            print(gender)
        if gender == "":
            gender = "NotAvailable"
#             print(number_key)
    except:
#         continue
        gender = "NotAvailable"
#         print(number_key)
#         print(reciever_id)
#         print(df[df["id"]=="DCP-IDENT-"+str(number_key)])
        
    complete_data.append(boolean_set)
    targets.append(gender)
#     if counter == 10:
#         break
#     else:
#         counter += 1
# print(dict_test["sender"])
# print(dict_test["reciever"])
# print(test)

M
M
M
M
M
M
M
M
{'unique_id': 'DCP-LETT-2115F', 'sender': 'Darwin, C. R.', 'sender_bio': '../nameregs/nameregs_1.xml', 'body': '– july\xa01857—'}
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M
M


In [5]:
unique_tags = {}
for val in targets:
    try:
        unique_tags[val] += 1
    except:
        unique_tags[val] = 1
        
print(unique_tags)
print(list(unique_tags.values())[0]/sum(unique_tags.values()))

{'M': 7277, 'NotAvailable': 531, 'F': 324}
0.8948598130841121


In [41]:
X_train, X_test, y_train, y_test = train_test_split(
    complete_data, targets, test_size=0.4, random_state=42, stratify=targets
)

# def test_train_split_personal(data,targets,proportion = 0.5, seed = 200):
    
#     np.random.seed(seed)
    
#     train_data = []
#     test_data = []
#     train_targets = []
#     test_targets = []
#     for i,val in enumerate(data):
        
#         if np.random.random() >= proportion:
#             test_data.append(val)
#             test_targets.append(targets[i])
#         else:
#             train_data.append(val)
#             train_targets.append(targets[i])
            
#     return train_data, test_data, train_targets,  test_targets

# X_train, X_test, y_train, y_test = test_train_split_personal(complete_data, targets,proportion=0.4,)

In [42]:
classifier = KNeighborsClassifier(30)
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print(score)

0.8948662772825084


In [43]:
classifier = DecisionTreeClassifier(max_depth=10)
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print(score)

0.8881032892714418


In [35]:
print(y_test)

['M', 'M', 'M', 'M', 'M', 'M', 'M', 'NotAvailable', 'M', 'M', 'M', 'M', 'M', 'M', 'NotAvailable', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'NotAvailable', 'M', 'M', 'M', 'F', 'F', 'M', 'M', 'F', 'NotAvailable', 'M', 'M', 'M', 'M', 'M', 'NotAvailable', 'M', 'M', 'NotAvailable', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'NotAvailable', 'M', 'M', 'M', 'M', 'NotAvailable', 'M', 'M', 'NotAvailable', 'M', 'M', 'M', 'M', 'NotAvailable', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'NotAvailable', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'NotAvailable', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'NotAvailable', 'M', 'M', 'NotAvailable', 'M', 'M', 'M', 'F', 'M', 'M', 'NotAvailable', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'M',

In [38]:
out = classifier.predict(complete_data)
# classifier.predict(test)

In [40]:
for i,val in enumerate(out):
    if val != targets[i]:
        print(val,targets[i],i)
        print(data_set[i]["body"])
        if val == "M" and targets[i] == "F":
            print(data_set[i]["reciever"])
            print(data_set[i]["reciever_bio"])
            tag = data_set[i]["reciever_bio"][21:-4]
            try:
                print(df[df["id"]=="DCP-IDENT-"+tag]["sex"].iloc[0])
                print(df[df["id"]=="DCP-IDENT-"+tag])
            except:
                pass

M NotAvailable 1
i fear the enclosed is longer than you wished but a page in annals swallows up much ms i cd not make it shorter to do justice to work or to make my little notice at all interesting
M NotAvailable 6
i hope that you will permit me to republish in a corrected form my paper on climbing plants which appears in the 9th vol 1865 of your journal i wish it the paper appear as a second part to a new work which i shall soon send to press— if you grant my request i further hope that you will be so good as to allow me to use the 13 woodblocks illustrating the paper  in this case i request that they may be sent to mr murray of albermarle st marked as for my intended volume
M NotAvailable 10
i am very much obliged to you for your great kindness in having made for m〈e〉 so beautiful a present as the fenderstool 1 line illeg in our drawing room 2 lines illeg grateful to you for going to so much trouble i will venture to send you soon a copy of a book just published by me viz insectivoro

In [14]:
# print(len(classifier.feature_importances_))

test_words = []
counter = 0
for i,val in enumerate(classifier.feature_importances_):
    if val != 0:
        counter += 1
        print(feature_words[i],val)
        
print(counter)

much 0.01591515895685456
mr 0.013087639396811523
send 0.014238201102025722
whether 0.01572674030814659
read 0.014155159113599963
thank 0.005608988312919224
glad 0.011836240110330692
give 0.014561796581617216
received 0.004206741234689419
subject 0.013574694404375932
specimens 0.0063101118520341275
told 0.015293080296974581
darwin 0.05890012243831705
form 0.007211556402324719
year 0.01325286540913007
father 0.05324280943617706
high 0.01985617194472785
seed 0.04035006453656507
observed 0.009821303127297643
order 0.014892639132190326
night 0.018546680060313205
allow 0.013945158561804552
remain 0.021929841560432126
w 0.01373958166585319
led 0.007572134222440954
experiment 0.00701123539114903
possess 0.0063101118520341275
formed 0.016322968078559055
mother 0.011485071307406032
captain 0.029840849356582065
journey 0.02284831886582357
easy 0.005608988312919224
15 0.012625561530784014
reached 0.016727982792059098
child 0.029875531643422864
alive 0.014131201170716964
transactions 0.028508193286

In [15]:
for i,dictionary in enumerate(data_set):
    
    free_text = dictionary["body"]
    
    try:
        reciever_id = dictionary["reciever_bio"]
    except:
        print(dictionary)
        continue
    number_key = reciever_id[21:-4]
    
    dft = df[df["id"]=="DCP-IDENT-"+str(number_key)]
    
    try:
        key_words = dft["keywords"].iloc[0]
    except:
        key_words = "None"
    print(key_words)
    if i == 100:
        break

[['name', 'Darwin, C. R.'], ['societal', 'Cambridge'], ['place', 'Cambridge'], ['place', 'Delamere'], ['place', 'Cheshire'], ['place', 'Sandown'], ['place', 'Isle of Wight'], ['p', 'Clergyman']]
None
[['nationality', 'German'], ['a', 'zoologist']]
[['society', 'FRS'], ['societal', 'Putney College for Civil Engineering '], ['societal', 'Owens College'], ['societal', 'St Bartholomew’s Hospital '], ['societal', 'Royal Institution'], ['societal', 'Royal College of Chemistry'], ['societal', 'Royal School of Mines'], ['societal', 'Chemical Society'], ['societal', 'Institute of Chemistry'], ['place', 'Putney'], ['place', 'Manchester'], ['place', 'London'], ['a', 'Chemist']]
[['society', 'Foreign member, Royal Society '], ['societal', 'Harvard University'], ['societal', 'American Academy of Arts and Sciences '], ['societal', 'American Association for the Advancement of Science '], ['societal', 'Smithsonian Institution'], ['societal', 'Royal Society'], ['place', 'Harvard'], ['place', 'North Ame