In [1]:
from bs4 import BeautifulSoup as bs
import lxml
import nltk
import os
import string

nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import pandas as pd

df = pd.read_json("people.json")

import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import numpy as np
np.random.seed(400)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jason\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def extract_info(filepath):
    dictionary_of_interest = {}

    with open(filepath,"r",encoding="utf8") as file:
            
        content = file.readlines()
        content = "".join(content)
        
        bs_content = bs(content, "lxml")

        unique_id = bs_content.find("tei").attrs["xml:id"]

        letter_details = bs_content.find_all("correspaction")

        for deets in letter_details:

            if deets.attrs["type"] == "sent":

                try:
                    dictionary_of_interest["unique_id"] = unique_id
                    dictionary_of_interest["sender"] = deets.persname.text

                except AttributeError:
                    dictionary_of_interest["reciever"] = deets.orgname.text

                if "when" in list(deets.date.attrs.keys()):
                    dictionary_of_interest["date"] = deets.date.attrs["when"]

                try:
                    dictionary_of_interest["sender_bio"] = deets.persname.attrs["key"]
                except AttributeError:
                    dictionary_of_interest["sender_bio"] = "None Available"
                except KeyError:
                    dictionary_of_interest["sender_bio"] = "None Available"
                    
            if deets.attrs["type"] == "received":
                
                try:
                    dictionary_of_interest["reciever"] = deets.persname.text
                except AttributeError:
                    dictionary_of_interest["reciever"] = deets.orgname.text
                    
                try:
                    dictionary_of_interest["reciever_bio"] = deets.persname.attrs["key"]
                except AttributeError:
                    dictionary_of_interest["reciever_bio"] = "None Available"
                except KeyError:
                    dictionary_of_interest["reciever_bio"] = "None Available"
                    
        try:
            free_text = bs_content.find_all("div",{"type":"transcription"})[0].p.text
        except AttributeError:
#             print(bs_content) 
            free_text = ""

        # cleaning of the data
        free_text = free_text.lower().translate(str.maketrans('','',string.punctuation))
        go_away_chars = ['’', '“', '‘', '〈', '〉', '–', '♂', '…', '♀', '〈', '〉', '☿', '§', '⊙', '▵', '∴', '„', '✓']
        for char in go_away_chars:
            free_text = str.replace(free_text, char, "_")
        dictionary_of_interest["body"] = free_text

        file.close()

    return dictionary_of_interest

def generate_feature_data(free_text,feature_set):
    
    feature_bools = []
    
    for word in feature_set:
        feature_bools.append(1*(word in free_text))
        
    return feature_bools

def convert_dictionary_to_dataset_for_gender(data_set,feature_words,sender):
    
    # test = generate_feature_data(data_set[0]["body"],feature_words)
    complete_data = []
    incomplete_data = []
    targets = []
    
    occupations = df["occupation"].unique()
    
    skip_occs = []
    for occ in occupations:
        dft = df[df["occupation"]==occ]
        
        if dft.shape[0] <= 10:
            skip_occs.append(occ)
            
    for dictionary in not_darwin_dict:
        free_text = dictionary["body"]
        try:
            person_id = dictionary[sender]
        except:
            print(dictionary)
            continue
        number_key = person_id[21:-4]

        boolean_set = generate_feature_data(free_text,feature_words)
        try:
            dft = df[df["id"]=="DCP-IDENT-"+str(number_key)]
            gender = dft["occupation"].iloc[0]
    #         if dft["name"].iloc[0] == "John Jenner Weir":
    #             print(gender)
            if gender == "no common occupation" or gender in skip_occs:
                incomplete_data.append(boolean_set)
                continue
    #             gender = "NotAvailable"
    #             print(number_key)
        except:
            continue
    #         gender = "NotAvailable"
    #         print(number_key)
    #         print(reciever_id)
    #         print(df[df["id"]=="DCP-IDENT-"+str(number_key)])

        complete_data.append(boolean_set)
        targets.append(gender)
        
    return complete_data,incomplete_data,targets

def file_to_features(filepath):
    
    with open(filepath, encoding="utf8") as f:
        lines = f.readlines()

    lines_cleaned = []
    for word in lines:
        word = word[:-1]
        if word not in stopwords.words():
            lines_cleaned.append(word)

    feature_words = lines_cleaned
    
    f.close()
    return feature_words

In [3]:
path = "dcp-data/letters/"
files = os.listdir(path)
print(files[0],files[-1])
# to be commented out depending on who is running the code (lol) 
files = files[1:]
words_darwin = []
words_not_darwin = []

i = 0

not_darwin_dict = []

for file_target in files:
    dict_cur = extract_info(path+file_target)
    cur_words = words_not_darwin
    if "sender" in dict_cur.keys() and dict_cur["sender"] == "Darwin, C. R.":
        cur_words = words_darwin
#     else:
        not_darwin_dict.append(dict_cur)
    text_tokens = word_tokenize(dict_cur["body"])

    for word in text_tokens:
        if len(word) == 1 and not(word in ["i","a"]):
            continue
        else:
            cur_words.append(word)
            
    i += 1
    print(round((i/len(files))*100,2),end="\r"*(i!=len(files)))

print("")
print("Analysis Finished")
print(f"When all words are extracted, we have got a dataset of {len(words_not_darwin)} words in letters TO Darwin")
print(f"When all words are extracted, we have got a dataset of {len(words_darwin)} words in letters FROM Darwin")

.ipynb_checkpoints DCP-LETT-9999.xml
100.0
Analysis Finished
When all words are extracted, we have got a dataset of 593478 words in letters TO Darwin
When all words are extracted, we have got a dataset of 695012 words in letters FROM Darwin


In [4]:

# if "dump.txt" not in os.listdir("."):
#     unique_words = {}
#     for counter, word in enumerate(words):
#         try:
#             unique_words[word] += 1
#         except KeyError:
#             unique_words[word] = 1
#         print(round(((counter+1)/len(words))*100,2),end="\r")

#     sorted_unique_words = {key: value for key, value in sorted(unique_words.items(), key=lambda item: item[1],reverse=True)}

# #     print(list(sorted_unique_words.keys())[:1000])
#     feature_words_unclean = list(sorted_unique_words.keys())[:5000]
#     feature_words = []
#     for word in feature_words_unclean:
#         if word not in stopwords.words():
#             feature_words.append(word)
            
#     with open("dump.txt","w",encoding="utf8") as output:
#         for word in feature_words:
#             try:
#                 output.write(word +"\n")
#             except:
#                 print(word)

#     output.close()

feature_words = file_to_features("dump_not_darwin.txt")
# print(sum(list(sorted_unique_words.values())[:4000]))
# print(sum(list(sorted_unique_words.values())[4000:]))

In [5]:
# test = generate_feature_data(data_set[0]["body"],feature_words)
complete_data,incomplete_data,targets = convert_dictionary_to_dataset_for_gender(not_darwin_dict,feature_words,"sender_bio")

In [6]:
unique_tags = {}
for val in targets:
    try:
        unique_tags[val] += 1
    except:
        unique_tags[val] = 1
        
print(unique_tags)

sorted_unique_words = {key: value for key, value in sorted(unique_tags.items(), key=lambda item: item[1],reverse=True)}

print(list(sorted_unique_words.values())[0]/sum(sorted_unique_words.values()))

{'publisher': 239, 'zoologist': 413, 'traveller': 103, 'botanist': 1306, 'philosopher': 25, 'naturalist': 358, 'politician': 115, 'mathematician': 146, 'civil servant': 91, 'clergyman': 254, 'journalist': 16, 'barrister': 54, 'entomologist': 179, 'printer': 1, 'anatomist': 43, 'jurist': 13, 'lawyer': 34, 'surveyor': 21, 'teacher': 9, 'explorer': 24, 'architect': 4, 'businessman': 12, 'naval officer': 73, 'orientalist': 6, 'banker': 183, 'mining engineer': 20, 'physicist': 23, 'author': 185, 'statesman': 17, 'geologist': 214, 'librarian': 6, 'bookseller': 12, 'mineralogist': 10, 'embryologist': 13, 'editor': 68, 'gardener': 26, 'palaeontologist': 101, 'chemist': 64, 'nurseryman': 23, 'physiologist': 71, 'schoolteacher': 25, 'anthropologist': 21, 'civil engineer': 28, 'poet': 8, 'army officer': 23, 'diplomat': 30, 'engineer': 11, 'writer': 110, 'pharmacist': 60, 'inventor': 2, 'solicitor': 25, 'administrator': 1, 'missionary': 9, 'merchant': 8, 'social reformer': 6, 'judge': 20, 'novelis

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    complete_data, targets, test_size=0.4, random_state=42,
)

In [8]:
classifier = KNeighborsClassifier(5,weights="distance")
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print(score)
score = classifier.score(X_train, y_train)
print(score)

0.20870425321463898
0.9927440633245382


In [9]:
predictions = classifier.score(complete_data,targets)
print(predictions)

0.6790660862683023


In [10]:
classifier = DecisionTreeClassifier(max_depth=10)
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)


In [11]:
classes = classifier.predict_proba(X_test)
test = classifier.predict(X_test)
print(test[0])
print(y_test[0])
types = classifier.classes_
for i,row in enumerate(classes):
    sorted_row = sorted(row,reverse=True)
    print(len(sorted_row))
    temp_dict = {}
    for val in sorted_row:
        ind = np.where(row == val)[0][0]
        temp_dict[types[ind]] = val
    print(temp_dict)
    try:
        print(f"They were actually a {y_test[i]}, we predicted this with {temp_dict[y_test[i]]} certainty")
    except:
        print("Missed")

botanist
banker
69
{'botanist': 0.2520193861066236, 'zoologist': 0.08117932148626818, 'naturalist': 0.060177705977382874, 'clergyman': 0.051696284329563816, 'geologist': 0.04159935379644588, 'banker': 0.03998384491114701, 'author': 0.03877221324717286, 'entomologist': 0.036752827140549275, 'mathematician': 0.0327140549273021, 'publisher': 0.03150242326332795, 'politician': 0.027867528271405494, 'writer': 0.025040387722132473, 'palaeontologist': 0.023424878836833602, 'civil servant': 0.018578352180936994, 'physiologist': 0.016558966074313407, 'traveller': 0.01615508885298869, 'naval officer': 0.014539579967689823, 'barrister': 0.012924071082390954, 'pharmacist': 0.01050080775444265, 'anatomist': 0.008885298869143781, 'lawyer': 0.008077544426494346, 'civil engineer': 0.007269789983844911, 'solicitor': 0.006865912762520194, 'farmer': 0.006462035541195477, 'gardener': 0.004846526655896607, 'anthropologist': 0.0044426494345718905, 'army officer': 0.004038772213247173, 'judge': 0.00363489499

In [12]:
predictions = classifier.predict(incomplete_data)
print(set(predictions))

{'banker', 'traveller', 'naturalist', 'author', 'barrister', 'civil servant', 'botanist', 'zoologist', 'publisher', 'naval officer', 'clergyman', 'writer', 'editor', 'palaeontologist', 'politician', 'jurist', 'geologist'}


In [13]:
out = classifier.predict(complete_data)
# classifier.predict(test)

In [14]:
# for i,val in enumerate(out):
#     if val != targets[i]:
#         print(val,targets[i],i)
#         print(not_darwin_dict[i]["body"])
#         if val == "M" and targets[i] == "F":
#             print(not_darwin_dict[i]["sender"])
#             print(not_darwin_dict[i]["sender_bio"])
#             tag = not_darwin_dict[i]["sender_bio"][21:-4]
#             try:
#                 print(df[df["id"]=="DCP-IDENT-"+tag]["sex"].iloc[0])
#                 print(df[df["id"]=="DCP-IDENT-"+tag])
#             except:
#                 pass

In [15]:
# print(len(classifier.feature_importances_))

test_words = []
counter = 0
for i,val in enumerate(classifier.feature_importances_):
    if val != 0:
        counter += 1
        print(feature_words[i],val)
        
print(counter)

letter 0.008030201003028092
may 0.0076478104790743745
hope 0.007341898059911397
first 0.004588686287444624
long 0.007265419955120653
dr 0.008958863704058557
since 0.007341898059911397
thank 0.014175884629223428
way 0.004588686287444624
days 0.006883029431166937
subject 0.008392142835861297
believe 0.0061182483832594985
another 0.009177372574889249
soon 0.00955976309884297
answer 0.006883029431166937
seems 0.014399734587742874
came 0.01300127781442643
friend 0.009567145155289935
three 0.025101173059621975
beg 0.010827689573005289
edition 0.07339820565234752
whole 0.012106096311345856
plant 0.037133676878568174
kindly 0.010346474255902672
says 0.010203789244449243
full 0.0159074457964747
set 0.01166291098058841
origin 0.007341898059911397
line 0.008030201003028092
idea 0.008958863704058545
wh 0.04093286790687485
friends 0.007341898059911397
yr 0.004588686287444624
printed 0.004588686287444624
seed 0.0152907883625555
regret 0.004588686287444624
animal 0.014597543257672917
add 0.0181163664

In [16]:
classifier = AdaBoostClassifier()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print(score)
score = classifier.score(X_train, y_train)
print(score)

0.26409495548961426
0.2691292875989446


In [17]:
for i,val in enumerate(classifier.feature_importances_):
    if val != 0:
        print(feature_words[i],val)

edition 0.02
20th 0.32
book— 0.32
enfin 0.34


In [18]:
for i,dictionary in enumerate(not_darwin_dict):
    
    free_text = dictionary["body"]
    
    try:
        reciever_id = dictionary["reciever_bio"]
    except:
        print(dictionary)
        continue
    number_key = reciever_id[21:-4]
    
    dft = df[df["id"]=="DCP-IDENT-"+str(number_key)]
    
    try:
        key_words = dft["keywords"].iloc[0]
    except:
        key_words = "None"
    print(key_words)
    if i == 100:
        break

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


In [19]:
test = [1 for i in range(10)]
print(test[:16])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
