In [4]:
# Parsing, text-processing and filesystem dependencies.
from bs4 import BeautifulSoup as bs
# lxml is the parser backend named in the bs(content, "lxml") call used to read the letters.
import lxml
import nltk
import os
import string

# Download the NLTK 'stopwords' and 'punkt' packages before importing the helpers that use them.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import pandas as pd

# Person records; later cells filter this frame on its "id" column and read the
# "occupation" and "keywords" columns.
df = pd.read_json("people.json")

import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import numpy as np
# Seed NumPy's global random number generator.
np.random.seed(400)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jason\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def _correspondent_name(action):
    """Return the text of the action's person name, else its organisation name, else None."""
    for tag in (action.persname, action.orgname):
        if tag is not None:
            return tag.text
    return None


def _correspondent_key(action):
    """Return the ``key`` attribute of the action's person name, or "None Available"."""
    person = action.persname
    if person is None:
        return "None Available"
    return person.attrs.get("key", "None Available")


def _clean_body(free_text):
    """Lower-case the text, drop ASCII punctuation and mask selected symbols with "_"."""
    free_text = free_text.lower().translate(str.maketrans('', '', string.punctuation))
    go_away_chars = ['’', '“', '‘', '〈', '〉', '–', '♂', '…', '♀', '〈', '〉', '☿', '§', '⊙', '▵', '∴', '„', '✓']
    # The masking happens after the punctuation strip, so the inserted "_" survives.
    for char in go_away_chars:
        free_text = free_text.replace(char, "_")
    return free_text


def extract_info(filepath):
    """Parse one letter file into a flat dictionary of metadata and cleaned body text.

    Parameters
    ----------
    filepath : str
        Path to a UTF-8 encoded letter file, parsed with BeautifulSoup's "lxml" parser.

    Returns
    -------
    dict
        Always contains ``"body"``: the cleaned text of the first ``<p>`` inside the
        first ``<div type="transcription">``, or ``""`` when that is missing.
        Depending on the ``correspAction`` elements present it may also contain:

        * ``"unique_id"``, ``"sender"``, ``"date"``, ``"sender_bio"`` for the "sent" action;
        * ``"reciever"``, ``"reciever_bio"`` for the "received" action.

        The ``"reciever"`` spelling is kept because other cells read these keys.
        ``"sender"`` / ``"reciever"`` hold the person name, falling back to the
        organisation name; the key is left out when the action names neither.
        The ``*_bio`` values are the person name's ``key`` attribute, or
        ``"None Available"``.

    Raises
    ------
    AttributeError
        If the document has no ``tei`` element.
    KeyError
        If the ``tei`` element has no ``xml:id`` attribute.
    """
    dictionary_of_interest = {}

    # The context manager closes the file; no explicit close() is required.
    with open(filepath, "r", encoding="utf8") as file:
        bs_content = bs(file.read(), "lxml")

    # Tag names are looked up in lowercase, as in the rest of this notebook
    # (presumably because the lxml HTML parser lowercases them — confirm).
    unique_id = bs_content.find("tei").attrs["xml:id"]

    for deets in bs_content.find_all("correspaction"):
        action_type = deets.attrs.get("type")

        if action_type == "sent":
            dictionary_of_interest["unique_id"] = unique_id

            # An organisation sender is stored under "sender" as well; the previous
            # fallback wrote it to "reciever" by mistake.
            sender_name = _correspondent_name(deets)
            if sender_name is not None:
                dictionary_of_interest["sender"] = sender_name

            # Not every action carries a <date>, and not every <date> has "when".
            if deets.date is not None and "when" in deets.date.attrs:
                dictionary_of_interest["date"] = deets.date.attrs["when"]

            dictionary_of_interest["sender_bio"] = _correspondent_key(deets)

        elif action_type == "received":
            receiver_name = _correspondent_name(deets)
            if receiver_name is not None:
                dictionary_of_interest["reciever"] = receiver_name

            dictionary_of_interest["reciever_bio"] = _correspondent_key(deets)

    # A letter without a transcription div, or a div without <p>, yields an empty body.
    transcription = bs_content.find("div", {"type": "transcription"})
    if transcription is not None and transcription.p is not None:
        free_text = transcription.p.text
    else:
        free_text = ""

    dictionary_of_interest["body"] = _clean_body(free_text)

    return dictionary_of_interest

def generate_feature_data(free_text,feature_set):
    """Encode which feature words occur in a text as a list of 0/1 flags.

    The result has one entry per item of ``feature_set``, in the same order.
    An entry is 1 when ``word in free_text`` holds and 0 otherwise; for a string
    ``free_text`` that is a substring test, not a whole-word match.
    """
    return [1 if word in free_text else 0 for word in feature_set]

def convert_dictionary_to_dataset_for_gender(data_set,feature_words,sender):
    """Turn parsed letters into feature vectors labelled with the correspondent's occupation.

    Despite the function name, the label is read from the "occupation" column of
    the module-level ``df`` frame.

    Parameters
    ----------
    data_set : iterable of dict
        Parsed letters; each needs a "body" entry and, to be usable, a ``sender`` entry.
    feature_words : sequence of str
        Vocabulary passed to ``generate_feature_data``.
    sender : str
        Dictionary key holding the correspondent's bio reference (e.g. "sender_bio").

    Returns
    -------
    (complete_data, incomplete_data, targets)
        ``complete_data`` and ``targets`` are parallel lists for letters with a
        usable occupation. ``incomplete_data`` holds the feature vectors of
        letters whose occupation is "no common occupation". Letters with no
        matching person record are dropped.
    """
    complete_data = []
    incomplete_data = []
    targets = []

    # Iterate the argument, not the notebook-level list the function used to read.
    for dictionary in data_set:
        free_text = dictionary["body"]
        try:
            person_id = dictionary[sender]
        except KeyError:
            # Show the letter that lacks the requested bio key, then skip it.
            print(dictionary)
            continue
        # Presumably strips a fixed-length prefix and a 4-character suffix from the
        # bio reference to leave the numeric identifier — verify against the data.
        number_key = person_id[21:-4]

        boolean_set = generate_feature_data(free_text,feature_words)
        try:
            dft = df[df["id"]=="DCP-IDENT-"+str(number_key)]
            occupation = dft["occupation"].iloc[0]
        except (IndexError, KeyError):
            # No person record for this identifier (or the column is absent): skip.
            continue

        if occupation == "no common occupation":
            incomplete_data.append(boolean_set)
            continue

        complete_data.append(boolean_set)
        targets.append(occupation)

    return complete_data,incomplete_data,targets

def file_to_features(filepath):
    """Read a one-word-per-line vocabulary file and drop NLTK stopwords.

    Parameters
    ----------
    filepath : str
        Path to a UTF-8 text file with one candidate feature word per line.

    Returns
    -------
    list of str
        The words in file order, excluding any that appear in ``stopwords.words()``.
    """
    # Build the stopword lookup once rather than once per line, and use a set so
    # each membership test does not scan a list.
    stop_words = set(stopwords.words())

    # The context manager closes the file; no explicit close() is required.
    with open(filepath, encoding="utf8") as f:
        # Strip only the line terminator. Slicing off the last character would
        # truncate a final line that has no trailing newline.
        candidates = [line.rstrip("\n") for line in f]

    feature_words = [word for word in candidates if word not in stop_words]
    return feature_words

In [7]:
path = "dcp-data/letters/"
files = os.listdir(path)
print(files[0],files[-1])
# Keep only the letter XML files. This drops entries such as ".ipynb_checkpoints"
# wherever they fall in the listing, so nobody has to toggle a slice per machine.
# os.listdir returns names in arbitrary order, so sort to keep the letter order
# (and everything derived from it) the same on every run.
files = sorted(name for name in files if name.lower().endswith(".xml"))
words_darwin = []
words_not_darwin = []

# Parsed letters that were not sent by Darwin.
not_darwin_dict = []

for i, file_target in enumerate(files, start=1):
    dict_cur = extract_info(path+file_target)

    # Route the letter's words by sender; letters with no "sender" count as not Darwin.
    if dict_cur.get("sender") == "Darwin, C. R.":
        cur_words = words_darwin
    else:
        cur_words = words_not_darwin
        not_darwin_dict.append(dict_cur)

    text_tokens = word_tokenize(dict_cur["body"])

    for word in text_tokens:
        # Discard single-character tokens other than the words "i" and "a".
        if len(word) == 1 and word not in ("i", "a"):
            continue
        cur_words.append(word)

    # Progress percentage, overwritten in place until the final file.
    print(round((i/len(files))*100,2),end="\r"*(i!=len(files)))

print("")
print("Analysis Finished")
print(f"When all words are extracted, we have got a dataset of {len(words_not_darwin)} words in letters TO Darwin")
print(f"When all words are extracted, we have got a dataset of {len(words_darwin)} words in letters FROM Darwin")

.ipynb_checkpoints DCP-LETT-9999.xml
100.0
Analysis Finished
When all words are extracted, we have got a dataset of 593478 words in letters TO Darwin
When all words are extracted, we have got a dataset of 695012 words in letters FROM Darwin


In [8]:

# if "dump.txt" not in os.listdir("."):
#     unique_words = {}
#     for counter, word in enumerate(words):
#         try:
#             unique_words[word] += 1
#         except KeyError:
#             unique_words[word] = 1
#         print(round(((counter+1)/len(words))*100,2),end="\r")

#     sorted_unique_words = {key: value for key, value in sorted(unique_words.items(), key=lambda item: item[1],reverse=True)}

# #     print(list(sorted_unique_words.keys())[:1000])
#     feature_words_unclean = list(sorted_unique_words.keys())[:5000]
#     feature_words = []
#     for word in feature_words_unclean:
#         if word not in stopwords.words():
#             feature_words.append(word)
            
#     with open("dump.txt","w",encoding="utf8") as output:
#         for word in feature_words:
#             try:
#                 output.write(word +"\n")
#             except:
#                 print(word)

#     output.close()

# Load the feature vocabulary from a pre-generated word list, minus stopwords.
# NOTE(review): the commented-out generator above writes "dump.txt", while this
# reads "dump_not_darwin.txt" — confirm how that file was produced.
feature_words = file_to_features("dump_not_darwin.txt")
# print(sum(list(sorted_unique_words.values())[:4000]))
# print(sum(list(sorted_unique_words.values())[4000:]))

In [9]:
# test = generate_feature_data(data_set[0]["body"],feature_words)
# Build feature vectors and occupation labels for the letters not sent by Darwin;
# "sender_bio" is the dictionary key used to look up each letter's sender.
complete_data,incomplete_data,targets = convert_dictionary_to_dataset_for_gender(not_darwin_dict,feature_words,"sender_bio")

In [16]:
# Tally how many samples carry each target label.
unique_tags = {}
for val in targets:
    unique_tags[val] = unique_tags.get(val, 0) + 1

print(unique_tags)

# The same tallies ordered from the most to the least frequent label.
sorted_unique_words = dict(
    sorted(unique_tags.items(), key=lambda item: item[1], reverse=True)
)

# Fraction of samples that belong to the most frequent label.
print(list(sorted_unique_words.values())[0]/sum(sorted_unique_words.values()))

{'publisher': 239, 'zoologist': 413, 'traveller': 103, 'botanist': 1306, 'philosopher': 25, 'comparative anatomist': 98, 'naturalist': 358, 'politician': 115, 'mathematician': 146, 'civil servant': 91, 'clergyman': 254, 'agricultural chemist': 10, 'journalist': 16, 'travel writer': 5, 'barrister': 54, 'entomologist': 179, 'printer': 1, 'horticulturist': 15, 'anatomist': 43, 'jurist': 13, 'lawyer': 34, 'surveyor': 21, 'teacher': 9, 'explorer': 24, 'architect': 4, 'businessman': 12, 'naval officer': 73, 'orientalist': 6, 'banker': 183, 'mining engineer': 20, 'physicist': 23, 'author': 185, 'land agent': 12, 'statesman': 17, 'geologist': 214, 'librarian': 6, 'bookseller': 12, 'mineralogist': 10, 'actor': 1, 'embryologist': 13, 'biologist': 3, 'editor': 68, 'gardener': 26, 'palaeontologist': 101, 'chemist': 64, 'nurseryman': 23, 'physiologist': 71, 'schoolteacher': 25, 'divine': 1, 'educationalist': 2, 'anthropologist': 21, 'accountant': 2, 'collector': 2, 'civil engineer': 28, 'poet': 8, 

In [17]:
# Hold out 40% of the labelled samples for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    complete_data, targets, test_size=0.4, random_state=42,
)

In [18]:
# k-nearest-neighbours baseline: 5 neighbours, weighted by inverse distance.
classifier = KNeighborsClassifier(5,weights="distance")
classifier.fit(X_train, y_train)
# First printed value: accuracy on the held-out split.
score = classifier.score(X_test, y_test)
print(score)
# Second printed value: accuracy on the training split itself.
score = classifier.score(X_train, y_train)
print(score)

0.1817759181775918
0.9953488372093023


In [19]:
# NOTE(review): despite its name, `predictions` holds an accuracy score, and it is
# computed over all labelled samples, including the rows the classifier was fitted on.
predictions = classifier.score(complete_data,targets)
print(predictions)

0.6698288690476191


In [27]:
# Decision tree limited to depth 10, fitted on the same training split.
classifier = DecisionTreeClassifier(max_depth=10)
classifier.fit(X_train, y_train)
# Held-out accuracy; stored in `score` but not displayed by this cell.
score = classifier.score(X_test, y_test)


In [35]:
# Class-probability estimates and hard predictions for the held-out samples.
classes = classifier.predict_proba(X_test)
test = classifier.predict(X_test)
# Predicted label versus true label for the first held-out sample.
print(test[0])
print(y_test[0])
types = classifier.classes_
# For the first held-out sample only, list each class that received a non-zero
# probability, most probable first. The columns of predict_proba follow the order
# of classifier.classes_, so zipping the two pairs each probability with its label.
# (The inner loop previously had no body, which made the cell a syntax error.)
for row in classes[:1]:
    sorted_row = sorted(zip(row, types), key=lambda pair: pair[0], reverse=True)
    for val, label in sorted_row:
        if val > 0:
            print(label, val)

botanist
author
['academic' 'actor' 'agricultural chemist' 'agricultural writer'
 'agriculturalist' 'anatomist' 'anglican clergyman' 'anthropologist'
 'archaeologist' 'architect' 'arctic explorer' 'army officer' 'artist'
 'astronomer' 'author' 'banker' 'barrister' 'biologist' 'bookseller'
 'botanical collector' 'botanist' 'businessman' 'chemist' 'civil engineer'
 'civil servant' 'clergyman' 'collector' 'colonial administrator'
 'comparative anatomist' 'curator' 'dentist' 'diplomat' 'divine' 'editor'
 'educationalist' 'educator' 'embryologist' 'engineer' 'engraver'
 'entomologist' 'essayist' 'ethnologist' 'explorer' 'farmer' 'forester'
 'gardener' 'geologist' 'head of state' 'historian' 'horticulturalist'
 'horticulturist' 'illustrator' 'industrialist' 'inventor' 'journalist'
 'judge' 'jurist' 'land agent' 'landowner' 'lawyer' 'lexicographer'
 'librarian' 'magistrate' 'man of letters' 'manufacturer'
 'marine zoologist' 'mathematician' 'merchant' 'meteorologist'
 'military engineer' 'mil

In [21]:
# Predict a label for each sample whose recorded occupation was "no common occupation".
predictions = classifier.predict(incomplete_data)
# print(predictions)

In [22]:
# Predict labels for every labelled sample (training and held-out rows alike).
out = classifier.predict(complete_data)
# classifier.predict(test)

In [23]:
# for i,val in enumerate(out):
#     if val != targets[i]:
#         print(val,targets[i],i)
#         print(not_darwin_dict[i]["body"])
#         if val == "M" and targets[i] == "F":
#             print(not_darwin_dict[i]["sender"])
#             print(not_darwin_dict[i]["sender_bio"])
#             tag = not_darwin_dict[i]["sender_bio"][21:-4]
#             try:
#                 print(df[df["id"]=="DCP-IDENT-"+tag]["sex"].iloc[0])
#                 print(df[df["id"]=="DCP-IDENT-"+tag])
#             except:
#                 pass

In [24]:
# print(len(classifier.feature_importances_))

# Print every feature word that the fitted classifier gave a non-zero importance,
# followed by the number of such words.
test_words = []
counter = 0
for i, val in enumerate(classifier.feature_importances_):
    if val == 0:
        continue
    counter += 1
    print(feature_words[i], val)

print(counter)

letter 0.00752737001051071
think 0.005942660534613718
may 0.007757319112782476
two 0.008428306694029936
kind 0.01231528100454832
species 0.00809333768047392
sent 0.013866207914098672
return 0.008319724748459201
whether 0.014277446917911331
year 0.010403596691896678
therefore 0.015285223215650344
back 0.006338837903587963
plant 0.058905973142512905
look 0.010496127697499547
form 0.0039617736897424784
ought 0.0039617736897424784
full 0.007329281326023582
call 0.005942660534613718
try 0.01382218820643487
wd 0.021487506339042374
copies 0.06282819577715223
wh 0.025331424206437417
accept 0.0039617736897424784
month 0.0039617736897424784
honour 0.007420870717775692
cases 0.007801646650569803
thinks 0.007363296554672886
st 0.021391881190524504
delay 0.0039617736897424784
animal 0.016009013921739763
ready 0.006602956149570798
wishes 0.006678418505565892
position 0.0039617736897424784
rest 0.008715902117433456
age 0.027140056114459476
ms 0.011652275558066113
museum 0.009642258024299705
real 0.00

In [25]:
# AdaBoost with its default settings, fitted on the same training split.
classifier = AdaBoostClassifier()
classifier.fit(X_train, y_train)
# First printed value: accuracy on the held-out split.
score = classifier.score(X_test, y_test)
print(score)
# Second printed value: accuracy on the training split itself.
score = classifier.score(X_train, y_train)
print(score)

0.24500232450023246
0.25550387596899227


In [26]:
# Print the feature words that the fitted classifier gave a non-zero importance.
for i, val in enumerate(classifier.feature_importances_):
    if val == 0:
        continue
    print(feature_words[i], val)

copies 0.02
beschäftigt 0.32
honorary 0.32
rostellum 0.34


In [None]:
# Preview the recorded keywords for the recipients of the first 101 letters not
# sent by Darwin. Slicing bounds the loop up front: the earlier `if i == 100: break`
# sat after a `continue`, so it was skipped whenever item 100 had no
# "reciever_bio" and the loop then ran over the entire list.
for dictionary in not_darwin_dict[:101]:

    try:
        reciever_id = dictionary["reciever_bio"]
    except KeyError:
        # Show the letter that has no recipient bio reference, then move on.
        print(dictionary)
        continue
    # Presumably strips a fixed-length prefix and a 4-character suffix from the
    # bio reference to leave the numeric identifier — verify against the data.
    number_key = reciever_id[21:-4]

    dft = df[df["id"]=="DCP-IDENT-"+str(number_key)]

    try:
        key_words = dft["keywords"].iloc[0]
    except (IndexError, KeyError):
        # No matching person record (or no "keywords" column).
        key_words = "None"
    print(key_words)