In [1]:
# --- Standard library -------------------------------------------------------
import os
import string

# --- Third-party ------------------------------------------------------------
# lxml is never referenced by name; BeautifulSoup is asked for the "lxml"
# parser inside extract_info.
import lxml
import nltk
import numpy as np
import pandas as pd
import sklearn as sk
from bs4 import BeautifulSoup as bs
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# --- One-time setup ---------------------------------------------------------
# NLTK resources: the stop-word corpus and the "punkt" data that is
# downloaded alongside the word_tokenize import.
nltk.download('stopwords')
nltk.download('punkt')

# Seed NumPy's global random number generator.
np.random.seed(400)

# People register; later cells read its "id" and "occupation" columns.
df = pd.read_json("people.json")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jason\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jason\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def _store_party(record, action, role):
    """Store one correspondent's name and biography key in ``record``.

    ``action`` is a ``correspaction`` tag. The person's name is preferred;
    when the tag holds no ``persname`` the organisation name is used instead.
    If neither is present the ``role`` key is left unset.  ``role + "_bio"``
    always receives the person's ``key`` attribute, or ``"None Available"``
    when there is no person or no such attribute.
    """
    person = action.persname
    organisation = action.orgname

    if person is not None:
        record[role] = person.text
    elif organisation is not None:
        record[role] = organisation.text

    if person is not None:
        record[role + "_bio"] = person.attrs.get("key", "None Available")
    else:
        record[role + "_bio"] = "None Available"


def extract_info(filepath):
    """Parse one letter file and return its metadata and cleaned body text.

    Parameters
    ----------
    filepath : str
        Path of a UTF-8 encoded XML letter file.

    Returns
    -------
    dict
        Always contains ``"body"``.  Depending on what the file holds it may
        also contain ``"unique_id"``, ``"sender"``, ``"sender_bio"``,
        ``"date"`` (all taken from the "sent" action) and ``"reciever"``,
        ``"reciever_bio"`` (from the "received" action).  The misspelt
        ``"reciever"`` keys are kept because other cells read them.

    Raises
    ------
    AttributeError
        If the document has no ``tei`` root element.
    KeyError
        If the ``tei`` element has no ``xml:id`` attribute.
    """
    dictionary_of_interest = {}

    # The context manager closes the file; no explicit close() is needed.
    with open(filepath, "r", encoding="utf8") as file:
        content = file.read()

    bs_content = bs(content, "lxml")
    unique_id = bs_content.find("tei").attrs["xml:id"]

    for deets in bs_content.find_all("correspaction"):
        action_type = deets.attrs.get("type")

        if action_type == "sent":
            dictionary_of_interest["unique_id"] = unique_id
            # An organisational sender is stored under "sender"; it used to
            # be written to "reciever", where the real recipient could
            # overwrite it.
            _store_party(dictionary_of_interest, deets, "sender")

            # Not every "sent" action carries a date element.
            date_tag = deets.date
            if date_tag is not None and "when" in date_tag.attrs:
                dictionary_of_interest["date"] = date_tag.attrs["when"]

        elif action_type == "received":
            _store_party(dictionary_of_interest, deets, "reciever")

    # Only the first <p> of the first transcription div is used.  A missing
    # div (IndexError) or a div without <p> (AttributeError) gives "".
    try:
        free_text = bs_content.find_all("div", {"type": "transcription"})[0].p.text
    except (AttributeError, IndexError):
        free_text = ""

    # cleaning of the data: lower-case, drop ASCII punctuation, then replace
    # the remaining typographic symbols with "_".
    free_text = free_text.lower().translate(str.maketrans('', '', string.punctuation))
    go_away_chars = ['’', '“', '‘', '〈', '〉', '–', '♂', '…', '♀', '〈', '〉', '☿', '§', '⊙', '▵', '∴', '„', '✓']
    for char in go_away_chars:
        free_text = free_text.replace(char, "_")
    dictionary_of_interest["body"] = free_text

    return dictionary_of_interest

def generate_feature_data(free_text,feature_set):
    """Return a 0/1 indicator per feature word, in ``feature_set`` order.

    Membership is tested with ``in`` against ``free_text`` itself, so for a
    string this is a substring test ("own" is found in "brown").
    """
    return [int(candidate in free_text) for candidate in feature_set]

def convert_dictionary_to_dataset_for_gender(data_set,feature_words,sender,rare_occupation_max=10):
    """Turn letter dictionaries into feature vectors labelled by occupation.

    Despite the name, the label is the ``"occupation"`` of the person in the
    module-level ``df`` register, not a gender.

    Parameters
    ----------
    data_set : iterable of dict
        Letter dictionaries; each needs a ``"body"`` entry and should have a
        ``sender`` entry.
    feature_words : list of str
        Words passed to ``generate_feature_data`` to build each vector.
    sender : str
        Key of the person reference to use, e.g. ``"sender_bio"``.
    rare_occupation_max : int, optional
        Occupations held by at most this many people in ``df`` are treated
        as too rare to be a label.  Defaults to 10.

    Returns
    -------
    (complete_data, incomplete_data, targets)
        ``complete_data`` and ``targets`` are parallel lists of feature
        vectors and occupations.  ``incomplete_data`` holds the vectors of
        letters whose occupation is "no common occupation" or too rare.
        Letters without a ``sender`` entry (which are printed) or whose
        person is not found in ``df`` appear in none of the lists.
    """
    complete_data = []
    incomplete_data = []
    targets = []

    occupation_column = df["occupation"]
    skip_occs = [
        occ for occ in occupation_column.unique()
        if (occupation_column == occ).sum() <= rare_occupation_max
    ]

    # Build the id -> occupation lookup once instead of filtering the frame
    # for every letter; setdefault keeps the first row of a duplicated id.
    occupation_by_id = {}
    for ident, occ in zip(df["id"], occupation_column):
        occupation_by_id.setdefault(ident, occ)

    # Iterate the argument, not the notebook-level not_darwin_dict.
    for dictionary in data_set:
        free_text = dictionary["body"]
        try:
            person_id = dictionary[sender]
        except KeyError:
            print(dictionary)
            continue

        # Presumably the reference looks like "../nameregs/nameregs_<n>.xml",
        # so the slice keeps <n> -- TODO confirm against the data.
        number_key = person_id[21:-4]
        lookup_id = "DCP-IDENT-" + str(number_key)
        if lookup_id not in occupation_by_id:
            continue
        occupation = occupation_by_id[lookup_id]

        boolean_set = generate_feature_data(free_text, feature_words)
        if occupation == "no common occupation" or occupation in skip_occs:
            incomplete_data.append(boolean_set)
            continue

        complete_data.append(boolean_set)
        targets.append(occupation)

    return complete_data,incomplete_data,targets

def file_to_features(filepath):
    """Read one candidate word per line and return those that are not stop words.

    Parameters
    ----------
    filepath : str
        Path of a UTF-8 text file with one word per line.

    Returns
    -------
    list of str
        The words in file order, without NLTK stop words and without blank
        lines (an empty feature word would match every text).
    """
    # Load the stop words once; a set makes each membership test cheap.
    stop_words = set(stopwords.words())

    feature_words = []
    with open(filepath, encoding="utf8") as f:
        for line in f:
            # Strip only the newline, so a final line without one keeps its
            # last character.
            word = line.rstrip("\n")
            if word and word not in stop_words:
                feature_words.append(word)

    return feature_words

def generate_darwin_or_not_query(dict_list,feature_words):
    """Build word-presence features and a "Yes"/"No" label for each letter.

    Each dictionary in ``dict_list`` yields a ``{word: bool}`` mapping that
    records whether the word occurs in its ``"body"``.  The label is "Yes"
    when the slice ``[21:-4]`` of its ``"sender_bio"`` equals ``"1"``, and
    "No" otherwise.

    Returns the feature mappings and the labels as two parallel lists.
    """
    complete_data = []
    targets = []

    for letter in dict_list:
        body = letter["body"]

        presence = {}
        for word in feature_words:
            presence[word] = word in body

        bio_reference = letter["sender_bio"]
        label = "Yes" if bio_reference[21:-4] == "1" else "No"

        complete_data.append(presence)
        targets.append(label)

    return complete_data,targets

In [3]:
path = "dcp-data/letters/"
# os.listdir gives no ordering guarantee; sorting keeps the letter order
# (and everything derived from it) the same on every machine.
files = sorted(os.listdir(path))
print(files[0],files[-1])
# Keep only the letter files.  This replaces dropping the first entry by
# position, which discarded a real letter whenever the directory held no
# ".ipynb_checkpoints" folder.  Assumes every letter ends in ".xml".
files = [name for name in files if name.endswith(".xml")]

words_darwin = []       # tokens from letters sent by Darwin
words_not_darwin = []   # tokens from every other letter
not_darwin_dict = []    # parsed dictionaries of every other letter

for i, file_target in enumerate(files, start=1):
    dict_cur = extract_info(path+file_target)

    if dict_cur.get("sender") == "Darwin, C. R.":
        cur_words = words_darwin
    else:
        cur_words = words_not_darwin
        not_darwin_dict.append(dict_cur)

    text_tokens = word_tokenize(dict_cur["body"])

    # Single-character tokens are dropped, except the words "i" and "a".
    for word in text_tokens:
        if len(word) != 1 or word in ["i","a"]:
            cur_words.append(word)

    # The percentage is overwritten in place until the last file.
    print(round((i/len(files))*100,2),end="\r"*(i!=len(files)))

print("")
print("Analysis Finished")
print(f"When all words are extracted, we have got a dataset of {len(words_not_darwin)} words in letters TO Darwin")
print(f"When all words are extracted, we have got a dataset of {len(words_darwin)} words in letters FROM Darwin")

.ipynb_checkpoints DCP-LETT-9999.xml
100.0
Analysis Finished
When all words are extracted, we have got a dataset of 593478 words in letters TO Darwin
When all words are extracted, we have got a dataset of 695012 words in letters FROM Darwin


In [5]:
# Count how often each token occurs in the non-Darwin letters.
unique_words = {}
total_words = len(words_not_darwin)
for counter, word in enumerate(words_not_darwin, start=1):
    unique_words[word] = unique_words.get(word, 0) + 1
    # Report progress occasionally rather than once per token; the final
    # value shown is still 100.0.
    if counter % 10000 == 0 or counter == total_words:
        print(round((counter/total_words)*100,2),end="\r")

# Most frequent first; the sort is stable, so ties keep first-seen order.
sorted_unique_words = dict(
    sorted(unique_words.items(), key=lambda item: item[1], reverse=True)
)

# Take the 1000 most frequent tokens and drop stop words.  The stop-word
# list is loaded once into a set rather than once per candidate.
feature_words_unclean = list(sorted_unique_words.keys())[:1000]
all_stopwords = set(stopwords.words())
feature_words = [word for word in feature_words_unclean if word not in all_stopwords]

# Alternatives kept for reference:
# feature_words = file_to_features("dump_not_darwin.txt")
# complete_data,targets = generate_darwin_or_not_query(not_darwin_dict,feature_words)
complete_data,incomplete_data,targets = convert_dictionary_to_dataset_for_gender(not_darwin_dict,feature_words,"sender_bio")

100.0

In [6]:
# Tally how many labelled letters fall under each target value.
unique_tags = {}
for val in targets:
    unique_tags[val] = unique_tags.get(val, 0) + 1

print(unique_tags)

# Largest class first.  Note that this rebinds sorted_unique_words, which
# held the word counts until now.
sorted_unique_words = dict(
    sorted(unique_tags.items(), key=lambda pair: pair[1], reverse=True)
)

# Share of the most frequent label among all labelled letters.
tag_counts = list(sorted_unique_words.values())
print(tag_counts[0]/sum(tag_counts))

{'publisher': 239, 'zoologist': 413, 'traveller': 103, 'botanist': 1306, 'philosopher': 25, 'naturalist': 358, 'politician': 115, 'mathematician': 146, 'civil servant': 91, 'clergyman': 254, 'journalist': 16, 'barrister': 54, 'entomologist': 179, 'printer': 1, 'anatomist': 43, 'jurist': 13, 'lawyer': 34, 'surveyor': 21, 'teacher': 9, 'explorer': 24, 'architect': 4, 'businessman': 12, 'naval officer': 73, 'orientalist': 6, 'banker': 183, 'mining engineer': 20, 'physicist': 23, 'author': 185, 'statesman': 17, 'geologist': 214, 'librarian': 6, 'bookseller': 12, 'mineralogist': 10, 'embryologist': 13, 'editor': 68, 'gardener': 26, 'palaeontologist': 101, 'chemist': 64, 'nurseryman': 23, 'physiologist': 71, 'schoolteacher': 25, 'anthropologist': 21, 'civil engineer': 28, 'poet': 8, 'army officer': 23, 'diplomat': 30, 'engineer': 11, 'writer': 110, 'pharmacist': 60, 'inventor': 2, 'solicitor': 25, 'administrator': 1, 'missionary': 9, 'merchant': 8, 'social reformer': 6, 'judge': 20, 'novelis

In [7]:
# Hold out 20% of the labelled letters for evaluation; the fixed
# random_state makes the split repeatable for the same input order.
X_train, X_test, y_train, y_test = train_test_split(
    complete_data,
    targets,
    random_state=42,
    test_size=0.2,
)

In [8]:
DTclassifier = SklearnClassifier(DecisionTreeClassifier(max_depth=10))

# Pair each training vector with its label, turning the 0/1 vector back into
# a {word: bool} mapping keyed by the matching entry of feature_words.
training_set = [
    ({word: flag == 1 for word, flag in zip(feature_words, row)}, label)
    for row, label in zip(X_train, y_train)
]
DTclassifier.train(training_set)

<SklearnClassifier(DecisionTreeClassifier(max_depth=10))>

In [9]:
# Same (feature mapping, label) layout as the training set, built from the
# held-out split.
testing_set = [
    ({word: flag == 1 for word, flag in zip(feature_words, row)}, label)
    for row, label in zip(X_test, y_test)
]

In [10]:
# Accuracy of the decision tree on the held-out testing_set.
dt_accuracy = nltk.classify.accuracy(DTclassifier, testing_set)
print(dt_accuracy)

0.26409495548961426


In [12]:
# Train the Naive Bayes model here so that `classifier` is defined when the
# notebook is run top to bottom; with the training line commented out this
# cell raised NameError on a fresh kernel.
classifier = nltk.NaiveBayesClassifier.train(training_set)
# For a comparison with the decision tree, evaluate
# nltk.classify.accuracy(classifier, testing_set).

# show_most_informative_features prints its table and returns None, so it is
# not wrapped in print() (that added a stray "None" line).
classifier.show_most_informative_features(1500)

Most Informative Features
               contained = True           whig p : botani =    518.5 : 1.0
                    lord = True           whig p : botani =    222.2 : 1.0
                   human = True           man of : botani =    207.4 : 1.0
                     shd = True            judge : botani =    207.4 : 1.0
                  larger = True           printe : botani =    172.8 : 1.0
                 printed = True           printe : botani =    172.8 : 1.0
                 amongst = True           painte : zoolog =    164.5 : 1.0
                   hopes = True           painte : zoolog =    164.5 : 1.0
                intended = True           printe : zoolog =    164.5 : 1.0
                    feet = True           head o : botani =    155.6 : 1.0
                although = True           physic : botani =    148.1 : 1.0
                    gone = True           painte : natura =    137.5 : 1.0
                    says = True           printe : natura =    137.5 : 1.0

In [None]:
import plotly.graph_objects as go

In [None]:
# Labels reported by the trained decision-tree wrapper.
dt_labels = DTclassifier.labels()
print(dt_labels)