# Lexical Richness Index (LRI)

In [2]:
# Import required modules

# Lexical Richness module
# Documentation: https://pypi.org/project/lexicalrichness/
from lexicalrichness import LexicalRichness

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

from collections import Counter

import random

## John Locke

In [3]:
# Read the complete Locke corpus into memory as one string.
with open("FullText/LockeComplete.txt", encoding="utf-8") as file:
    Locke = file.read()

# Character count of the corpus (shown as the cell's output).
len(Locke)

2040978

In [4]:
# Whole-corpus lexical richness for Locke: build the analyser once, then
# print every metric in turn.
locke = LexicalRichness(Locke)

print("Lexical Richness of John Locke")

# (output template, metric getter) pairs. Getters are called lazily inside the
# loop so each metric is computed immediately before its line is printed.
locke_report = (
    ("Unique Word Count: %s", lambda: locke.terms),
    ("Type Token Ratio: %s", lambda: locke.ttr),
    ("Root Type Token Ratio: %s", lambda: locke.rttr),
    ("Corrected Type Token Ratio: %s", lambda: locke.cttr),
    ("Mean Segmental Type Token Ratio: %s", lambda: locke.msttr(segment_window=25)),
    ("Moving Average Type Token Ratio: %s", lambda: locke.mattr(window_size=25)),
    ("Measure of Textual Lexical Diversity: %s", lambda: locke.mtld(threshold=0.72)),
    ("Hypergeometric Distribution Diversity: %s", lambda: locke.hdd(draws=42)),
)
for template, metric in locke_report:
    print(template % metric())

Lexical Richness of John Locke
Unique Word Count: 8800
Type Token Ratio: 0.022749305241388226
Root Type Token Ratio: 14.148988872856478
Corrected Type Token Ratio: 10.00484597892982
Mean Segmental Type Token Ratio: 0.8591416752843168
Moving Average Type Token Ratio: 0.8593303533353228
Measure of Textual Lexical Diversity: 56.95987689180785
Hypergeometric Distribution Diversity: 0.8255656836997587


### LRI Mean Average Function

In [5]:
# For accurate comparison, Jockers recommends comparing random 10,000 word chunks of each corpus
# Use without stopwords as all vocabulary matters here
# Build a function to select 10,000 random words and find mean average of multiple LRIs
def LRI(times, text, sample_size=10000, contiguous=False, seed=None):
    """Print lexical-richness metrics averaged over repeated random samples.

    The text is tokenized once with ``nltk.word_tokenize``. For each of the
    ``times`` iterations a sample of ``sample_size`` tokens is drawn, joined
    back into a string with single spaces, analysed with ``LexicalRichness``,
    and the eight metrics are recorded. The mean of each metric, rounded to
    two decimal places, is then printed.

    Parameters
    ----------
    times : int
        Number of samples to draw and average over. Must be at least 1.
    text : str
        The corpus to sample from.
    sample_size : int, optional
        Number of tokens per sample (default 10000, the original behaviour).
    contiguous : bool, optional
        If False (default, the original behaviour) tokens are drawn with
        ``random.sample``, which returns them in selection order, so the
        original word order is not preserved. If True, a contiguous run of
        ``sample_size`` tokens starting at a random offset is used instead.
        NOTE(review): MSTTR, MATTR and MTLD are computed with window/threshold
        arguments and look order-sensitive -- confirm whether contiguous
        chunks are what the comparison actually calls for.
    seed : int or None, optional
        If given, a private ``random.Random(seed)`` is used so the run is
        reproducible; otherwise the module-level ``random`` state is used.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If ``times`` or ``sample_size`` is smaller than 1, or if the text has
        fewer than ``sample_size`` tokens.
    """
    # Validate before doing any work: times == 0 would otherwise end in a
    # ZeroDivisionError when the averages are computed.
    if times < 1:
        raise ValueError("times must be at least 1, got %r" % (times,))
    if sample_size < 1:
        raise ValueError("sample_size must be at least 1, got %r" % (sample_size,))

    # Tokenize once; every iteration samples from the same token list.
    tokens = nltk.word_tokenize(text)
    if len(tokens) < sample_size:
        raise ValueError(
            "text has only %d tokens; cannot draw a sample of %d"
            % (len(tokens), sample_size)
        )

    # A private generator keeps seeded runs from disturbing global random state.
    rng = random.Random(seed) if seed is not None else random

    # (printed label, metric getter) pairs, in output order.
    metrics = (
        ("Unique Word Count", lambda lex: lex.terms),
        ("Type Token Ratio", lambda lex: lex.ttr),
        ("Root Type Token Ratio", lambda lex: lex.rttr),
        ("Corrected Type Token Ratio", lambda lex: lex.cttr),
        ("Mean Segmental Type Token Ratio", lambda lex: lex.msttr(segment_window=25)),
        ("Moving Average Type Token Ratio", lambda lex: lex.mattr(window_size=25)),
        ("Measure of Textual Lexical Diversity", lambda lex: lex.mtld(threshold=0.72)),
        ("Hypergeometric Distribution Diversity", lambda lex: lex.hdd(draws=42)),
    )
    results = {label: [] for label, _ in metrics}

    for _ in range(times):
        if contiguous:
            # Random starting offset such that the whole chunk fits in the text.
            start = rng.randrange(len(tokens) - sample_size + 1)
            sample = tokens[start:start + sample_size]
        else:
            sample = rng.sample(tokens, sample_size)

        # LexicalRichness takes a string, so join the tokens back together.
        sample_lri = LexicalRichness(' '.join(sample))

        for label, metric in metrics:
            results[label].append(metric(sample_lri))

    # Average results, round to 2 decimal places
    for label, _ in metrics:
        values = results[label]
        print("%s: %s" % (label, round(sum(values) / len(values), 2)))
    return

# Average each richness metric over 10 random samples of the Locke corpus
LRI(10, Locke)

Unique Word Count: 1615.1
Type Token Ratio: 0.16
Root Type Token Ratio: 16.15
Corrected Type Token Ratio: 11.42
Mean Segmental Type Token Ratio: 0.88
Moving Average Type Token Ratio: 0.88
Measure of Textual Lexical Diversity: 85.28
Hypergeometric Distribution Diversity: 0.83


### Word Frequency and Count

In [6]:
# Load NLTK's English stopword list as a set; later cells test each word
# against it when writing stopword-free copies of the text files.

stop_words = set(stopwords.words('english')) 

# Show the stopwords that will be filtered out
print(stop_words)

{'because', 'more', 'our', 'but', "didn't", 'which', 'does', "mustn't", 'will', 'needn', 'very', 'are', 'on', 'd', 'its', 'hers', 'ourselves', 'in', 'have', 'after', 'it', 'his', 'an', 'a', 'through', 'than', 're', 'won', 'doing', 'haven', 'himself', "wouldn't", 'whom', 'couldn', "won't", "should've", "wasn't", 'having', "hasn't", 'out', 'can', "weren't", 'he', 'doesn', 't', 'ma', 'yourself', "aren't", "mightn't", 'against', 'once', 'why', 'until', 'y', 'and', 'has', 'had', "that'll", 'some', 'been', "it's", 'should', 'didn', 'him', 'shouldn', 'mightn', 'as', 've', 'themselves', "couldn't", 'this', 'for', 'while', 'wasn', "don't", 'here', 'is', 'how', 'these', 'who', 'do', 'her', 'off', 'to', "haven't", 'or', 'be', 's', 'other', 'was', 'their', "shouldn't", 'into', 'o', 'not', 'just', 'what', 'most', 'too', 'during', 'all', 'i', 'them', 'about', 'hadn', 'shan', 'wouldn', 'your', 'up', 'we', 'that', 'above', 'those', "doesn't", 'where', 'both', 'then', 'again', 'down', 'herself', "you'r

In [7]:
#Build stopword files

# (source text, stopword-filtered output) pairs for the Locke corpus.
locke_stopword_jobs = [
    ("Clean/Locke_HumanUnderstandingCLEAN.txt", "Stopword/Locke_HumanUnderstandingSTOPWORDS.txt"),
    ("Clean/Locke_TwoTreatisesCLEAN.txt", "Stopword/Locke_TwoTreatisesSTOPWORDS.txt"),
    ("FullText/LockeComplete.txt", "Stopword/LockeCompleteSTOPWORDS.txt"),
]

for source_path, target_path in locke_stopword_jobs:
    # `with` closes the input handle; the original never closed it.
    # UTF-8 matches how these outputs (and LockeComplete.txt) are read elsewhere
    # in this notebook.
    with open(source_path, encoding="utf-8") as file:
        words = file.read().split()

    # Write mode ("w") instead of append: re-running the cell now regenerates
    # the file rather than duplicating its contents and inflating word counts.
    # The output is written once, not reopened for every word. Each kept word
    # is still preceded by a single space, as before.
    with open(target_path, "w", encoding="utf-8") as appendFile:
        appendFile.write("".join(" " + r for r in words if r not in stop_words))

In [8]:
# Load the stopword-filtered Locke texts back into memory, one string each:
# An Essay Concerning Human Understanding
with open("Stopword/Locke_HumanUnderstandingSTOPWORDS.txt", encoding="utf-8") as file:
    LockeHum = file.read()

# Two Treatises
with open("Stopword/Locke_TwoTreatisesSTOPWORDS.txt", encoding="utf-8") as file:
    LockeTwo = file.read()

# Complete corpus
with open("Stopword/LockeCompleteSTOPWORDS.txt", encoding="utf-8") as file:
    LockeCom = file.read()

In [9]:
# Tokenize, Word Frequency and Count function
def WordFreq (text):
    """Print the ten most frequent tokens of ``text`` with their counts.

    The text is tokenized with ``nltk.word_tokenize`` and counted with
    ``nltk.FreqDist``; the result is printed as a list of ``(token, count)``
    tuples in descending order of count. Nothing is returned.
    """
    tokens = nltk.word_tokenize(text)
    frequencies = nltk.FreqDist(tokens)
    top_ten = Counter(frequencies).most_common(10)
    print(top_ten)
    return

# Top-10 word frequencies (stopwords removed) for, in order:
# Human Understanding, Two Treatises, and the complete Locke corpus
WordFreq(LockeHum)
WordFreq(LockeTwo)
WordFreq(LockeCom)

[('ideas', 2633), ('one', 1655), ('make', 1596), ('idea', 1426), ('mind', 1350), ('think', 1172), ('name', 1014), ('man', 966), ('may', 963), ('knowledge', 869)]
[('power', 793), ('right', 587), ('one', 534), ('make', 494), ('man', 367), ('give', 364), ('father', 345), ('government', 339), ('men', 330), ('may', 324)]
[('ideas', 2634), ('one', 2189), ('make', 2090), ('idea', 1426), ('mind', 1364), ('think', 1336), ('man', 1333), ('power', 1326), ('may', 1287), ('men', 1147)]


## George Berkeley

In [10]:
# Read the complete Berkeley corpus into memory as one string.
with open("FullText/BerkeleyComplete.txt", encoding="utf-8") as file:
    Berkeley = file.read()

# Character count of the corpus (shown as the cell's output).
len(Berkeley)

986707

In [11]:
# Whole-corpus lexical richness for Berkeley: build the analyser once, then
# print every metric in turn.
berkeley = LexicalRichness(Berkeley)

print("Lexical Richness of George Berkeley")

# (output template, metric getter) pairs. Getters are called lazily inside the
# loop so each metric is computed immediately before its line is printed.
berkeley_report = (
    ("Unique Word Count: %s", lambda: berkeley.terms),
    ("Type Token Ratio: %s", lambda: berkeley.ttr),
    ("Root Type Token Ratio: %s", lambda: berkeley.rttr),
    ("Corrected Type Token Ratio: %s", lambda: berkeley.cttr),
    ("Mean Segmental Type Token Ratio: %s", lambda: berkeley.msttr(segment_window=25)),
    ("Moving Average Type Token Ratio: %s", lambda: berkeley.mattr(window_size=25)),
    ("Measure of Textual Lexical Diversity: %s", lambda: berkeley.mtld(threshold=0.72)),
    ("Hypergeometric Distribution Diversity: %s", lambda: berkeley.hdd(draws=42)),
)
for template, metric in berkeley_report:
    print(template % metric())

Lexical Richness of George Berkeley
Unique Word Count: 6972
Type Token Ratio: 0.03804368585038988
Root Type Token Ratio: 16.286208206605927
Corrected Type Token Ratio: 11.516088262707052
Mean Segmental Type Token Ratio: 0.8706630286494635
Moving Average Type Token Ratio: 0.8704321678248385
Measure of Textual Lexical Diversity: 67.08905926109193
Hypergeometric Distribution Diversity: 0.8395401378169307


In [12]:
# Average each richness metric over 10 random samples of the Berkeley corpus
LRI(10, Berkeley)

Unique Word Count: 1800.0
Type Token Ratio: 0.18
Root Type Token Ratio: 18.0
Corrected Type Token Ratio: 12.73
Mean Segmental Type Token Ratio: 0.89
Moving Average Type Token Ratio: 0.89
Measure of Textual Lexical Diversity: 102.64
Hypergeometric Distribution Diversity: 0.84


### Word Frequency and Count

In [13]:
# (source text, stopword-filtered output) pairs for the Berkeley corpus.
berkeley_stopword_jobs = [
    ("Clean/Berkeley_AlciphronCLEAN.txt", "Stopword/Berkeley_AlciphronSTOPWORDS.txt"),
    ("Clean/Berkeley_HumanKnowledgeCLEAN.txt", "Stopword/Berkeley_HumanKnowledgeSTOPWORDS.txt"),
    ("Clean/Berkeley_TheoryOfVisionCLEAN.txt", "Stopword/Berkeley_TheoryOfVisionSTOPWORDS.txt"),
    ("Clean/Berkeley_ThreeDialoguesCLEAN.txt", "Stopword/Berkeley_ThreeDialoguesSTOPWORDS.txt"),
    ("FullText/BerkeleyComplete.txt", "Stopword/BerkeleyCompleteSTOPWORDS.txt"),
]

for source_path, target_path in berkeley_stopword_jobs:
    # `with` closes the input handle; the original never closed it.
    # No explicit encoding: the platform default is kept so the output matches
    # the cell that reads these files back, which also uses the default.
    with open(source_path) as file:
        words = file.read().split()

    # Write mode ("w") instead of append: re-running the cell now regenerates
    # the file rather than duplicating its contents and inflating word counts.
    # The output is written once, not reopened for every word. Each kept word
    # is still preceded by a single space, as before.
    with open(target_path, "w") as appendFile:
        appendFile.write("".join(" " + r for r in words if r not in stop_words))

In [14]:
# Load the stopword-filtered Berkeley texts back into memory, one string each
# (platform default encoding, same as the original cell):
# Alciphron
with open("Stopword/Berkeley_AlciphronSTOPWORDS.txt") as file:
    BerkeleyAlc = file.read()

# Human Knowledge
with open("Stopword/Berkeley_HumanKnowledgeSTOPWORDS.txt") as file:
    BerkeleyHum = file.read()

# Theory of Vision
with open("Stopword/Berkeley_TheoryOfVisionSTOPWORDS.txt") as file:
    BerkeleyThe = file.read()

# Three Dialogues
with open("Stopword/Berkeley_ThreeDialoguesSTOPWORDS.txt") as file:
    BerkeleyThr = file.read()

# Complete corpus
with open("Stopword/BerkeleyCompleteSTOPWORDS.txt") as file:
    BerkeleyCom = file.read()

In [15]:
# Top-10 word frequencies (stopwords removed) for, in order: Alciphron,
# Human Knowledge, Theory of Vision, Three Dialogues, and the complete Berkeley corpus
WordFreq(BerkeleyAlc)
WordFreq(BerkeleyHum)
WordFreq(BerkeleyThe)
WordFreq(BerkeleyThr)
WordFreq(BerkeleyCom)

[('think', 412), ('men', 404), ('say', 326), ('man', 320), ('things', 268), ('make', 266), ('one', 255), ('god', 248), ('see', 211), ('religion', 210)]
[('ideas', 262), ('mind', 225), ('may', 187), ('perceive', 154), ('things', 153), ('idea', 148), ('say', 142), ('exist', 139), ('sense', 137), ('think', 128)]
[('object', 292), ('distance', 210), ('visible', 174), ('sight', 159), ('eye', 148), ('perceive', 134), ('ideas', 130), ('tangible', 130), ('one', 122), ('see', 115)]
[('perceive', 263), ('things', 262), ('mind', 246), ('exist', 209), ('sense', 199), ('think', 192), ('ideas', 179), ('dont', 168), ('say', 161), ('know', 156)]
[('think', 814), ('mind', 769), ('things', 760), ('say', 697), ('ideas', 669), ('perceive', 599), ('sense', 581), ('one', 578), ('make', 562), ('object', 544)]


## David Hume

In [16]:
# Read the complete Hume corpus into memory as one string.
with open("FullText/HumeComplete.txt", encoding="utf-8") as file:
    Hume = file.read()

# Character count of the corpus (shown as the cell's output).
len(Hume)

2094204

In [17]:
# Whole-corpus lexical richness for Hume: build the analyser once, then
# print every metric in turn.
hume = LexicalRichness(Hume)

print("Lexical Richness of David Hume")

# (output template, metric getter) pairs. Getters are called lazily inside the
# loop so each metric is computed immediately before its line is printed.
hume_report = (
    ("Unique Word Count: %s", lambda: hume.terms),
    ("Type Token Ratio: %s", lambda: hume.ttr),
    ("Root Type Token Ratio: %s", lambda: hume.rttr),
    ("Corrected Type Token Ratio: %s", lambda: hume.cttr),
    ("Mean Segmental Type Token Ratio: %s", lambda: hume.msttr(segment_window=25)),
    ("Moving Average Type Token Ratio: %s", lambda: hume.mattr(window_size=25)),
    ("Measure of Textual Lexical Diversity: %s", lambda: hume.mtld(threshold=0.72)),
    ("Hypergeometric Distribution Diversity: %s", lambda: hume.hdd(draws=42)),
)
for template, metric in hume_report:
    print(template % metric())

Lexical Richness of David Hume
Unique Word Count: 14970
Type Token Ratio: 0.040178535482987866
Root Type Token Ratio: 24.524939881278573
Corrected Type Token Ratio: 17.341751298244482
Mean Segmental Type Token Ratio: 0.8736012883311353
Moving Average Type Token Ratio: 0.8737627730088239
Measure of Textual Lexical Diversity: 74.82424024214168
Hypergeometric Distribution Diversity: 0.8383810802901128


In [18]:
# Average each richness metric over 10 random samples of the Hume corpus
LRI(10, Hume)

Unique Word Count: 2366.8
Type Token Ratio: 0.24
Root Type Token Ratio: 23.67
Corrected Type Token Ratio: 16.74
Mean Segmental Type Token Ratio: 0.89
Moving Average Type Token Ratio: 0.89
Measure of Textual Lexical Diversity: 109.77
Hypergeometric Distribution Diversity: 0.84


### Word Frequency and Count

In [19]:
# (source text, stopword-filtered output, encoding) triples for the Hume corpus.
# Encodings are kept exactly as before -- None means the platform default -- so
# each output still matches the way the reading cell opens it.
hume_stopword_jobs = [
    ("Clean/Hume_EssaysMoralPoliticalLiteraryCLEAN.txt", "Stopword/Hume_EssaysMoralPoliticalLiterarySTOPWORDS.txt", "utf-8"),
    ("Clean/Hume_HumanUnderstandingCLEAN.txt", "Stopword/Hume_HumanUnderstandingSTOPWORDS.txt", None),
    ("Clean/Hume_NaturalReligionCLEAN.txt", "Stopword/Hume_NaturalReligionSTOPWORDS.txt", None),
    ("Clean/Hume_SourcesofMoralsCLEAN.txt", "Stopword/Hume_SourcesofMoralsSTOPWORDS.txt", None),
    ("FullText/HumeComplete.txt", "Stopword/HumeCompleteSTOPWORDS.txt", "utf-8"),
]

for source_path, target_path, text_encoding in hume_stopword_jobs:
    # `with` closes the input handle; the original never closed it.
    with open(source_path, encoding=text_encoding) as file:
        words = file.read().split()

    # Write mode ("w") instead of append: re-running the cell now regenerates
    # the file rather than duplicating its contents and inflating word counts.
    # The output is written once, not reopened for every word. Each kept word
    # is still preceded by a single space, as before.
    with open(target_path, "w", encoding=text_encoding) as appendFile:
        appendFile.write("".join(" " + r for r in words if r not in stop_words))

In [20]:
# Load the stopword-filtered Hume texts back into memory, one string each
# (encodings unchanged from the original cell):
# Essays, Moral, Political, and Literary
with open("Stopword/Hume_EssaysMoralPoliticalLiterarySTOPWORDS.txt", encoding="utf-8") as file:
    HumeEss = file.read()

# Human Understanding
with open("Stopword/Hume_HumanUnderstandingSTOPWORDS.txt") as file:
    HumeHum = file.read()

# Natural Religion
with open("Stopword/Hume_NaturalReligionSTOPWORDS.txt") as file:
    HumeNat = file.read()

# Sources of Morals
with open("Stopword/Hume_SourcesofMoralsSTOPWORDS.txt") as file:
    HumeSou = file.read()

# Complete corpus
with open("Stopword/HumeCompleteSTOPWORDS.txt", encoding="utf-8") as file:
    HumeCom = file.read()

In [21]:
# Top-10 word frequencies (stopwords removed) for, in order: Essays,
# Human Understanding, Natural Religion, Sources of Morals, and the complete Hume corpus
WordFreq(HumeEss)
WordFreq(HumeHum)
WordFreq(HumeNat)
WordFreq(HumeSou)
WordFreq(HumeCom)

[('may', 823), ('one', 649), ('every', 576), ('great', 563), ('must', 528), ('would', 509), ('make', 465), ('men', 452), ('much', 441), ('government', 437)]
[('may', 274), ('reason', 252), ('cause', 202), ('object', 197), ('one', 190), ('nature', 184), ('effect', 179), ('experience', 162), ('us', 161), ('must', 156)]
[('cause', 167), ('reason', 160), ('human', 138), ('one', 128), ('say', 117), ('think', 116), ('world', 115), ('nature', 114), ('would', 106), ('god', 103)]
[('us', 165), ('would', 152), ('one', 151), ('man', 143), ('make', 132), ('society', 131), ('reason', 127), ('think', 121), ('human', 120), ('justice', 116)]
[('may', 1241), ('one', 1118), ('reason', 893), ('every', 875), ('would', 860), ('must', 838), ('make', 767), ('us', 744), ('great', 705), ('nature', 691)]
