# Exploratory Text Analytics Final Project:
### Christian Literature from Protestant Reformation to the Present

* John Hazelton (Jch5nb@virginia.edu) 
* DS 5001
* December 17, 2021

## Data Organization into Digital Analytical Edition of Corpus

## Imports

In [None]:
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.decomposition import PCA
from scipy.linalg import norm
from scipy.linalg import eigh
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.manifold import TSNE
from gensim.models import word2vec

import plotly.express as px
import plotly.io as pio
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style='ticks')
pio.renderers.default = 'notebook_connected'

### Download NLTK resources

In [None]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('tagsets')

## Setup

In [60]:
OHCO = ['book_id', 'book_num', 'chap_num', 'para_num', 'sent_num', 'token_num']
#OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num'] #excluding book level
corpus_dir = 'corpus'

In [61]:
%matplotlib inline

## Inspect
The texts in our corpus vary widely in their formatting, so we manually define the chunking patterns for each book below.

In [67]:
# Hand-tuned chunking rules for each text in the corpus, keyed by the numeric
# book_id that prefixes the corpus filename. For every book:
#   start_line / end_line : line numbers bounding the body text (front and
#                           back matter outside this range is discarded)
#   book                  : compiled regex matching lines that open a "book"
#   chapter               : compiled regex matching lines that open a chapter
# All patterns are raw strings so that regex escapes such as \. and \s are
# not interpreted (and warned about) as Python string escapes.
roman = '[IVXLCM]+'
caps = "[A-Z';, -]+"
chap_pats = {
    1: {
        'start_line': 1269,
        'end_line': 19759,
        'book': re.compile(r"^CALVIN'S LETTERS\."),
        'chapter': re.compile(rf'^{roman}\.\s+\-\s+\-\s+.*$'),
    },
    2: {  # (weird: every paragraph starts with a number)
        'start_line': 1903,
        'end_line': 61563,
        'book': re.compile(r'^BOOK\s+[\S]+\.'),
        'chapter': re.compile(r'^\s*CHAPTER\s+\d*\.'),
    },
    3: {  # 'FIRST STAGE' is used to split by book, since both parts begin with a 1st stage
        'start_line': 638,
        'end_line': 12175,
        # Alternative book pattern that was tried: r'^\s+PART\s+' + roman
        'book': re.compile(r'^THE FIRST STAGE'),
        'chapter': re.compile(r'^THE\s+[\S]+\s+STAGE'),
    },
    4: {  # (no chapters: the title line serves as both book and chapter marker)
        # Use start_line 345 instead to exclude the title.
        'start_line': 342,
        'end_line': 9169,
        'book': re.compile(r'^A RELATION OF THE HOLY WAR\.'),
        'chapter': re.compile(r'^A RELATION OF THE HOLY WAR\.'),
    },
    5: {  # (no chapters: the title line serves as both book and chapter marker)
        'start_line': 259,
        'end_line': 14282,
        'book': re.compile(r'^RELIGIOUS AFFECTIONS\.'),
        'chapter': re.compile(r'^RELIGIOUS AFFECTIONS\.'),
    },
    6: {
        'start_line': 139,
        'end_line': 17013,
        'book': re.compile(r'^SERMON 1: .*$'),
        'chapter': re.compile(r'^SERMON [0-9]+:'),
    },
    7: {
        'start_line': 126,
        'end_line': 6682,
        'book': re.compile(r'^CHAPTER I\.'),
        'chapter': re.compile(rf'^\s*CHAPTER {roman}\.'),
    },
    8: {
        'start_line': 348,
        'end_line': 4483,
        'book': re.compile(r'CHAPTER I\.'),
        'chapter': re.compile(rf'^CHAPTER {roman}\.'),
    },
    9: {
        'start_line': 144,
        'end_line': 584,
        # The dot is escaped so that only the literal heading "I. INTRODUCTORY" matches.
        'book': re.compile(r'^I\. INTRODUCTORY'),
        'chapter': re.compile(rf'^{roman}\.\s+.+$'),
    },
    10: {
        'start_line': 166,
        'end_line': 782,
        'book': re.compile(r'^CHAPTER I$'),
        'chapter': re.compile(rf'^CHAPTER\s+{roman}'),
    },
    11: {
        'start_line': 291,
        'end_line': 2312,
        'book': re.compile(r'^1$'),
        'chapter': re.compile(r'^[0-9]+$'),
    },
    12: {
        'start_line': 383,
        'end_line': 2117,
        'book': re.compile(r'^1: TO ABSORB THE WRATH OF GOD'),
        'chapter': re.compile(r'^[0-9]+:\s+.+$'),
    },
}

# Register and Chunk

In [68]:
def acquire_corpus(corpus_list, chap_pats, OHCO=OHCO):
    """Read the raw corpus files and chunk them into paragraphs.

    File names are expected to follow the pattern
    ``<book_id>_<author>_<year>_<Title-With-Hyphens>.txt``; the metadata for
    the library table is parsed from the file name.

    Parameters
    ----------
    corpus_list : list of str
        Paths of the text files to import.
    chap_pats : dict
        Per-book chunking rules keyed by book_id; each entry provides
        'start_line', 'end_line', 'book' and 'chapter'.
    OHCO : list of str
        Index level names; OHCO[:4] becomes the index of the returned
        paragraph table.

    Returns
    -------
    (library, docs) : tuple of DataFrame
        ``library`` is indexed by book_id with columns book, author, year and
        book_file. ``docs`` is indexed by OHCO[:4] with a single ``para_str``
        column.
    """
    my_lib = []
    my_doc = []

    for text_file in corpus_list:

        # Parse metadata from the file NAME only. Splitting on both separators
        # keeps this working for Windows ('corpus\\1_...') and POSIX
        # ('corpus/1_...') paths, and for directories containing underscores.
        file_name = re.split(r'[\\/]', text_file)[-1]
        name_parts = file_name.split('_')
        book_id = int(name_parts[0])
        print("BOOK ID", book_id)
        author = name_parts[1]
        year = name_parts[2]  # kept as a string, as parsed
        book = name_parts[3].replace('-', ' ').replace('.txt', '')

        # Import file as lines ('utf-8-sig' drops a leading BOM if present)
        with open(text_file, 'r', encoding='utf-8-sig') as text_handle:
            lines = text_handle.readlines()
        df = pd.DataFrame(lines, columns=['line_str'])
        df.index.name = 'line_num'
        df.line_str = df.line_str.str.strip()
        df['book_id'] = book_id

        # Pad dashes and hyphens with spaces to improve tokenization.
        # These are literal replacements, so regex is switched off explicitly.
        df.line_str = df.line_str.str.replace('—', ' — ', regex=False)
        df.line_str = df.line_str.str.replace('-', ' - ', regex=False)

        # Remove cruft: keep 0-based rows start_line-1 through end_line
        # inclusive. Copy so the column assignments below do not write into a
        # slice of the full frame.
        rules = chap_pats[book_id]
        a = rules['start_line'] - 1
        b = rules['end_line'] + 1
        df = df.iloc[a:b].copy()

        # Chunk by book: number the heading lines 1..n, then carry each number
        # forward over the lines that follow it.
        book_lines = df.line_str.str.match(rules['book'])
        df.loc[book_lines, 'book_num'] = list(range(1, int(book_lines.sum()) + 1))
        df.book_num = df.book_num.ffill()

        # Chunk by chapter, same approach
        chap_lines = df.line_str.str.match(rules['chapter'])
        df.loc[chap_lines, 'chap_num'] = list(range(1, int(chap_lines.sum()) + 1))
        df.chap_num = df.chap_num.ffill()

        # Clean up: drop everything before the first chapter heading (those
        # rows have no chap_num) and drop the chapter heading lines themselves.
        df = df[df.chap_num.notna() & ~chap_lines]
        # Remove 'book_num' from this cast if only doing chapter-level chunking.
        df = df.astype({'chap_num': 'int', 'book_num': 'int'})

        # Important for book vs. chapter level breakdown: join each chapter
        # into one big string. Edit the OHCO slice to include/exclude the
        # book level.
        df = df.groupby(OHCO[1:3]).line_str.apply(lambda x: '\n'.join(x)).to_frame()

        # Split into paragraphs on runs of blank lines
        df = df['line_str'].str.split(r'\n\n+', expand=True).stack().to_frame().rename(columns={0:'para_str'})
        df.index.names = OHCO[1:4]  # Edit the slice limits for book vs. chapter level chunking
        # Flatten the remaining line breaks; a literal newline is replaced, so
        # the result does not depend on the pandas default for `regex`.
        df['para_str'] = df['para_str'].str.replace('\n', ' ', regex=False).str.strip()
        is_empty = df['para_str'].str.match(r'^\s*$')

        # Drop empty paragraphs, then set the index
        df = df[~is_empty].assign(book_id=book_id).reset_index().set_index(OHCO[:4])

        # Register
        my_lib.append((book_id, book, author, year, text_file))
        my_doc.append(df)

    docs = pd.concat(my_doc)
    library = pd.DataFrame(my_lib, columns=['book_id', 'book', 'author', 'year', 'book_file']).set_index('book_id')
    print("Done.")
    return library, docs

In [69]:
# Collect the corpus files in sorted (lexicographic) path order, then build
# the library and paragraph tables from them.
corpus = sorted(glob(corpus_dir + '/*.txt'))
LIB, DOC = acquire_corpus(corpus, chap_pats)

BOOK ID 10
BOOK ID 11
BOOK ID 12
BOOK ID 1
BOOK ID 2



The default value of regex will change from True to False in a future version.



BOOK ID 3
BOOK ID 4
BOOK ID 5
BOOK ID 6
BOOK ID 7
BOOK ID 8
BOOK ID 9
Done.


In [70]:
LIB

Unnamed: 0_level_0,book,author,year,book_file
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,The Four Loves,Lewis,1960,corpus\10_Lewis_1960_The-Four-Loves.txt
11,Don't Waste Your Life,Piper,2003,corpus\11_Piper_2003_Don't-Waste-Your-Life.txt
12,Fifty Reasons Why Jesus Came to Die,Piper,2004,corpus\12_Piper_2004_Fifty-Reasons-Why-Jesus-C...
1,"Letters of John Calvin, Volume I",Calvin,1536,"corpus\1_Calvin_1536_Letters-of-John-Calvin,-V..."
2,The Institutes of the Christian Religion,Calvin,1541,corpus\2_Calvin_1541_The-Institutes-of-the-Chr...
3,The Pilgrim's Progress,Bunyan,1678,corpus\3_Bunyan_1678_The-Pilgrim's-Progress.txt
4,The Holy War,Bunyan,1682,corpus\4_Bunyan_1682_The-Holy-War.txt
5,Religious Affections,Edwards,1746,corpus\5_Edwards_1746_Religious-Affections.txt
6,Select Sermons,Edwards,1750,corpus\6_Edwards_1750_Select-Sermons.txt
7,Evidences of the Christian Religion,Alexander,1832,corpus\7_Alexander_1832_Evidences-of-the-Chris...


In [71]:
DOC.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,para_str
book_id,book_num,chap_num,para_num,Unnamed: 4_level_1
6,1,12,54,"Many, when they think they are converted, seem..."
9,1,3,13,"A nobler analogy, sanctioned by the constant t..."
1,1,39,9,"[_Calvin's Lat. Corresp._, Opera, tom. ix. p. ..."
6,1,13,5,"And thus the apostle proves, that no flesh can..."
2,2,34,53,[262] Ephes. 1:20; Phil. 2:9; 1 Cor. 15:27; Ep...
1,1,94,15,"Adieu, my excellent and highly esteemed brothe..."
11,1,2,58,So here is the question to test whether you ha...
6,1,13,19,And there are actual wickednesses without numb...
6,1,16,29,How happy would you be if your hearts were but...
12,1,26,10,"For Christ has entered... into heaven itself, ..."


# Tokenize and Annotate
Here we use NLTK functions to tokenize & annotate our dataframe.

In [72]:
def tokenize(doc_df, OHCO=OHCO, remove_pos_tuple=False, ws=False):
    """Split paragraphs into sentences and part-of-speech-tagged tokens.

    Parameters
    ----------
    doc_df : DataFrame
        Table with a ``para_str`` column holding one paragraph per row.
    OHCO : list of str
        Names assigned to the index levels of the result; its length must
        equal the number of index levels of ``doc_df`` plus two (sentence
        and token).
    remove_pos_tuple : bool
        If True, drop the raw ``pos_tuple`` column from the result.
    ws : bool
        If True, split sentences into tokens on whitespace only; otherwise
        use ``nltk.word_tokenize``.

    Returns
    -------
    DataFrame
        One row per token with columns ``pos_tuple`` (unless removed),
        ``pos`` and ``token_str``.
    """
    # Paragraphs to Sentences
    df = doc_df.para_str\
        .apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
        .stack()\
        .to_frame()\
        .rename(columns={0:'sent_str'})

    # Sentences to Tokens
    # Pick the tokenizer once instead of re-creating it for every sentence.
    if ws:
        split_tokens = nltk.WhitespaceTokenizer().tokenize
    else:
        split_tokens = nltk.word_tokenize  # Discards stuff in between

    def tag_sentence(sent_str):
        # One (token, POS tag) tuple per token in the sentence
        return pd.Series(nltk.pos_tag(split_tokens(sent_str)))

    df = df.sent_str\
        .apply(tag_sentence)\
        .stack()\
        .to_frame()\
        .rename(columns={0:'pos_tuple'})

    # Grab info from tuple
    df['pos'] = df.pos_tuple.apply(lambda x: x[1])
    df['token_str'] = df.pos_tuple.apply(lambda x: x[0])
    if remove_pos_tuple:
        # Keyword form: the positional axis argument was removed in pandas 2.0
        df = df.drop(columns='pos_tuple')

    # Add index
    df.index.names = OHCO

    return df

In [102]:
#%%time #show runtime
TOKEN = tokenize(DOC, ws=False)

In [103]:
TOKEN.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,pos_tuple,pos,token_str
book_id,book_num,chap_num,para_num,sent_num,token_num,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10,1,1,1,0,0,"(Introduction, NN)",NN,Introduction
10,1,1,2,0,0,"(``, ``)",``,``
10,1,1,2,0,1,"(God, NNP)",NNP,God
10,1,1,2,0,2,"(is, VBZ)",VBZ,is
10,1,1,2,0,3,"(love, RBR)",RBR,love


# Reduce

Extract a vocabulary from the TOKEN table

In [104]:
# Normalize tokens into terms: lower-case, then strip every non-word character
# and underscore. `regex=True` is explicit because the pattern is a character
# class; if it were treated as a literal string nothing would be stripped.
TOKEN['term_str'] = TOKEN['token_str'].str.lower().str.replace(r'[\W_]', '', regex=True)


The default value of regex will change from True to False in a future version.



In [105]:
# Build the vocabulary: one row per distinct term with its corpus count `n`,
# sorted alphabetically so that the positional index can serve as term_id.
# The index and the counts are named explicitly rather than renamed after the
# fact, so the result does not depend on which default names value_counts()
# produces in the installed pandas version.
VOCAB = TOKEN.term_str.value_counts()\
    .rename_axis('term_str').rename('n')\
    .sort_index().reset_index()
VOCAB.index.name = 'term_id'

In [106]:
# Flag terms that begin with one or more digits (str.match anchors at the
# start of the string). Raw string avoids the invalid '\d' string escape.
VOCAB['num'] = VOCAB.term_str.str.match(r"\d+").astype('int')

In [107]:
VOCAB.head()

Unnamed: 0_level_0,term_str,n,num
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,,288641,0
1,0.0,1,1
2,1.0,1,1
3,1.0,1566,1
4,10.0,256,1


## Annotate VOCAB

### Add Stopwords

We use NLTK's built-in stopword list for English and add a few stopwords of our own, including names and places that recur in the headers and footers of John Calvin's letters, among others. Terms can be added to or removed from this list as needed.

In [115]:
# Stopword lookup table: indexed by term_str, with a constant `dummy` column
# of 1 so it can be mapped onto VOCAB.term_str.
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])

# Add our own stopwords:
stop_list = {'term_str': ['geneva', 'charles', 'francis', 'calvin', 'strasbourg', 'france', 'caroli']}
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
sw = pd.concat([sw, pd.DataFrame(stop_list)])

# reset_index() turns the old positional index into a column; it is renamed
# to 'dummy' and overwritten with the flag value.
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

In [117]:
sw

Unnamed: 0_level_0,dummy
term_str,Unnamed: 1_level_1
i,1
me,1
my,1
myself,1
we,1
...,...
francis,1
calvin,1
strasbourg,1
france,1


In [118]:
VOCAB['stop'] = VOCAB.term_str.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

In [122]:
VOCAB[VOCAB.stop == 1].sample(5)

Unnamed: 0_level_0,term_str,n,num,stop
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12209,francis,62,0,1
13677,his,14732,0,1
26412,them,7846,0,1
26419,then,2908,0,1
27517,under,1551,0,1


### Remove Stopwords & Numbers from VOCAB

In [123]:
VOCAB = VOCAB[VOCAB.stop == 0]
VOCAB = VOCAB[VOCAB.num == 0]
VOCAB = VOCAB[VOCAB.term_str != ''] # remove empty string term

In [124]:
VOCAB.sort_values(by='n', ascending=False).head(5) 

Unnamed: 0_level_0,term_str,n,num,stop
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12782,god,13132,0,0
28025,us,5896,0,0
5965,christ,5738,0,0
17181,may,4635,0,0
18869,one,4431,0,0


### Add (Porter) Stems

In [125]:
#Add Porter stems using the PorterStemmer module:
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.term_str.apply(stemmer1.stem)

In [126]:
VOCAB.sample(10)

Unnamed: 0_level_0,term_str,n,num,stop,stem_porter
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9756,dulled,1,0,0,dull
18692,obstruction,3,0,0,obstruct
26237,tempations,1,0,0,tempat
28036,uselessly,2,0,0,uselessli
17455,metrical,1,0,0,metric
22836,revenger,1,0,0,reveng
25280,stirs,8,0,0,stir
25725,summer,18,0,0,summer
8453,depicted,4,0,0,depict
7046,consul,5,0,0,consul


### Add term_id to TOKEN table

We need to do this to combine the VOCAB and TOKEN tables more efficiently.
We use `.map()` because TOKEN and VOCAB do not share an index at this time.

In [127]:
VOCAB = VOCAB[~VOCAB.term_str.isna()]
VOCAB = VOCAB[VOCAB.term_str != '']
TOKEN = TOKEN[~TOKEN.term_str.isna()]
TOKEN = TOKEN[TOKEN.term_str != '']

In [128]:
TOKEN['term_id'] = TOKEN.term_str.map(VOCAB.reset_index().set_index('term_str').term_id)

In [129]:
TOKEN.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,pos_tuple,pos,token_str,term_str,term_id
book_id,book_num,chap_num,para_num,sent_num,token_num,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10,1,1,1,0,0,"(Introduction, NN)",NN,Introduction,introduction,15236.0
10,1,1,2,0,1,"(God, NNP)",NNP,God,god,12782.0
10,1,1,2,0,2,"(is, VBZ)",VBZ,is,is,
10,1,1,2,0,3,"(love, RBR)",RBR,love,love,16648.0
10,1,1,2,0,6,"(says, VBZ)",VBZ,says,says,23510.0
10,1,1,2,0,7,"(St., NNP)",NNP,St.,st,25076.0
10,1,1,2,0,8,"(John, NNP)",NNP,John,john,15574.0
10,1,1,2,1,0,"(When, WRB)",WRB,When,when,
10,1,1,2,1,1,"(I, PRP)",PRP,I,i,
10,1,1,2,1,2,"(first, RB)",RB,first,first,11752.0


### Add Most Frequently Associated POS for each Term in VOCAB:

In [130]:
VOCAB['pos_max'] = TOKEN.groupby(['term_id', 'pos']).pos.count().unstack().idxmax(1)

In [131]:
VOCAB.sample(10)

Unnamed: 0_level_0,term_str,n,num,stop,stem_porter,pos_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
28643,wakes,2,0,0,wake,NNS
20075,pia,1,0,0,pia,VBP
17718,mocked,20,0,0,mock,VBN
27740,unless,633,0,0,unless,IN
6157,clears,6,0,0,clear,VBZ
8478,deprecated,2,0,0,deprec,VBD
14216,immorality,2,0,0,immor,NN
23232,saccharine,1,0,0,saccharin,JJ
24878,spatial,1,0,0,spatial,JJ
10417,enslaving,1,0,0,enslav,VBG


# Setup for BOW & TFIDF

In [132]:
count_method = 'n' # 'c' or 'n' # n = n tokens, c = distinct token (term) count
tf_method = 'sum' # sum, max, log, double_norm, raw, binary
tf_norm_k = .5 # only used for double_norm
idf_method = 'standard' # standard, max, smooth
gradient_cmap = 'YlGnBu' # YlGn, GnBu, YlGnBu; For tables; see https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html 

OHCO = ['book_id', 'book_num', 'chap_num', 'para_num', 'sent_num', 'token_num']
SENTS = OHCO[:5]
PARAS = OHCO[:4]
CHAPS = OHCO[:3]
BOOKS = OHCO[:1]

### Add Term Rank to VOCAB

In [133]:
# Rank terms by descending corpus count (rank 1 = most frequent term).
# The guard skips the work when 'term_rank' is already a column, e.g. when
# this cell is run a second time.
if 'term_rank' not in VOCAB.columns:
    VOCAB = VOCAB.sort_values('n', ascending=False).reset_index()
    VOCAB.index.name = 'term_rank'  # the fresh 0-based positional index is the rank
    VOCAB = VOCAB.reset_index()
    VOCAB = VOCAB.set_index('term_id')  # restore term_id as the index
    VOCAB['term_rank'] = VOCAB['term_rank'] + 1  # shift ranks to start at 1

### Alternate Rank
The `term_rank` above assigns different ranks to words with the same frequency, which happens throughout the long tail (e.g., among words that appear only once).
We now add a `term_rank2` that gives every word with the same term count the same rank.

In [134]:
# Lookup table from term count `n` to a 0-based rank over the DISTINCT counts
# (highest count first); `nn` is the number of terms sharing that count.
# The index and the values are named explicitly so the table has the same
# columns regardless of the default names value_counts() produces.
new_rank = VOCAB.n.value_counts()\
    .sort_index(ascending=False)\
    .rename_axis('n').rename('nn')\
    .reset_index()
new_rank.index.name = 'term_rank2'
new_rank = new_rank.reset_index().set_index('n')

VOCAB['term_rank2'] = VOCAB.n.map(new_rank.term_rank2) + 1  # ranks start at 1
# NOTE(review): `p` divides by the number of distinct terms (VOCAB.shape[0]),
# not by the total token count, so it is not a probability; `p2`, computed
# further down, is n / n.sum(). Left unchanged because zipf_k3 is built on it.
VOCAB['p'] = VOCAB.n / VOCAB.shape[0]

In [135]:
VOCAB.sample(5)

Unnamed: 0_level_0,term_rank,term_str,n,num,stop,stem_porter,pos_max,term_rank2,p
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3956,15915,avows,2,0,0,avow,NNS,620,7.3e-05
19630,11155,peaceably,4,0,0,peaceabl,RB,618,0.000147
23639,94,scripture,1025,0,0,scriptur,NNP,90,0.03755
16126,606,learn,222,0,0,learn,VB,401,0.008133
21696,25018,quæs,1,0,0,quæ,NNP,621,3.7e-05


### Compute Zipf's K

Zipf's Law:

$f \propto \frac{1}{r} $

$k =  fr$

In [136]:
VOCAB['zipf_k'] = VOCAB.n * VOCAB.term_rank
VOCAB['zipf_k2'] = VOCAB.n * VOCAB.term_rank2
VOCAB['zipf_k3'] = VOCAB.p * VOCAB.term_rank2

### VOCAB Entropy
Compute P of VOCAB - This is the prior, or marginal, probability of a term.

In [137]:
VOCAB['p2'] = VOCAB.n / VOCAB.n.sum()

### Compute Entropy of VOCAB

In [138]:
VOCAB['h'] = VOCAB.p2 * np.log2(1/VOCAB.p2) # Self entropy of each word
H = VOCAB.h.sum()          # entropy of the vocabulary distribution, in bits
N_v = VOCAB.shape[0]       # number of distinct terms
H_max = np.log2(N_v)       # entropy if every term were equally likely
# Redundancy as a whole-number percentage. Scale to percent BEFORE rounding:
# rounding to two decimals and then multiplying by 100 can yield values such
# as 28.999999999999996, which int() would truncate to 28.
R = float(round((1 - (H/H_max)) * 100))

print("H \t= {}\nH_max \t= {}\nR \t= {}%".format(H, H_max, int(R)))

H 	= 11.714866997124313
H_max 	= 14.736454784066483
R 	= 21%


In [139]:
VOCAB.sample(5)

Unnamed: 0_level_0,term_rank,term_str,n,num,stop,stem_porter,pos_max,term_rank2,p,zipf_k,zipf_k2,zipf_k3,p2,h
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
8164,2394,deeply,54,0,0,deepli,RB,568,0.001978,129276,30672,1.12364,7.2e-05,0.000989
17201,10365,meals,5,0,0,meal,NNS,617,0.000183,51825,3085,0.113016,7e-06,0.000114
21484,13419,purposeful,3,0,0,purpos,JJ,619,0.00011,40257,1857,0.068029,4e-06,7.2e-05
17831,24199,montaigne,1,0,0,montaign,NNP,621,3.7e-05,24199,621,0.02275,1e-06,2.6e-05
14536,25896,incontestable,1,0,0,incontest,JJ,621,3.7e-05,25896,621,0.02275,1e-06,2.6e-05


In [212]:
VOCAB.sort_values(by='term_rank2', ascending=True).head(20)

Unnamed: 0_level_0,term_rank,term_str,n,num,stop,stem_porter,pos_max,term_rank2,p,zipf_k,zipf_k2,zipf_k3,p2,h
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
12782,1,god,13132,0,0,god,NNP,1,0.481079,13132,13132,0.481079,0.017468,0.101998
28025,2,us,5896,0,0,us,PRP,2,0.215994,11792,11792,0.431989,0.007843,0.054856
5965,3,christ,5738,0,0,christ,NNP,3,0.210206,17214,17214,0.630619,0.007633,0.053685
17181,4,may,4635,0,0,may,MD,4,0.169799,18540,18540,0.679196,0.006165,0.045264
18869,5,one,4431,0,0,one,CD,5,0.162326,22155,22155,0.811628,0.005894,0.043655
29227,6,would,4184,0,0,would,MD,6,0.153277,25104,25104,0.919662,0.005566,0.041682
16934,7,man,3705,0,0,man,NN,7,0.135729,25935,25935,0.950104,0.004928,0.037774
16610,8,lord,3330,0,0,lord,NNP,8,0.121991,26640,26640,0.975931,0.00443,0.034633
17329,9,men,3037,0,0,men,NNS,9,0.111258,27333,27333,1.001319,0.00404,0.032122
24044,10,shall,2950,0,0,shall,MD,10,0.10807,29500,29500,1.080705,0.003924,0.031367


# Save tables to csv:

In [141]:
DOC.to_csv('DOC.csv')
LIB.to_csv('LIB.csv')
VOCAB.to_csv('VOCAB.csv')
TOKEN.to_csv('TOKEN.csv')

## Save to SQLite:

In [None]:
#import sqlite3

#TOKEN2 = TOKEN.drop('pos_tuple', 1)

#with sqlite3.connect('mod4-corpus.db') as db:
#    DOC.to_sql('doc', db, index=True, if_exists='replace')
#    LIB.to_sql('lib', db, index=True, if_exists='replace')
#    VOCAB.to_sql('vocab', db, index=True, if_exists='replace')
#    TOKEN2.to_sql('token', db, index=True, if_exists='replace')

### Breakdown table to view most prevalent terms & POS by author:

In [187]:
author_terms = TOKEN.reset_index().merge(LIB.reset_index()[['book_id', 'author']], on='book_id')
author_terms = author_terms[~author_terms.term_str.isin(sw.index)]

In [192]:
def _term_counts_for(author_name):
    """Return term frequencies (most frequent first) for one author's tokens."""
    author_rows = author_terms[author_terms.author == author_name]
    return author_rows.term_str.value_counts(ascending=False)


# One frequency table per author in the corpus
calvin_terms = _term_counts_for('Calvin')
bunyan_terms = _term_counts_for('Bunyan')
edwards_terms = _term_counts_for('Edwards')
alexander_terms = _term_counts_for('Alexander')
lewis_terms = _term_counts_for('Lewis')
piper_terms = _term_counts_for('Piper')

In [213]:
calvin_terms.head(15)

god          6517
us           3840
christ       2863
may          2388
one          2151
lord         2110
would        1875
church       1815
man          1538
shall        1385
without      1384
therefore    1347
faith        1330
must         1305
also         1270
Name: term_str, dtype: int64

In [214]:
bunyan_terms.head(15)

mansoul      1293
said         1195
town          931
mr            924
also          847
thou          658
one           658
lord          657
upon          653
christian     621
man           615
would         582
come          577
shall         534
good          531
Name: term_str, dtype: int64

In [215]:
edwards_terms.head(15)

god       4269
christ    1777
things    1277
great     1190
may       1119
men        929
shall      823
love       813
man        789
much       757
spirit     755
would      742
one        697
upon       683
nature     678
Name: term_str, dtype: int64

In [216]:
alexander_terms.head(15)

would       468
moral       438
god         388
may         360
man         353
men         325
one         320
every       252
must        241
reason      211
could       202
truth       195
us          186
mind        185
religion    181
Name: term_str, dtype: int64

In [217]:
lewis_terms.head(15)

god       501
love      492
one       317
us        316
man       312
may       300
would     299
even      240
must      219
like      214
good      198
need      153
nature    145
life      136
say       135
Name: term_str, dtype: int64

In [218]:
piper_terms.head(15)

god       1203
christ     892
us         507
life       463
jesus      319
one        288
death      252
people     245
world      234
love       222
would      218
glory      196
work       182
joy        178
way        175
Name: term_str, dtype: int64