# Derived Tables
Creating BOW, DTM, and TFIDF representations of the corpus

In [1]:
# import libraries and setup
import pandas as pd
import numpy as np
import seaborn as sns
import plotly_express as px
from numpy.linalg import norm

# Index levels of the token table, listed from coarsest to finest.
OHCO = ["book_id", "chap_id", "para_num", "sent_num", "token_num"]

# Each bag name maps to the leading OHCO levels that identify one document.
bags = {
    name: OHCO[:depth]
    for name, depth in (('SENTS', 4), ('PARAS', 3), ('CHAPS', 2), ('BOOKS', 1))
}
# Bag used for every derived table in this notebook.
bag = 'CHAPS'

# NOTE(review): absolute, machine-specific path; other users must edit it.
output_dir = "/Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/output"

In [2]:
# load the LIB, TOKEN and VOCAB tables from output_dir
LIB = (
    pd.read_csv(f"{output_dir}/lib.csv")
    .set_index('book_id')
)
# rows containing any missing value are dropped from TOKEN and VOCAB
TOKEN = (
    pd.read_csv(f'{output_dir}/corpus.csv')
    .set_index(OHCO)
    .dropna()
)
VOCAB = (
    pd.read_csv(f'{output_dir}/vocab.csv')
    .set_index('term_str')
    .dropna()
)

## Creating BOW from TOKEN

In [3]:
# count how often each term occurs in each bag (document); the count column is 'n'
BOW = (
    TOKEN
    .groupby([*bags[bag], 'term_str'])['term_str']
    .count()
    .to_frame('n')
)
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
book_id,chap_id,term_str,Unnamed: 3_level_1
BetweenTheActs,1,a,59
BetweenTheActs,1,about,7
BetweenTheActs,1,above,1
BetweenTheActs,1,across,1
BetweenTheActs,1,actual,1
...,...,...,...
ToTheLighthouse,43,with,4
ToTheLighthouse,43,without,1
ToTheLighthouse,43,would,2
ToTheLighthouse,43,wreath,1


## Creating DTM (document-term count matrix) from BOW

In [4]:
# pivot the innermost index level (term_str) into columns; absent terms count as 0
DTCM = BOW['n'].unstack(level=-1, fill_value=0)
DTCM.head(10)

Unnamed: 0_level_0,term_str,1,10,1030,10th,112,1215,12th,1397,1580,1586,...,zone,zoo,zoological,zoology,zoom,zwinglers,à,éclair,éclairs,êtres
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
BetweenTheActs,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BetweenTheActs,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BetweenTheActs,3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BetweenTheActs,4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BetweenTheActs,5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BetweenTheActs,6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BetweenTheActs,7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BetweenTheActs,8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BetweenTheActs,9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BetweenTheActs,10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Creating TFIDF table

In [5]:
# 1. compute TF

# Weight K for the 'double_norm' method: tf = K + (1 - K) * (n / max n).
tf_norm_k = 0.5

# define some methods
# DTCM is transposed so the per-document Series (sum / max) aligns with the
# columns during division; the result is transposed back to documents x terms.
tf = {
    'sum': (DTCM.T / DTCM.T.sum()).T,
    'max': (DTCM.T / DTCM.T.max()).T,
    'log': (np.log2(1 + DTCM.T)).T,
    'raw':  DTCM,
    # This entry used to be the same expression as 'max'. Double normalization
    # rescales the max-normalized count into [K, 1]; terms that do not occur
    # in a document stay at 0 so the matrix remains sparse.
    'double_norm': (
        tf_norm_k + (1 - tf_norm_k) * (DTCM.T / DTCM.T.max())
    ).where(DTCM.T > 0, other=0.0).T,
    'binary': DTCM.T.astype('bool').astype('int').T
}

# define parameter
tf_method = 'sum'

# compute TF
TF = tf[tf_method]
TF

Unnamed: 0_level_0,term_str,1,10,1030,10th,112,1215,12th,1397,1580,1586,...,zone,zoo,zoological,zoology,zoom,zwinglers,à,éclair,éclairs,êtres
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
BetweenTheActs,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BetweenTheActs,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BetweenTheActs,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BetweenTheActs,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BetweenTheActs,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ToTheLighthouse,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ToTheLighthouse,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ToTheLighthouse,41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ToTheLighthouse,42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# 2. compute DF
# number of documents (rows of DTCM) in which each term has a non-zero count
DF = DTCM.ne(0).sum(axis=0)
DF

term_str
1            4
10           1
1030         1
10th         1
112          1
            ..
zwinglers    2
à            1
éclair       1
éclairs      1
êtres        1
Length: 24286, dtype: int64

In [7]:
# 3. compute IDF

# N is the number of documents, i.e. the number of rows in DTCM
N = len(DTCM)

# available IDF variants, keyed by method name
idf = dict(
    standard=np.log2(N / DF),
    max=np.log2(DF.max() / DF),
    smooth=np.log2((N + 1) / (DF + 1)) + 1,
)

# method selected for this run
idf_method = 'standard'

# look up the selected variant
IDF = idf[idf_method]
IDF

term_str
1            5.707359
10           7.707359
1030         7.707359
10th         7.707359
112          7.707359
               ...   
zwinglers    6.707359
à            7.707359
éclair       7.707359
éclairs      7.707359
êtres        7.707359
Length: 24286, dtype: float64

In [8]:
# 4. compute TFIDF
# each term column of TF is scaled by that term's IDF (aligned on term_str)
TFIDF = TF.mul(IDF, axis='columns')
TFIDF

Unnamed: 0_level_0,term_str,1,10,1030,10th,112,1215,12th,1397,1580,1586,...,zone,zoo,zoological,zoology,zoom,zwinglers,à,éclair,éclairs,êtres
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
BetweenTheActs,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BetweenTheActs,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BetweenTheActs,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BetweenTheActs,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BetweenTheActs,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ToTheLighthouse,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ToTheLighthouse,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ToTheLighthouse,41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ToTheLighthouse,42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Finalize derived tables

In [9]:
# attach document frequency and IDF to VOCAB (aligned on term_str)
VOCAB = VOCAB.assign(df=DF, idf=IDF)

# attach TF and TFIDF to BOW; stacking the matrices back to long form
# lets the values align on BOW's index
BOW = BOW.assign(tf=TF.stack(), tfidf=TFIDF.stack())

In [10]:
# DFIDF is the product of a term's document frequency and its IDF
VOCAB['dfidf'] = VOCAB['df'].mul(VOCAB['idf'])

# show the 20 terms with the highest DFIDF
VOCAB[['dfidf']].sort_values('dfidf', ascending=False).head(20)

Unnamed: 0_level_0,dfidf
term_str,Unnamed: 1_level_1
picture,110.92409
happy,110.92409
answer,110.92409
faces,110.92409
stop,110.92409
paused,110.92409
sure,110.92409
fixed,110.92409
drawn,110.92409
legs,110.92409


## Reducing and normalizing TFIDF

In [11]:
# keep only the 1000 terms with the highest DFIDF
top_1000 = VOCAB.sort_values(by='dfidf', ascending=False).index[:1000]
TFIDF_REDUCED = TFIDF.loc[:, top_1000]

# divide each document (row) vector by its norm, as returned by numpy.linalg.norm
L2 = TFIDF_REDUCED.apply(lambda doc: doc / norm(doc), axis=1)
L2

Unnamed: 0_level_0,term_str,picture,happy,answer,faces,stop,paused,sure,fixed,drawn,legs,...,curled,golden,emotions,parties,children,anything,coming,end,morning,yellow
book_id,chap_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
BetweenTheActs,1,0.000000,0.000000,0.000000,0.0,0.000000,0.135901,0.000000,0.000000,0.000000,0.000000,...,0.075012,0.000000,0.0,0.0,0.000000,0.049590,0.000000,0.000000,0.097687,0.024422
BetweenTheActs,2,0.116686,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.058343,0.000000,0.058343,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.031934,0.000000,0.125812,0.062906
BetweenTheActs,3,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.041454,0.000000,0.041454,0.000000,0.040829,0.000000
BetweenTheActs,4,0.000000,0.000000,0.033218,0.0,0.000000,0.000000,0.000000,0.099655,0.033218,0.000000,...,0.000000,0.055006,0.0,0.0,0.054546,0.000000,0.018182,0.000000,0.053725,0.035817
BetweenTheActs,5,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.088772,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.097179,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ToTheLighthouse,39,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ToTheLighthouse,40,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.056030,...,0.092779,0.000000,0.0,0.0,0.000000,0.030668,0.000000,0.061336,0.000000,0.000000
ToTheLighthouse,41,0.153377,0.000000,0.000000,0.0,0.051126,0.000000,0.051126,0.025563,0.000000,0.025563,...,0.000000,0.000000,0.0,0.0,0.083950,0.027983,0.027983,0.013992,0.082686,0.000000
ToTheLighthouse,42,0.000000,0.082012,0.000000,0.0,0.000000,0.000000,0.041006,0.000000,0.000000,0.041006,...,0.067902,0.000000,0.0,0.0,0.044889,0.044889,0.000000,0.044889,0.000000,0.022107


## Saving tables

In [12]:
# Write the derived tables into output_dir, the directory the input tables
# were read from, so the location is configured in one place only.
BOW.to_csv(f"{output_dir}/bow.csv")
DTCM.to_csv(f"{output_dir}/dtcm.csv")
TFIDF.to_csv(f"{output_dir}/tfidf.csv")
L2.to_csv(f"{output_dir}/tfidf_l2.csv")