# Exploratory Text Analysis: Creating Tables

**Student:** Ian Yung  
**Class:** DS 5001

---

## Introduction

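Welcome to my exploratory text analysis project! In this notebook, I build four tables from the tokenized corpus: a bag-of-words table (BOW), a document-term count matrix (DTM), a TFIDF table, and a reduced, L2-normalized TFIDF table (TFIDF_L2).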

---

## Table of Contents

1. [Introduction](#Introduction)
2. [Load Data](#Load-Data)
3. [Generate BOW](#Generate-BOW)
4. [Generate DTM](#Generate-DTM)
5. [Generate Reduced/Normalized TFIDF_L2](#Generate-Reduced/Normalized-TFIDF_L2)
6. [Save Data](#Save-Data)


Feel free to navigate through the sections using the links provided above.


In [1]:
# a little overkill with the imports but I'm taking no chances
import pandas as pd
import numpy as np
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import plotly_express as px
import seaborn as sns
from IPython.display import display, HTML

import re

import nltk
from nltk import pos_tag

from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

import gensim
from scipy.linalg import norm, eigh
from gensim.corpora import Dictionary
from gensim.models import LdaModel, word2vec
from sklearn.manifold import TSNE as tsne

# Load Data

In [2]:
# Column names that together form the CORPUS index, listed from the
# outermost level (book) down to the innermost (token).
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']

In [3]:
# Read the corpus table and index it by the OHCO columns in one step.
CORPUS = pd.read_csv("data/CORPUS.csv").set_index(OHCO)

# The vocabulary table keeps its default integer index for now.
VOCAB = pd.read_csv("data/VOCAB.csv")

# Generate BOW

In [4]:
def create_bow(CORPUS, bag, item_type='term_str'):
    '''
    Build a bag-of-words table from a corpus (adapted from M05_HW_KEY.ipynb).

    Parameters:
        CORPUS: DataFrame containing an `item_type` column; the names in
            `bag` must be resolvable by `groupby` (index levels or columns).
        bag: list of names that together identify one document (bag).
        item_type: name of the column holding the item to count.

    Returns:
        DataFrame indexed by `bag + [item_type]` with a single column `n`,
        the number of non-null `item_type` values in each group.
    '''
    group_keys = bag + [item_type]
    item_counts = CORPUS.groupby(group_keys)[item_type].count()
    return item_counts.to_frame('n')

def get_tfidf(BOW, tf_method='max', df_method='standard', item_type='term_str'):
    '''
    Borrowed from M06_HW_KEY.ipynb!

    Generates both TFIDF and DFIDF values from BOW.

    Parameters:
        BOW: DataFrame with a count column `n` and a MultiIndex whose last
            level is the term; the remaining levels identify the document.
        tf_method: term-frequency variant, one of
            'sum'  - count divided by the document's total count,
            'max'  - count divided by the document's largest count,
            'log'  - log2(count + 1),
            'raw'  - the count itself,
            'bool' - 1 if the term occurs in the document, else 0.
        df_method: inverse-document-frequency variant, one of
            'standard'       - log2(N / DF),
            'textbook'       - log2(N / (DF + 1)),
            'sklearn'        - log2(N / DF) + 1,
            'sklearn_smooth' - log2((N + 1) / (DF + 1)) + 1.
        item_type: currently unused; kept so existing callers keep working.

    Returns:
        (TFIDF, DFIDF): TFIDF is a document-by-term DataFrame with 0 where a
        term does not occur in a document; DFIDF is a Series indexed by term.

    Raises:
        ValueError: if `tf_method` or `df_method` is not one of the above.
    '''

    # Doc-term count matrix. (doc, term) pairs missing from BOW become NaN,
    # which the DF computation below relies on.
    DTCM = BOW.n.unstack()

    if tf_method == 'sum':
        TF = (DTCM.T / DTCM.T.sum()).T
    elif tf_method == 'max':
        TF = (DTCM.T / DTCM.T.max()).T
    elif tf_method == 'log':
        TF = (np.log2(DTCM.T + 1)).T
    elif tf_method == 'raw':
        TF = DTCM
    elif tf_method == 'bool':
        # NaN casts to True under astype('bool'), which would mark every
        # term as present in every document. Zero-fill the gaps first so
        # only real occurrences map to 1.
        TF = DTCM.fillna(0).astype('bool').astype('int')
    else:
        raise ValueError(f"TF method {tf_method} not found.")

    # Document frequency: count() skips NaN, so this is the number of
    # documents in which each term actually occurs.
    DF = DTCM.count()
    N_docs = len(DTCM)

    if df_method == 'standard':
        IDF = np.log2(N_docs/DF) # This what the students were asked to use
    elif df_method == 'textbook':
        IDF = np.log2(N_docs/(DF + 1))
    elif df_method == 'sklearn':
        IDF = np.log2(N_docs/DF) + 1
    elif df_method == 'sklearn_smooth':
        IDF = np.log2((N_docs + 1)/(DF + 1)) + 1
    else:
        raise ValueError(f"DF method {df_method} not found.")

    # IDF is indexed by term, so it broadcasts across the term columns.
    TFIDF = TF * IDF

    DFIDF = DF * IDF

    # Terms absent from a document get a TFIDF of 0 rather than NaN.
    TFIDF = TFIDF.fillna(0)

    return TFIDF, DFIDF

In [5]:
# A "bag" here is one chapter: the first two OHCO levels (book_id, chap_num).
CHAP = OHCO[:2]
# Count every term within each chapter.
BOW = create_bow(CORPUS, CHAP)
# Uses get_tfidf's defaults (tf_method='max', df_method='standard').
# NOTE: `df` is the per-term DFIDF Series returned second by get_tfidf,
# not a DataFrame and not raw document frequency.
TFIDF, df = get_tfidf(BOW)

In [6]:
# Stack the chapter-by-term TFIDF matrix into long form; the assignment
# aligns it with BOW's (book_id, chap_num, term_str) index.
BOW['tfidf'] = TFIDF.stack()

# Drop vocabulary rows with no term string, key the table by term, then add
# the per-term DFIDF (aligned on the term index).
VOCAB = VOCAB.dropna(subset=['term_str']).set_index('term_str')
VOCAB['dfidf'] = df

In [7]:
# Top 20 terms ranked by DFIDF, highest first
(
    VOCAB
    .reset_index()
    .sort_values('dfidf', ascending=False)['term_str']
    .head(20)
)

11940         lets
19925        swift
12029       likely
23120         wise
22608        watch
18192         sign
1841          bare
16155       raised
6738       enemies
12131       listen
18930        speed
8188        forget
23511        youve
5570     direction
21128      trouble
645          added
15627    presently
4960        danger
3349       carried
20019        tales
Name: term_str, dtype: object

In [8]:
# Inspect the chapter-level BOW: one row per (book_id, chap_num, term_str)
# with its count `n` and its `tfidf` score.
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,tfidf
book_id,chap_num,term_str,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,111,1,0.013909
1,0,130,1,0.012989
1,0,22nd,2,0.034847
1,0,3,1,0.006439
1,0,33,1,0.011199
...,...,...,...,...
14,16,yet,2,0.002129
14,16,you,38,0.018852
14,16,young,1,0.005594
14,16,your,2,0.001741


# Generate DTM

In [9]:
# Mean TFIDF per term for each chapter number, pooled across all books
# (grouping is on chap_num only). The result is displayed, not stored.
TFIDF.fillna(0).groupby('chap_num').mean().stack() # fillna(0) guards the mean against NaN; get_tfidf already zero-fills TFIDF

chap_num  term_str
0         0           0.000000
          1           0.001234
          10          0.005488
          100         0.003645
          10022       0.000000
                        ...   
47        þa          0.000000
          þisses      0.000000
          þunor       0.000000
          þá          0.000000
          þórr        0.000000
Length: 849348, dtype: float64

In [10]:
def gather_docs(CORPUS, ohco_level, term_col='term_str'):
    '''
    Borrowed from M00_01_VectorizationWithSKLearn.ipynb

    Simply condenses the CORPUS to the desired OHCO level.

    Parameters:
        CORPUS: DataFrame with a named MultiIndex and a `term_col` column.
        ohco_level: number of leading index levels that identify one document
            (e.g. 2 groups by the first two levels).
        term_col: name of the column holding the term of each row.

    Returns:
        DataFrame indexed by the first `ohco_level` index levels with a single
        column `doc_str`: the group's terms cast to str and joined by spaces,
        in their original row order.

    The input CORPUS is left unmodified; the string cast is applied to a
    derived Series rather than written back into the caller's frame.
    '''
    doc_levels = list(CORPUS.index.names[:ohco_level])
    # Cast on a separate Series so the caller's column keeps its dtype/values.
    terms = CORPUS[term_col].astype('str')
    DOC = terms.groupby(level=doc_levels).apply(' '.join).to_frame('doc_str')
    return DOC

# One document per paragraph: group on the first three index levels.
# Change the second argument to use a different bag.
DOC = gather_docs(CORPUS, 3)

# Number of whitespace-separated tokens in each document string
DOC['n_tokens'] = DOC.doc_str.str.split().str.len()

In [11]:
# Tunable vectorizer settings
ngram_range = (1,2)  # (min_n, max_n): extract both unigrams and bigrams
n_terms = 4000  # passed as max_features below to cap the vocabulary size

# Count vectorizer that drops scikit-learn's built-in English stop words.
count_engine = CountVectorizer(
    stop_words = 'english',
    ngram_range = ngram_range,
    max_features = n_terms)

# Learn the vocabulary from the document strings and count each feature per document.
X = count_engine.fit_transform(DOC.doc_str)

# Create the DTM from the transformed OHCO:
# rows keep DOC's index, columns are the learned feature names.
DTM = pd.DataFrame(X.toarray(), 
                   columns=count_engine.get_feature_names_out(), 
                   index=DOC.index)

In [12]:
# Inspect the count matrix: one row per document in DOC, one column per feature.
DTM

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,10,11,12,13,15,16,17,18,19,20,...,young,younger,youre,youth,youve,youve got,ælfwine,éomer,éowyn,úrin
book_id,chap_num,para_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,16,54,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,16,55,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,16,56,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,16,57,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Generate Reduced/Normalized TFIDF_L2

In [13]:
# Re-weight the DTM counts with scikit-learn's TF-IDF (IDF enabled) and
# L2-normalize the result (norm='l2').
tfidf_engine = TfidfTransformer(norm='l2', use_idf=True)

# Fit the IDF weights on the DTM and transform it in one step.
X1 = tfidf_engine.fit_transform(DTM)

# Densify the result and restore the DTM's row and column labels.
TFIDF_L2 = pd.DataFrame(X1.toarray(), columns=DTM.columns, index=DTM.index)

In [14]:
# Preview the first ten documents of the normalized TFIDF table.
TFIDF_L2.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,10,11,12,13,15,16,17,18,19,20,...,young,younger,youre,youth,youve,youve got,ælfwine,éomer,éowyn,úrin
book_id,chap_num,para_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.193233,0.0,0.0,0.0,0.0,0.0,0.0
1,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.218248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.124671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Data

In [15]:
# Save the chapter-level BOW (columns `n` and `tfidf`).
BOW.to_csv("data/BOW.csv")

In [16]:
# Save VOCAB with the added `dfidf` column under a new file name, so the
# input file data/VOCAB.csv is not overwritten.
VOCAB.to_csv("data/VOCAB_imp.csv")

In [17]:
# Save the CountVectorizer document-term count matrix.
DTM.to_csv("data/DTM.csv")

In [18]:
# Save the chapter-level TFIDF matrix returned by get_tfidf.
TFIDF.to_csv("data/TFIDF.csv")

In [19]:
# Save the L2-normalized TFIDF table built from the DTM.
TFIDF_L2.to_csv("data/TFIDF_L2.csv")