### Bag-of-Words & TF-IDF Matrix
Bag-of-Words (BoW) places all the words of each text into a 'bucket' or 'bag'. As a result, information about word order and sentence structure is lost.

TF-IDF stands for Term Frequency - Inverse Document Frequency. TF-IDF scores the relative importance of the words, in order to gain an understanding of the texts as a whole. 

The Term Frequency (TF) is the number of times a word appears in the document, divided by the total number of words in the document.

![image.png](attachment:49da2fe1-bf99-43c8-8034-8fb1bb1cd99c.png)

Inverse Document Frequency (IDF) is the log of the total number of documents divided by the number of documents that contain a particular word. IDF gives more weight to words that are rare across the documents in the corpus.

![image.png](attachment:feb7ec5e-60a3-4540-9ba6-6a2f91113c36.png)

TF-IDF is TF multiplied by IDF. 
![image.png](attachment:f327ac95-52e7-474f-9c38-669e25825460.png)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly_express as px

In [2]:
# Apply seaborn's default plot styling.
sns.set()
# Show matplotlib figures inline, below the cell that creates them.
%matplotlib inline

In [26]:
# --- TF-IDF configuration ---

# Which BOW column feeds the count matrix:
#   'n' = number of tokens of the term in the bag
#   'c' = 1 if the term occurs in the bag at all
count_method = 'n'

# Term-frequency variant: 'sum', 'max', 'log', 'double_norm', 'raw' or 'binary'
tf_method = 'sum'

# Constant K for the 'double_norm' TF variant (ignored by every other variant)
tf_norm_k = 0.5

# Inverse-document-frequency variant: 'standard', 'max' or 'smooth'
idf_method = 'standard'

# Colormap name, presumably for gradient-styled table output — TODO confirm where it is used
gradient_cmap = 'YlGnBu'

In [6]:
# OHCO: index levels of the token table, ordered from coarsest to finest
OHCO = ['text_id', 'para_num', 'sent_num', 'token_num']

# Leading slices of OHCO that identify a whole text, a paragraph, and a sentence
TEXTS, PARAS, SENTS = (OHCO[:depth] for depth in (1, 2, 3))

In [7]:
#Texts as bag
# Each bag is one whole text: counts are later grouped by the TEXTS level(s) plus term_id.
bag = TEXTS

In [8]:
#import tables
# pandas' default NA parsing converts literal strings such as 'null', 'nan', 'NA',
# 'None' and 'n/a' into NaN. In a text corpus those are legitimate tokens/terms, and
# they would then be discarded by the missing-value filter further down. Only treat
# truly empty fields as missing when reading the token and vocabulary tables.
NA_OPTS = {'keep_default_na': False, 'na_values': ['']}

LIB = pd.read_csv('LIB.csv').set_index(TEXTS)
TOKEN = pd.read_csv('TOKEN.csv', **NA_OPTS).set_index(OHCO)
VOCAB = pd.read_csv('VOCAB.csv', **NA_OPTS).set_index('term_id')
# NOTE: DOC.csv is not loaded here; the original read of it was commented out.

In [19]:
# Preview the first rows of the token table.
TOKEN.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,pos_tuple,pos,token_str,term_str,term_id
text_id,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16,0,0,"('Every', 'DT')",DT,Every,every,17469
1,16,0,1,"('art', 'NN')",NN,art,art,4390
1,16,0,2,"('and', 'CC')",CC,and,and,3416
1,16,0,3,"('every', 'DT')",DT,every,every,17469
1,16,0,4,"('inquiry,', 'NN')",NN,"inquiry,",inquiry,25045


In [20]:
# Preview the first rows of the library (per-text metadata) table.
LIB.head()

Unnamed: 0_level_0,text_title,text_author,text_file
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,NicomachaenEthics,Aristotle,philostexts\Aristotle_NicomachaenEthics-1.txt
4,OnDuties,Cicero,philostexts\Cicero_OnDuties-4.txt
5,TheOrderofThings,Foucault,philostexts\Foucault_TheOrderofThings-5.txt
6,PedagogyOfTheOppressed,Freire,philostexts\Freire_PedagogyOfTheOppressed-6.txt
7,TheOriginsofTotalitarianism,HannahArendt,philostexts\HannahArendt_TheOriginsofTotalitar...


In [21]:
# Preview the first rows of the vocabulary table.
VOCAB.head()

Unnamed: 0_level_0,term_str,n,num,stop,p_stem,pos_max
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,28,1,0,0,CD
2,0,13,1,0,0,CD
3,1,6,1,0,1,CD
4,11,1,1,0,11,CD
5,13,1,1,0,13,CD


In [14]:
# Keep only rows whose term_str is present (rows with a missing term are dropped)
VOCAB = VOCAB.loc[VOCAB['term_str'].notna()]
TOKEN = TOKEN.loc[TOKEN['term_str'].notna()]

In [27]:
# Rank terms by corpus count 'n': 1 = most frequent term.
# Terms with equal n get consecutive ranks in whatever order the sort leaves them.
# The guard keeps a re-run of this cell from adding the column a second time.
if 'term_rank' not in VOCAB.columns:
    VOCAB = VOCAB.sort_values('n', ascending=False)
    VOCAB.insert(0, 'term_rank', range(1, len(VOCAB) + 1))

In [28]:
#groups words by term count
# Dense rank over the distinct values of n: every term with the same count shares a
# rank, and the largest count gets rank 1 (after the +1 below).
# The table is built explicitly instead of via reset_index()/rename(): the column
# names produced by Series.value_counts().reset_index() changed in pandas 2.0, and
# with the new names the old chain no longer yields a 'term_rank2' column.
n_counts = VOCAB['n'].value_counts().sort_index(ascending=False)
new_rank = pd.DataFrame(
    {
        'term_rank2': np.arange(len(n_counts), dtype='int64'),  # 0-based rank of each distinct n
        'nn': n_counts.to_numpy(),                              # number of terms sharing that n
    },
    index=pd.Index(n_counts.index, name='n'),
)

VOCAB['term_rank2'] = VOCAB['n'].map(new_rank['term_rank2']) + 1
# NOTE(review): this divides by the number of vocabulary entries, not by the total
# token count (VOCAB.n.sum()), so 'p' is not a probability — confirm the intent.
VOCAB['p'] = VOCAB['n'] / VOCAB.shape[0]

In [29]:
# Bag-of-words table: one row per (bag, term_id) pair
#   n = number of tokens of the term found in the bag
#   c = 1 when n is non-zero, else 0
bag_term_keys = bag + ['term_id']
BOW = TOKEN.groupby(bag_term_keys)['term_id'].count().to_frame('n')
BOW['c'] = BOW['n'].ne(0).astype('int')
BOW.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,c
text_id,term_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,18,29,1
1,19,7,1
1,40,7,1
1,55,6,1
1,79,4,1


In [33]:
#Build count matrix
# Pivot the chosen BOW column so that rows are bags and columns are term_ids.
# (bag, term) pairs absent from BOW are filled with 0 during the pivot itself, which
# avoids first materialising a float matrix full of NaN and filling it afterwards.
DTCM = BOW[count_method].unstack(fill_value=0).astype('int')

In [39]:
#function to compute TF (Term Frequency)
#group by Bag (TEXT)
def compute_TF(tf_method, DTCM, norm_k=None):
    """Compute a term-frequency matrix from a document-term count matrix.

    Parameters
    ----------
    tf_method : str
        Which TF variant to compute:
        'sum'         - count divided by the bag's total count
        'max'         - count divided by the bag's largest count
        'log'         - log10(1 + count)
        'raw'         - the counts unchanged
        'double_norm' - norm_k + (1 - norm_k) * count / max count, for terms that
                        occur in the bag; terms that do not occur get 0
        'binary'      - 1 if the count is non-zero, else 0
    DTCM : pandas.DataFrame
        Count matrix with one row per bag and one column per term.
    norm_k : float, optional
        Constant used by 'double_norm'. When omitted, the module-level
        ``tf_norm_k`` is used if it exists, otherwise 0.5.

    Returns
    -------
    pandas.DataFrame
        TF values in the same orientation as DTCM (bags x terms).

    Raises
    ------
    ValueError
        If tf_method is not one of the names listed above.
    """
    # Work on terms x bags so that column-wise reductions (sum/max) are per bag.
    counts = DTCM.T

    if tf_method == 'sum':
        TF = counts / counts.sum()
    elif tf_method == 'max':
        TF = counts / counts.max()
    elif tf_method == 'log':
        TF = np.log10(1 + counts)
    elif tf_method == 'raw':
        TF = counts
    elif tf_method == 'double_norm':
        if norm_k is None:
            # Backward compatible with the notebook-level setting.
            norm_k = globals().get('tf_norm_k', 0.5)
        scaled = counts / counts.max()
        # Only terms present in the bag receive the K baseline. Absent terms are set
        # to 0 (not NaN) so this variant matches the others and NaN does not leak
        # into the TF-IDF product.
        TF = (norm_k + (1 - norm_k) * scaled).where(scaled > 0, 0.0)
    elif tf_method == 'binary':
        TF = counts.astype('bool').astype('int')
    else:
        raise ValueError(
            "Unknown tf_method %r; expected one of 'sum', 'max', 'log', 'raw', "
            "'double_norm', 'binary'" % (tf_method,)
        )

    return TF.T

In [38]:
#call function
# Preview the TF matrix for the configured tf_method; the result is displayed, not stored.
compute_TF(tf_method, DTCM).head()

term_id,1,2,3,4,5,6,7,8,9,10,...,52115,52116,52117,52118,52119,52120,52121,52122,52123,52124
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.4e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1e-05,1e-05,1e-05,1e-05,0.0,0.0,1e-05,1e-05,1.9e-05
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
#Function to compute DF (document frequency) and IDF (inverse document frequency)

def compute_IDF(DTCM, idf_method):
    """Compute the inverse document frequency of every term in a count matrix.

    Parameters
    ----------
    DTCM : pandas.DataFrame
        Count matrix with one row per bag (document) and one column per term.
    idf_method : str
        Which IDF variant to compute, with N = number of rows in DTCM and
        DF = number of rows in which the term's count is greater than 0:
        'standard' - log10(N / DF)
        'max'      - log10(max(DF) / DF)
        'smooth'   - log10((1 + N) / (1 + DF)) + 1

    Returns
    -------
    pandas.Series
        One IDF value per column of DTCM. For 'standard' and 'max', a term with
        DF == 0 yields inf.

    Raises
    ------
    ValueError
        If idf_method is not one of the names listed above.
    """
    # Document frequency: number of bags in which each term occurs at least once.
    # Summing the boolean mask avoids building a NaN-masked copy of the matrix.
    DF = (DTCM > 0).sum()
    N = DTCM.shape[0]  # number of bags (rows)

    if idf_method == 'standard':
        IDF = np.log10(N / DF)
    elif idf_method == 'max':
        IDF = np.log10(DF.max() / DF)
    elif idf_method == 'smooth':
        IDF = np.log10((1 + N) / (1 + DF)) + 1
    else:
        raise ValueError(
            "Unknown idf_method %r; expected one of 'standard', 'max', 'smooth'"
            % (idf_method,)
        )

    return IDF

In [51]:
# Preview the IDF values for the configured idf_method; the result is displayed, not stored.
compute_IDF(DTCM, idf_method).head()

term_id
1    0.823909
2    1.000000
3    1.301030
4    1.301030
5    1.301030
dtype: float64

In [48]:
#compute TF-IDF (term frequency–inverse document frequency)
def compute_TFIDF(TF, IDF):
    """Return the TF-IDF weights, i.e. the product ``TF * IDF``.

    With TF as a bags x terms DataFrame and IDF as a Series indexed by term,
    pandas aligns IDF with TF's columns, so each term's column is scaled by
    that term's IDF.
    """
    return TF * IDF

In [47]:
# Build the TF and IDF inputs explicitly: earlier cells only display them, so TF and
# IDF would otherwise be undefined on a fresh kernel (Restart & Run All).
TF = compute_TF(tf_method, DTCM)
IDF = compute_IDF(DTCM, idf_method)
TFIDF = compute_TFIDF(TF, IDF)
TFIDF.head()

term_id,1,2,3,4,5,6,7,8,9,10,...,52115,52116,52117,52118,52119,52120,52121,52122,52123,52124
text_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.1e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.3e-05,1.3e-05,1.3e-05,1.3e-05,0.0,0.0,1.3e-05,1.3e-05,2.5e-05
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
