### Topic Modeling & LDA
- find word topics by author & text

#### Gender & Era
discuss....

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
import plotly_express as px
%matplotlib inline

In [2]:
#hide warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
n_terms = 8000
n_topics = 35
max_iter = 5
OHCO = ['title', 'author', 'para_num', 'gender', 'era']

In [4]:
#import tables
TOKENS = pd.read_csv('TOKEN2.csv')

In [5]:
TOKENS.head()

Unnamed: 0,text_id,para_num,sent_num,token_num,pos_tuple,pos,token_str,term_str,term_id,title,author,file
0,1,16,0,0,"('Every', 'DT')",DT,Every,every,15890,NicomachaenEthics,Aristotle,philostexts\Aristotle_NicomachaenEthics-1.txt
1,1,16,0,1,"('art', 'NN')",NN,art,art,2942,NicomachaenEthics,Aristotle,philostexts\Aristotle_NicomachaenEthics-1.txt
2,1,16,0,2,"('and', 'CC')",CC,and,and,1976,NicomachaenEthics,Aristotle,philostexts\Aristotle_NicomachaenEthics-1.txt
3,1,16,0,3,"('every', 'DT')",DT,every,every,15890,NicomachaenEthics,Aristotle,philostexts\Aristotle_NicomachaenEthics-1.txt
4,1,16,0,4,"('inquiry,', 'NN')",NN,"inquiry,",inquiry,23364,NicomachaenEthics,Aristotle,philostexts\Aristotle_NicomachaenEthics-1.txt


In [6]:
#create GENDER column in tokens table 
TOKENS.loc[TOKENS.author.str.contains('Aristotle'), 'gender'] = 'M'
TOKENS.loc[TOKENS.author.str.contains('bellhooks'), 'gender'] = 'F'
TOKENS.loc[TOKENS.author.str.contains('Cicero'), 'gender'] = 'M'
TOKENS.loc[TOKENS.author.str.contains('Foucault'), 'gender'] = 'M'
TOKENS.loc[TOKENS.author.str.contains('Freire'), 'gender'] = 'M'
TOKENS.loc[TOKENS.author.str.contains('HannahArendt'), 'gender'] = 'F'
TOKENS.loc[TOKENS.author.str.contains('HarrietTaylorMill'), 'gender'] = 'F'
TOKENS.loc[TOKENS.author.str.contains('Hesse'), 'gender'] = 'M'
TOKENS.loc[TOKENS.author.str.contains('Hume'), 'gender'] = 'M'
TOKENS.loc[TOKENS.author.str.contains('Kant'), 'gender'] = 'M'
TOKENS.loc[TOKENS.author.str.contains('Kierkegaard'), 'gender'] = 'M'
TOKENS.loc[TOKENS.author.str.contains('Laozi'), 'gender'] = 'M'
TOKENS.loc[TOKENS.author.str.contains('Marx'), 'gender'] = 'M'
TOKENS.loc[TOKENS.author.str.contains('MaryWollstonecraft'), 'gender'] = 'F'
TOKENS.loc[TOKENS.author.str.contains('Nietzsche'), 'gender'] = 'M'
TOKENS.loc[TOKENS.author.str.contains('Plato'), 'gender'] = 'M'
TOKENS.loc[TOKENS.author.str.contains('Simonedebeauvoir'), 'gender'] = 'F'
TOKENS.loc[TOKENS.author.str.contains('StuartMill'), 'gender'] = 'M'

In [7]:
#create ERA column in tokens table - ancient, classical, modern
TOKENS.loc[TOKENS.author.str.contains('Aristotle'), 'era'] = 'ancient'
TOKENS.loc[TOKENS.author.str.contains('bellhooks'), 'era'] = 'modern'
TOKENS.loc[TOKENS.author.str.contains('Cicero'), 'era'] = 'ancient'
TOKENS.loc[TOKENS.author.str.contains('Foucault'), 'era'] = 'modern'
TOKENS.loc[TOKENS.author.str.contains('Freire'), 'era'] = 'modern'
TOKENS.loc[TOKENS.author.str.contains('HannahArendt'), 'era'] = 'modern'
TOKENS.loc[TOKENS.author.str.contains('HarrietTaylorMill'), 'era'] = 'classical'
TOKENS.loc[TOKENS.author.str.contains('Hesse'), 'era'] = 'modern'
TOKENS.loc[TOKENS.author.str.contains('Hume'), 'era'] = 'classical'
TOKENS.loc[TOKENS.author.str.contains('Kant'), 'era'] = 'classical'
TOKENS.loc[TOKENS.author.str.contains('Kierkegaard'), 'era'] = 'modern'
TOKENS.loc[TOKENS.author.str.contains('Laozi'), 'era'] = 'ancient'
TOKENS.loc[TOKENS.author.str.contains('Marx'), 'era'] = 'classical'
TOKENS.loc[TOKENS.author.str.contains('MaryWollstonecraft'), 'era'] = 'classical'
TOKENS.loc[TOKENS.author.str.contains('Nietzsche'), 'era'] = 'classical'
TOKENS.loc[TOKENS.author.str.contains('Plato'), 'era'] = 'ancient'
TOKENS.loc[TOKENS.author.str.contains('Simonedebeauvoir'), 'era'] = 'modern'
TOKENS.loc[TOKENS.author.str.contains('StuartMill'), 'era'] = 'classical'

In [8]:
TOKENS

Unnamed: 0,text_id,para_num,sent_num,token_num,pos_tuple,pos,token_str,term_str,term_id,title,author,file,gender,era
0,1,16,0,0,"('Every', 'DT')",DT,Every,every,15890,NicomachaenEthics,Aristotle,philostexts\Aristotle_NicomachaenEthics-1.txt,M,ancient
1,1,16,0,1,"('art', 'NN')",NN,art,art,2942,NicomachaenEthics,Aristotle,philostexts\Aristotle_NicomachaenEthics-1.txt,M,ancient
2,1,16,0,2,"('and', 'CC')",CC,and,and,1976,NicomachaenEthics,Aristotle,philostexts\Aristotle_NicomachaenEthics-1.txt,M,ancient
3,1,16,0,3,"('every', 'DT')",DT,every,every,15890,NicomachaenEthics,Aristotle,philostexts\Aristotle_NicomachaenEthics-1.txt,M,ancient
4,1,16,0,4,"('inquiry,', 'NN')",NN,"inquiry,",inquiry,23364,NicomachaenEthics,Aristotle,philostexts\Aristotle_NicomachaenEthics-1.txt,M,ancient
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1742815,3,179,0,7,"('working', 'VBG')",VBG,working,working,49741,FeministClassStruggle,bellhooks,philostexts\bellhooks_FeministClassStruggle-3.txt,F,modern
1742816,3,179,0,8,"('class', 'NN')",NN,class,class,7384,FeministClassStruggle,bellhooks,philostexts\bellhooks_FeministClassStruggle-3.txt,F,modern
1742817,3,179,0,9,"('and', 'CC')",CC,and,and,1976,FeministClassStruggle,bellhooks,philostexts\bellhooks_FeministClassStruggle-3.txt,F,modern
1742818,3,179,0,10,"('poor', 'JJ')",JJ,poor,poor,34409,FeministClassStruggle,bellhooks,philostexts\bellhooks_FeministClassStruggle-3.txt,F,modern


In [9]:
#Convert TOKENS to table of paragraphs
#group by paragraph string and set indexes
PARAS = TOKENS[TOKENS.pos.str.match(r'^NNS?$')]\
    .groupby(OHCO).term_str\
    .apply(lambda x: ' '.join(x))\
    .to_frame()\
    .rename(columns={'term_str':'para_str'})
PARAS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,para_str
title,author,para_num,gender,era,Unnamed: 5_level_1
AVindicationOfTheRightsofMen,MaryWollstonecraft,75,F,classical,attention transient topic day
AVindicationOfTheRightsofMen,MaryWollstonecraft,76,F,classical,amusement indignation
AVindicationOfTheRightsofMen,MaryWollstonecraft,77,F,classical,arguments moment me shape
AVindicationOfTheRightsofMen,MaryWollstonecraft,78,F,classical,feelings sense
AVindicationOfTheRightsofMen,MaryWollstonecraft,80,F,classical,pages letter effusions moment


In [10]:
'''#Convert TOKENS to table of texts
#group by text string and set indexes
TEXTS = TOKENS[TOKENS.pos.str.match(r'^NNS?$')]\
    .groupby(OHCO[:1]).term_str\
    .apply(lambda x: ' '.join(x))\
    .to_frame()\
    .rename(columns={'term_str':'text_str'})
TEXTS.head()'''

"#Convert TOKENS to table of texts\n#group by text string and set indexes\nTEXTS = TOKENS[TOKENS.pos.str.match(r'^NNS?$')]    .groupby(OHCO[:1]).term_str    .apply(lambda x: ' '.join(x))    .to_frame()    .rename(columns={'term_str':'text_str'})\nTEXTS.head()"

#### Create Vector Space 


In [11]:
##Vector Space
#Use Scikit Learn's CountVectorizer to convert F1 corpus of paragraphs
#into a document-term vector space of word counts.
tfv = CountVectorizer(max_features=n_terms, stop_words='english')
tf = tfv.fit_transform(PARAS.text_str)
TERMS = tfv.get_feature_names_out()

NameError: name 'TEXTS' is not defined

In [None]:
'''##Vector Space for Paragraph Level
#Use Scikit Learn's CountVectorizer to convert F1 corpus of paragraphs
#into a document-term vector space of word counts.
tfv_para = CountVectorizer(max_features=n_terms, stop_words='english')
tf_para = tfv_para.fit_transform(PARAS.para_str)
TERMS_para = tfv_para.get_feature_names_out()'''

In [None]:
'''##Vector Space for TEXT Level
#Use Scikit Learn's CountVectorizer to convert F1 corpus of paragraphs
#into a document-term vector space of word counts.
tfv_text = CountVectorizer(max_features=n_terms, stop_words='english')
tf_text = tfv_text.fit_transform(TEXTS.text_str)
TERMS_text = tfv_text.get_feature_names_out()'''

#### Generate LDA Model

In [None]:
#Generate LDA model
#SUse cikit Learn's LatentDirichletAllocation algorithm and extract the THETA and PHI tables.
lda = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=50., random_state=0)

#### THETA

In [None]:
#THETA for texts
THETA = pd.DataFrame(lda.fit_transform(tf), index=PARAS.index)
THETA.columns.name = 'topic_id'
THETA.sample(10).style.background_gradient()

In [None]:
#THETA for paragraphs
'''THETA_para = pd.DataFrame(lda.fit_transform(tf_para), index=PARAS.index)
THETA_para.columns.name = 'topic_id'
THETA_para.sample(10).style.background_gradient()'''

In [None]:
#THETA for texts
'''THETA_text = pd.DataFrame(lda.fit_transform(tf_text), index=TEXTS.index)
THETA_text.columns.name = 'topic_id'
THETA_text.sample(10).style.background_gradient()'''

#### PHI

In [None]:
#PHI 
PHI = pd.DataFrame(lda.components_, columns=TERMS)
PHI.index.name = 'topic_id'
PHI.columns.name  = 'term_str'
PHI.T.head().style.background_gradient()

In [None]:
#PHI for paragraphs
'''PHI_para = pd.DataFrame(lda.components_, columns=TERMS)
PHI_para.index.name = 'topic_id'
PHI_para.columns.name  = 'term_str'
PHI_para.T.head().style.background_gradient()'''

In [None]:
#PHI for texts
'''PHI_text = pd.DataFrame(lda.components_, columns=TERMS)
PHI_text.index.name = 'topic_id'
PHI_text.columns.name  = 'term_str'
PHI_text.T.head().style.background_gradient()'''

#### Top Terms per Topic

In [None]:
TOPICS = PHI.stack().to_frame().rename(columns={0:'weight'})\
    .groupby('topic_id')\
    .apply(lambda x: 
           x.weight.sort_values(ascending=False)\
               .head(10)\
               .reset_index()\
               .drop('topic_id',1)\
               .term_str)
TOPICS

In [None]:
#TOPICS_para['label'] = TOPICS_para.apply(lambda x: str(x.name) + ' ' + ' '.join(x), 1)

In [None]:
'''TOPICS_para = PHI_para.stack().to_frame().rename(columns={0:'weight'})\
    .groupby('topic_id')\
    .apply(lambda x: 
           x.weight.sort_values(ascending=False)\
               .head(10)\
               .reset_index()\
               .drop('topic_id',1)\
               .term_str)
TOPICS_para'''

In [None]:
TOPICS['label'] = TOPICS.apply(lambda x: str(x.name) + ' ' + ' '.join(x), 1)

In [None]:
#TOPICS_para['label'] = TOPICS_para.apply(lambda x: str(x.name) + ' ' + ' '.join(x), 1)

In [None]:
'''TOPICS_text = PHI_text.stack().to_frame().rename(columns={0:'weight'})\
    .groupby('topic_id')\
    .apply(lambda x: 
           x.weight.sort_values(ascending=False)\
               .head(10)\
               .reset_index()\
               .drop('topic_id',1)\
               .term_str)
TOPICS_text'''

In [None]:
#TOPICS_text['label'] = TOPICS_para.apply(lambda x: str(x.name) + ' ' + ' '.join(x), 1)

In [None]:
#TOPICS_para.label

In [None]:
#TOPICS_text.label

#### Sort Topics by Doc Weight

In [None]:
TOPICS['doc_weight_sum'] = THETA.sum()
TOPICS.sort_values('doc_weight_sum', 
                   ascending=True).plot.barh(y='doc_weight_sum', x='label', 
                                             figsize=(5,10)) 

In [None]:
#TOPICS_para['doc_weight_sum'] = THETA_para.sum()
#TOPICS_para.sort_values('doc_weight_sum', 
#                   ascending=True).plot.barh(y='doc_weight_sum',
#                                             x='label', 
#                                             figsize=(5,10)) 

In [None]:
#TOPICS_text['doc_weight_sum'] = THETA_text.sum()
#TOPICS_text.sort_values('doc_weight_sum', 
#                   ascending=True).plot.barh(y='doc_weight_sum',
#                                             x='label', 
#                                             figsize=(5,10)) 

#### Cluster Topics

In [None]:
def plot_tree(tree, labels):
    plt.figure()
    fig, axes = plt.subplots(figsize=(5, 10))
    dendrogram = sch.dendrogram(tree, labels=labels, orientation="left")
    plt.tick_params(axis='both', which='major', labelsize=14)

In [None]:
SIMS = pdist(normalize(PHI), metric='euclidean')
TREE = sch.linkage(SIMS, method='ward')

TOPICS['topterms'] = TOPICS[[i for i in range(10)]].apply(lambda x: ' '.join(x), 1)
labels  = ["{}: {}".format(a,b) for a, b in zip(TOPICS.index,  TOPICS.topterms.tolist())]
plot_tree(TREE, labels)

### Explore Topics by Gender

In [None]:
topic_cols = [t for t in range(n_topics)]
GENDER = THETA.groupby('gender')[topic_cols].mean().T                                            
GENDER.index.name = 'topic_id'
GENDER.T

In [None]:
GENDER['topterms'] = TOPICS[[i for i in range(10)]].apply(lambda x: ' '.join(x), 1)
GENDER.sort_values('F', ascending=False).style.background_gradient()

In [None]:
GENDER.sort_values('M', ascending=False).style.background_gradient()

In [None]:
px.scatter(GENDER.reset_index(), 'M', 'F', hover_name='topterms', text='topic_id')\
    .update_traces(mode='text')

#### Cluster

In [None]:
labels  = ["{}: {}".format(a,b) for a, b in zip(GENDER.index,  GENDER.topterms.tolist())]
plot_tree(TREE, labels)

#### Analysis
....

### Explore Topics by Era

In [None]:
ERA = THETA.groupby('era')[topic_cols].mean().T                                            
ERA.index.name = 'topic_id'
ERA.T

In [None]:
ERA['topterms'] = TOPICS[[i for i in range(10)]].apply(lambda x: ' '.join(x), 1)
ERA.sort_values('modern', ascending=False).style.background_gradient()

In [None]:
ERA.sort_values('ancient', ascending=False).style.background_gradient()

In [None]:
ERA.sort_values('classical', ascending=False).style.background_gradient()

In [None]:
#scatter plot looking at distribution of topics of ancient and classical texts
px.scatter(ERA.reset_index(), 'ancient', 'classical', hover_name='topterms', text='topic_id')\
    .update_traces(mode='text')

In [None]:
#scatter plot looking at distribution of topics of classical and modern texts
px.scatter(ERA.reset_index(), 'classical', 'modern', hover_name='topterms', text='topic_id')\
    .update_traces(mode='text')

In [None]:
#scatter plot looking at distribution of topics of ancient and modern texts
px.scatter(ERA.reset_index(), 'ancient', 'modern', hover_name='topterms', text='topic_id')\
    .update_traces(mode='text')

In [None]:
labels  = ["{}: {}".format(a,b) for a, b in zip(ERA.index,  ERA.topterms.tolist())]
plot_tree(TREE, labels)
