##LDA

In [1]:
text = '''
Hey John, how are you doing today?
David? Is that you? Haven't seen you in a while. Where's Mary?
She's at the garden club meeting, John. Remember? We talked about it last week.
Last week? Did we? Feels like ages ago. Mind you, everything feels like ages ago these days.
I know it's tough, John. But that's why I'm here. Brought you some of your favorites – apples and that oatcake you like.
Oatcake! Now you're talking.Mary always hides them, says I eat them all at once.
Well, maybe I can convince her to let you have a little more freedom with the snacks today.  So, what have you been up to, champ?
This… this thing. Can't remember what it's called. Used to play chess all the time with… with… 
Your dad? You and your dad used to have epic chess battles. Remember how you'd always try to trick him with that sneaky queen move?
Dad? Right! Dad. Used to love playing with him. He always beat me, though. Smart man, your dad.
He was a sharp one, that's for sure. Hey, listen, how about we set up a game? Maybe I can get schooled by the master himself?
Master? Hardly. But… a game sounds good. Been a while since I've played. Remember the rules, though?
Of course, John. How about we play for the oatcake? Winner takes all?
Winner takes all? You're on! Don't underestimate the old dog, David.
Wait… where's the knight?
Right here, John. Next to your king. See? Remember, the knight moves in an L-shape.
Ah, the knight. Right. Used to love using the knight for surprise attacks. Like a sneaky little soldier.
My, look at the time! Mary will be home soon.
Don't worry about Mary, John. We still have a few moves left in us, don't we?  Besides, who knows, maybe you can use that sneaky knight to win the oatcake after all.
'''

In [2]:
!pip install gensim




[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize, sent_tokenize
df_sf = pd.read_csv('stylometric_features6.csv')


In [4]:
def lemmatize_stemming(text):
    wnl = WordNetLemmatizer()
    lem_word = wnl.lemmatize(text)
    ps = PorterStemmer()
    lem_stemmed_word = ps.stem(lem_word)
    return lem_word

def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS:
            result.append(lemmatize_stemming(token))
    return result

In [5]:
print(len(word_tokenize(text)))

418


In [6]:
import re
def remove_newlines(text):
    cleaned_text = re.sub(r'\n', '', text)
    return cleaned_text

In [7]:
text = remove_newlines(text)

In [8]:
sents = sent_tokenize(text)

In [9]:
print(len(sents))


41


In [10]:
sents[2]

"Haven't seen you in a while."

###For first text

In [11]:
# Preprocess the text data
texts = [preprocess(text) for text in sents]

combined_data = pd.DataFrame({'Text': texts})

In [12]:
combined_data['Text']

0                             [hey, john, today, david]
1                                                    []
2                                         [haven, seen]
3                   [mary, garden, club, meeting, john]
4                                            [remember]
5                                  [talked, week, week]
6                                                    []
7                                [feel, like, age, ago]
8     [mind, feel, like, age, ago, day, know, tough,...
9                                                    []
10    [brought, favorite, apple, oatcake, like, oatc...
11    [talking, mary, hide, say, eat, maybe, convinc...
12                                       [champ, thing]
13                                   [remember, called]
14                             [play, chess, time, dad]
15                           [dad, epic, chess, battle]
16           [remember, try, trick, sneaky, queen, dad]
17                                              

In [13]:
dictionary = gensim.corpora.Dictionary(combined_data['Text'])

In [14]:
print(dictionary)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

Dictionary<89 unique tokens: ['david', 'hey', 'john', 'today', 'haven']...>
0 david
1 hey
2 john
3 today
4 haven
5 seen
6 club
7 garden
8 mary
9 meeting
10 remember


In [15]:
bow_corpus = [dictionary.doc2bow(doc) for doc in combined_data['Text']]

In [16]:
bow_doc = bow_corpus[1]

for i in range(len(bow_doc)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc[i][0],
                                                     dictionary[bow_doc[i][0]],
                                                     bow_doc[i][1]))

In [17]:
lda_model = gensim.models.LdaMulticore(corpus=bow_corpus, num_topics=10, id2word = dictionary, chunksize=20000, passes=2, workers = 2)

In [18]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

Topic: 0 
Words: 0.155*"dad" + 0.045*"remember" + 0.045*"play" + 0.045*"try" + 0.045*"sneaky" + 0.045*"smart" + 0.045*"queen" + 0.045*"trick" + 0.045*"sure" + 0.045*"chess"


Topic: 1 
Words: 0.141*"right" + 0.074*"left" + 0.074*"seen" + 0.074*"move" + 0.074*"haven" + 0.007*"remember" + 0.007*"dad" + 0.007*"king" + 0.007*"john" + 0.007*"beat"


Topic: 2 
Words: 0.070*"week" + 0.037*"mary" + 0.037*"little" + 0.037*"today" + 0.037*"maybe" + 0.037*"hide" + 0.037*"convince" + 0.037*"talking" + 0.037*"let" + 0.037*"eat"


Topic: 3 
Words: 0.088*"played" + 0.085*"ve" + 0.036*"dad" + 0.009*"right" + 0.009*"remember" + 0.009*"king" + 0.009*"beat" + 0.009*"john" + 0.009*"love" + 0.009*"oatcake"


Topic: 4 
Words: 0.105*"take" + 0.105*"winner" + 0.055*"john" + 0.055*"hey" + 0.055*"today" + 0.055*"david" + 0.055*"playing" + 0.055*"love" + 0.020*"like" + 0.020*"ago"


Topic: 5 
Words: 0.107*"knight" + 0.073*"mary" + 0.038*"move" + 0.038*"remember" + 0.038*"john" + 0.038*"love" + 0.038*"worry" + 0.

###For second text

In [19]:
# Preprocess the text data
texts = [preprocess(text) for text in sents2]

combined_data = pd.DataFrame({'Text': texts})

NameError: name 'sents2' is not defined

In [None]:
combined_data['Text']

In [None]:
dictionary = gensim.corpora.Dictionary(combined_data['Text'])

: 

In [None]:
print(dictionary)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

: 

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in combined_data['Text']]

: 

In [None]:
bow_doc = bow_corpus[1]

for i in range(len(bow_doc)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc[i][0],
                                                     dictionary[bow_doc[i][0]],
                                                     bow_doc[i][1]))

: 

In [None]:
lda_model2 = gensim.models.LdaMulticore(corpus=bow_corpus, num_topics=10, id2word = dictionary, chunksize=20000, passes=2, workers = 2)

: 

In [None]:
for idx, topic in lda_model2.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")

: 

: 

In [None]:
print(lda_model.print_topics(-1))
print(lda_model2.print_topics(-1))

: 

In [20]:
pip install pyldavis

Collecting pyldavis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
                                              0.0/2.6 MB ? eta -:--:--
     ----                                     0.3/2.6 MB 5.9 MB/s eta 0:00:01
     ---------                                0.6/2.6 MB 6.4 MB/s eta 0:00:01
     ----------------                         1.1/2.6 MB 7.6 MB/s eta 0:00:01
     ----------------------                   1.5/2.6 MB 8.5 MB/s eta 0:00:01
     ---------------------------              1.8/2.6 MB 7.7 MB/s eta 0:00:01
     --------------------------------         2.1/2.6 MB 7.5 MB/s eta 0:00:01
     -------------------------------------    2.4/2.6 MB 7.3 MB/s eta 0:00:01
     ---------------------------------------- 2.6/2.6 MB 7.2 MB/s eta 0:00:00
Collecting numpy>=1.24.2 (from pyldavis)
  Using cached numpy-1.26.4-cp310-cp310-win_amd64.whl (15.8 MB)
Collecting numexpr (from pyldavis)
  Downloading numexpr-2.10.0-cp310-cp310-win_amd64.whl (97 kB)
                           

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\Urmi\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\~-mpy\\.libs\\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
import os

In [22]:
num_topics=10

In [23]:
import pyLDAvis.gensim
import pickle
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('./results/ldavis_prepared_'+str(num_topics))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model2, bow_corpus, dictionary)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './results/ldavis_prepared_'+ str(num_topics) +'.html')
LDAvis_prepared

ModuleNotFoundError: No module named 'pyLDAvis'

###Jaccard similarity

In [None]:
topics_1 = []
for idx, topic in lda_model.print_topics(-1):
  split1 = topic.split('\"')
  topics_1.append(set([split1[val] for val in range(1,21,2)]))
topics_1

: 

In [None]:
topics_2 = []
for idx, topic in lda_model2.print_topics(-1):
  split1 = topic.split('\"')
  topics_2.append(set([split1[val] for val in range(1,21,2)]))
topics_2

: 

: 

In [None]:
def jaccard_similarity(vec1, vec2):
    intersection = set(vec1) & set(vec2)
    union = set(vec1) | set(vec2)
    similarity = len(intersection) / len(union)
    return similarity

: 

In [None]:
similarity_matrix = []
for i in topics_1:
  temp = []
  for j in topics_2:
    sim = jaccard_similarity(i,j)
    temp.append(sim)
  similarity_matrix.append(temp)
similarity_matrix

: 