In [1]:
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary

In [2]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Download 'punkt_tab' for sentence tokenization within word_tokenize
nltk.download('punkt_tab') # This line was added to download the required resource.


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# On seperate states

Arunachal Pradesh

In [4]:
# Load the merged corpus from .pkl file
with open('/content/drive/MyDrive/WM_Project/Corpus_states/Corpus_states_Arunachal_Pradesh_corpus.pkl', 'rb') as file:
    corpus = pickle.load(file)

print(f"Loaded corpus with {len(corpus)} documents.")

Loaded corpus with 1312207 documents.


In [5]:
# Preprocessing steps
# 1. Lowercase text
corpus = [doc.lower() for doc in corpus]

# 2. Remove stop words
stop_words = set(stopwords.words('english'))
# Modified this line to keep the document as a string
corpus = [' '.join([word for word in doc.split() if word not in stop_words]) for doc in corpus]

# 3. Tokenize (if not already done)
corpus = [word_tokenize(doc) for doc in corpus]

# 4. Remove non-alphabetic tokens
corpus = [[word for word in doc if word.isalpha()] for doc in corpus]

# 5. Remove short words
corpus = [[word for word in doc if len(word) > 2] for doc in corpus]

# 6. Lemmatize the words
lemmatizer = WordNetLemmatizer()
corpus = [[lemmatizer.lemmatize(word) for word in doc] for doc in corpus]

# 7. Create dictionary and bag-of-words corpus
dictionary = Dictionary(corpus)

In [6]:
# Create a bag-of-words representation
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

print(f"Dictionary created with {len(dictionary)} unique tokens.")
print(f"Bag-of-words corpus contains {len(bow_corpus)} documents.")

Dictionary created with 8551 unique tokens.
Bag-of-words corpus contains 1312207 documents.


In [8]:
from gensim.models import LdaMulticore

In [10]:
# Apply LDA with Gensim
lda_model = LdaMulticore(bow_corpus, num_topics=7, id2word=dictionary, passes=10, workers=4)

# Print the topics found by LDA
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

Topic 0: 0.230*"exif" + 0.107*"tiff" + 0.086*"photoshop" + 0.040*"xmpmeta" + 0.037*"macintosh" + 0.026*"cr" + 0.020*"wed" + 0.020*"core" + 0.020*"xmp" + 0.018*"column"
Topic 1: 0.105*"xap" + 0.090*"stream" + 0.073*"reference" + 0.073*"linkform" + 0.073*"stmfs" + 0.037*"referencestream" + 0.016*"youtube" + 0.016*"right" + 0.013*"heritage" + 0.013*"top"
Topic 2: 0.255*"xmlns" + 0.227*"http" + 0.112*"arunachal" + 0.016*"home" + 0.015*"culture" + 0.014*"day" + 0.013*"policy" + 0.009*"tribe" + 0.009*"tawang" + 0.009*"mechuka"
Topic 3: 0.218*"stevt" + 0.140*"adobe" + 0.036*"tourism" + 0.033*"circuit" + 0.031*"saved" + 0.031*"district" + 0.024*"state" + 0.017*"apply" + 0.012*"created" + 0.011*"land"
Topic 4: 0.399*"rdf" + 0.065*"seq" + 0.058*"description" + 0.029*"follow" + 0.026*"alt" + 0.026*"india" + 0.024*"valley" + 0.022*"video" + 0.013*"stay" + 0.013*"xml"
Topic 5: 0.147*"xapmm" + 0.070*"stref" + 0.055*"instanceid" + 0.055*"documentid" + 0.053*"resource" + 0.049*"placedresolutionunit" +

In [11]:
# Load the merged corpus from .pkl file
with open('/content/drive/MyDrive/WM_Project/Corpus_states/Corpus_states_Assam_corpus.pkl', 'rb') as file:
    corpus = pickle.load(file)

print(f"Loaded corpus with {len(corpus)} documents.")

Loaded corpus with 28748 documents.


In [12]:
# Preprocessing steps
# 1. Lowercase text
corpus = [doc.lower() for doc in corpus]

# 2. Remove stop words
stop_words = set(stopwords.words('english'))
# Modified this line to keep the document as a string
corpus = [' '.join([word for word in doc.split() if word not in stop_words]) for doc in corpus]

# 3. Tokenize (if not already done)
corpus = [word_tokenize(doc) for doc in corpus]

# 4. Remove non-alphabetic tokens
corpus = [[word for word in doc if word.isalpha()] for doc in corpus]

# 5. Remove short words
corpus = [[word for word in doc if len(word) > 2] for doc in corpus]

# 6. Lemmatize the words
lemmatizer = WordNetLemmatizer()
corpus = [[lemmatizer.lemmatize(word) for word in doc] for doc in corpus]

# 7. Create dictionary and bag-of-words corpus
dictionary = Dictionary(corpus)

In [13]:
# Create a bag-of-words representation
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

print(f"Dictionary created with {len(dictionary)} unique tokens.")
print(f"Bag-of-words corpus contains {len(bow_corpus)} documents.")

Dictionary created with 1576 unique tokens.
Bag-of-words corpus contains 28748 documents.


In [14]:
# Apply LDA with Gensim
lda_model = LdaMulticore(bow_corpus, num_topics=7, id2word=dictionary, passes=10, workers=4)

# Print the topics found by LDA
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

Topic 0: 0.161*"information" + 0.091*"service" + 0.054*"budget" + 0.047*"act" + 0.047*"memorandum" + 0.045*"site" + 0.043*"screen" + 0.043*"festival" + 0.042*"faq" + 0.022*"type"
Topic 1: 0.234*"tourism" + 0.081*"govt" + 0.054*"link" + 0.051*"corporation" + 0.049*"ltd" + 0.048*"use" + 0.048*"detail" + 0.048*"governance" + 0.046*"accessibility" + 0.046*"privacy"
Topic 2: 0.160*"policy" + 0.139*"office" + 0.116*"home" + 0.084*"contact" + 0.040*"statement" + 0.040*"copyright" + 0.039*"skip" + 0.022*"organization" + 0.022*"form" + 0.020*"mission"
Topic 3: 0.131*"content" + 0.125*"portal" + 0.057*"notification" + 0.054*"main" + 0.053*"term" + 0.051*"disclaimer" + 0.050*"hyperlinking" + 0.027*"official" + 0.026*"centre" + 0.025*"key"
Topic 4: 0.278*"assam" + 0.190*"department" + 0.089*"government" + 0.024*"national" + 0.024*"time" + 0.022*"entire" + 0.022*"accessing" + 0.022*"last" + 0.022*"reviewed" + 0.021*"here"
Topic 5: 0.091*"document" + 0.072*"development" + 0.064*"organisation" + 0.05

In [15]:
# Load the merged corpus from .pkl file
with open('/content/drive/MyDrive/WM_Project/Corpus_states/Corpus_states_Goa_corpus.pkl', 'rb') as file:
    corpus = pickle.load(file)

print(f"Loaded corpus with {len(corpus)} documents.")

Loaded corpus with 575649 documents.


In [16]:
# Preprocessing steps
# 1. Lowercase text
corpus = [doc.lower() for doc in corpus]

# 2. Remove stop words
stop_words = set(stopwords.words('english'))
# Modified this line to keep the document as a string
corpus = [' '.join([word for word in doc.split() if word not in stop_words]) for doc in corpus]

# 3. Tokenize (if not already done)
corpus = [word_tokenize(doc) for doc in corpus]

# 4. Remove non-alphabetic tokens
corpus = [[word for word in doc if word.isalpha()] for doc in corpus]

# 5. Remove short words
corpus = [[word for word in doc if len(word) > 2] for doc in corpus]

# 6. Lemmatize the words
lemmatizer = WordNetLemmatizer()
corpus = [[lemmatizer.lemmatize(word) for word in doc] for doc in corpus]

# 7. Create dictionary and bag-of-words corpus
dictionary = Dictionary(corpus)

In [17]:
# Create a bag-of-words representation
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

print(f"Dictionary created with {len(dictionary)} unique tokens.")
print(f"Bag-of-words corpus contains {len(bow_corpus)} documents.")

Dictionary created with 5680 unique tokens.
Bag-of-words corpus contains 575649 documents.


In [18]:
# Apply LDA with Gensim
lda_model = LdaMulticore(bow_corpus, num_topics=7, id2word=dictionary, passes=10, workers=4)

# Print the topics found by LDA
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

Topic 0: 0.074*"stay" + 0.073*"getting" + 0.056*"detail" + 0.055*"law" + 0.047*"service" + 0.040*"culture" + 0.038*"list" + 0.038*"bus" + 0.038*"wellness" + 0.038*"food"
Topic 1: 0.141*"tour" + 0.093*"gtdc" + 0.048*"mile" + 0.047*"vacancy" + 0.038*"old" + 0.037*"regenerative" + 0.035*"booking" + 0.034*"vasco" + 0.034*"mayem" + 0.033*"farmagudi"
Topic 2: 0.082*"contact" + 0.081*"adventure" + 0.066*"view" + 0.046*"panaji" + 0.045*"nature" + 0.045*"empanelment" + 0.044*"calangute" + 0.043*"experiential" + 0.043*"term" + 0.041*"condition"
Topic 3: 0.121*"jetty" + 0.060*"rti" + 0.046*"experience" + 0.044*"margao" + 0.044*"hotel" + 0.043*"airport" + 0.042*"around" + 0.041*"fact" + 0.040*"proposed" + 0.040*"clearance"
Topic 4: 0.301*"residency" + 0.072*"detailed" + 0.039*"festival" + 0.035*"newsletter" + 0.034*"trek" + 0.033*"environmental" + 0.033*"trade" + 0.033*"colva" + 0.033*"explore" + 0.033*"info"
Topic 5: 0.192*"notice" + 0.178*"tourism" + 0.064*"brief" + 0.059*"draft" + 0.034*"time" 

In [19]:
# Load the merged corpus from .pkl file
with open('/content/drive/MyDrive/WM_Project/Corpus_states/Corpus_states_Jammu_and_Kashmir_corpus.pkl', 'rb') as file:
    corpus = pickle.load(file)

print(f"Loaded corpus with {len(corpus)} documents.")

Loaded corpus with 3014088 documents.


In [20]:
# Preprocessing steps
# 1. Lowercase text
corpus = [doc.lower() for doc in corpus]

# 2. Remove stop words
stop_words = set(stopwords.words('english'))
# Modified this line to keep the document as a string
corpus = [' '.join([word for word in doc.split() if word not in stop_words]) for doc in corpus]

# 3. Tokenize (if not already done)
corpus = [word_tokenize(doc) for doc in corpus]

# 4. Remove non-alphabetic tokens
corpus = [[word for word in doc if word.isalpha()] for doc in corpus]

# 5. Remove short words
corpus = [[word for word in doc if len(word) > 2] for doc in corpus]

# 6. Lemmatize the words
lemmatizer = WordNetLemmatizer()
corpus = [[lemmatizer.lemmatize(word) for word in doc] for doc in corpus]

# 7. Create dictionary and bag-of-words corpus
dictionary = Dictionary(corpus)

In [21]:
# Create a bag-of-words representation
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

print(f"Dictionary created with {len(dictionary)} unique tokens.")
print(f"Bag-of-words corpus contains {len(bow_corpus)} documents.")

Dictionary created with 18890 unique tokens.
Bag-of-words corpus contains 3014088 documents.


In [22]:
# Apply LDA with Gensim
lda_model = LdaMulticore(bow_corpus, num_topics=7, id2word=dictionary, passes=10, workers=4)

# Print the topics found by LDA
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Topic 0: 0.080*"obj" + 0.041*"contact" + 0.040*"mandi" + 0.040*"mubarak" + 0.040*"raghunath" + 0.040*"wali" + 0.040*"shayam" + 0.039*"ram" + 0.039*"panjthirthi" + 0.033*"udhampur"
Topic 1: 0.179*"jammu" + 0.052*"utterbehni" + 0.052*"purmandel" + 0.041*"vaishno" + 0.036*"bahu" + 0.035*"mahal" + 0.034*"kalhuri" + 0.029*"samba" + 0.026*"chamliyal" + 0.025*"free"
Topic 2: 0.109*"tourism" + 0.040*"shri" + 0.037*"visit" + 0.035*"akhnoor" + 0.033*"talai" + 0.033*"ranbireshwar" + 0.032*"bawe" + 0.032*"khoh" + 0.026*"ramban" + 0.026*"board"
Topic 3: 0.234*"temple" + 0.102*"district" + 0.088*"fort" + 0.040*"palace" + 0.034*"peer" + 0.034*"rani" + 0.033*"pilgrimage" + 0.027*"poonch" + 0.024*"time" + 0.022*"detail"
Topic 4: 0.080*"devi" + 0.038*"kishtwar" + 0.037*"sardaran" + 0.037*"radhey" + 0.034*"rajouri" + 0.033*"destination" + 0.029*"main" + 0.028*"kashmir" + 0.020*"feel" + 0.020*"pota"
Topic 5: 0.081*"endobj" + 0.042*"balidan" + 0.041*"content" + 0.035*"travel" + 0.033*"kathua" + 0.030*"plac

In [23]:
# Load the merged corpus from .pkl file
with open('/content/drive/MyDrive/WM_Project/Corpus_states/Corpus_states_Karnataka_corpus.pkl', 'rb') as file:
    corpus = pickle.load(file)

print(f"Loaded corpus with {len(corpus)} documents.")

Loaded corpus with 1010120 documents.


In [24]:
# Preprocessing steps
# 1. Lowercase text
corpus = [doc.lower() for doc in corpus]

# 2. Remove stop words
stop_words = set(stopwords.words('english'))
# Modified this line to keep the document as a string
corpus = [' '.join([word for word in doc.split() if word not in stop_words]) for doc in corpus]

# 3. Tokenize (if not already done)
corpus = [word_tokenize(doc) for doc in corpus]

# 4. Remove non-alphabetic tokens
corpus = [[word for word in doc if word.isalpha()] for doc in corpus]

# 5. Remove short words
corpus = [[word for word in doc if len(word) > 2] for doc in corpus]

# 6. Lemmatize the words
lemmatizer = WordNetLemmatizer()
corpus = [[lemmatizer.lemmatize(word) for word in doc] for doc in corpus]

# 7. Create dictionary and bag-of-words corpus
dictionary = Dictionary(corpus)

In [25]:
# Create a bag-of-words representation
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

print(f"Dictionary created with {len(dictionary)} unique tokens.")
print(f"Bag-of-words corpus contains {len(bow_corpus)} documents.")

Dictionary created with 9531 unique tokens.
Bag-of-words corpus contains 1010120 documents.


In [26]:
# Apply LDA with Gensim
lda_model = LdaMulticore(bow_corpus, num_topics=7, id2word=dictionary, passes=10, workers=4)

# Print the topics found by LDA
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

Topic 0: 0.110*"domestic" + 0.075*"district" + 0.074*"accommodation" + 0.057*"home" + 0.057*"park" + 0.056*"trip" + 0.056*"cuisine" + 0.056*"tranquility" + 0.055*"upcoming" + 0.055*"helplines"
Topic 1: 0.070*"plan" + 0.069*"adopt" + 0.068*"southern" + 0.068*"theme" + 0.068*"circuit" + 0.068*"fair" + 0.068*"competition" + 0.068*"virtual" + 0.044*"temple" + 0.028*"december"
Topic 2: 0.212*"download" + 0.212*"brochure" + 0.085*"hill" + 0.072*"package" + 0.037*"national" + 0.018*"kannada" + 0.013*"historical" + 0.013*"fort" + 0.012*"also" + 0.012*"quick"
Topic 3: 0.235*"roadshows" + 0.069*"english" + 0.067*"monument" + 0.062*"adventure" + 0.060*"food" + 0.059*"service" + 0.059*"press" + 0.027*"explore" + 0.024*"visit" + 0.021*"tip"
Topic 4: 0.151*"karnataka" + 0.087*"destination" + 0.085*"blog" + 0.084*"experience" + 0.082*"international" + 0.052*"thing" + 0.045*"heritage" + 0.042*"art" + 0.041*"eco" + 0.041*"photo"
Topic 5: 0.184*"tourism" + 0.119*"tour" + 0.057*"travel" + 0.046*"beach" +

In [27]:
# Load the merged corpus from .pkl file
with open('/content/drive/MyDrive/WM_Project/Corpus_states/Corpus_states_Kerala_corpus.pkl', 'rb') as file:
    corpus = pickle.load(file)

print(f"Loaded corpus with {len(corpus)} documents.")

Loaded corpus with 8424 documents.


In [28]:
# Preprocessing steps
# 1. Lowercase text
corpus = [doc.lower() for doc in corpus]

# 2. Remove stop words
stop_words = set(stopwords.words('english'))
# Modified this line to keep the document as a string
corpus = [' '.join([word for word in doc.split() if word not in stop_words]) for doc in corpus]

# 3. Tokenize (if not already done)
corpus = [word_tokenize(doc) for doc in corpus]

# 4. Remove non-alphabetic tokens
corpus = [[word for word in doc if word.isalpha()] for doc in corpus]

# 5. Remove short words
corpus = [[word for word in doc if len(word) > 2] for doc in corpus]

# 6. Lemmatize the words
lemmatizer = WordNetLemmatizer()
corpus = [[lemmatizer.lemmatize(word) for word in doc] for doc in corpus]

# 7. Create dictionary and bag-of-words corpus
dictionary = Dictionary(corpus)

In [29]:
# Create a bag-of-words representation
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

print(f"Dictionary created with {len(dictionary)} unique tokens.")
print(f"Bag-of-words corpus contains {len(bow_corpus)} documents.")

Dictionary created with 280 unique tokens.
Bag-of-words corpus contains 8424 documents.


In [30]:
# Apply LDA with Gensim
lda_model = LdaMulticore(bow_corpus, num_topics=7, id2word=dictionary, passes=10, workers=4)

# Print the topics found by LDA
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

Topic 0: 0.056*"activity" + 0.056*"place" + 0.056*"backwater" + 0.049*"meet" + 0.049*"history" + 0.042*"inr" + 0.032*"kannur" + 0.025*"feast" + 0.025*"kuthampully" + 0.025*"water"
Topic 1: 0.064*"experience" + 0.059*"website" + 0.050*"hill" + 0.045*"photo" + 0.045*"official" + 0.025*"keralatourism" + 0.020*"riyas" + 0.020*"varkala" + 0.020*"exclusive" + 0.020*"handloom"
Topic 2: 0.277*"kerala" + 0.146*"tourism" + 0.058*"info" + 0.050*"responsible" + 0.038*"newsletter" + 0.034*"eco" + 0.034*"nivas" + 0.026*"government" + 0.017*"recruitment" + 0.017*"thiruvappana"
Topic 3: 0.104*"travel" + 0.059*"care" + 0.056*"trip" + 0.040*"kappa" + 0.039*"yathri" + 0.039*"agency" + 0.039*"beach" + 0.031*"package" + 0.031*"plan" + 0.025*"thing"
Topic 4: 0.069*"stay" + 0.063*"find" + 0.060*"explore" + 0.042*"tradition" + 0.042*"waterfall" + 0.036*"use" + 0.027*"judaism" + 0.027*"mission" + 0.027*"tender" + 0.021*"wallpaper"
Topic 5: 0.067*"video" + 0.067*"ayurveda" + 0.066*"festival" + 0.044*"wildlife" 

In [31]:
# Load the merged corpus from .pkl file
with open('/content/drive/MyDrive/WM_Project/Corpus_states/Corpus_states_Maharashtra_corpus.pkl', 'rb') as file:
    corpus = pickle.load(file)

print(f"Loaded corpus with {len(corpus)} documents.")

Loaded corpus with 644875 documents.


In [32]:
# Preprocessing steps
# 1. Lowercase text
corpus = [doc.lower() for doc in corpus]

# 2. Remove stop words
stop_words = set(stopwords.words('english'))
# Modified this line to keep the document as a string
corpus = [' '.join([word for word in doc.split() if word not in stop_words]) for doc in corpus]

# 3. Tokenize (if not already done)
corpus = [word_tokenize(doc) for doc in corpus]

# 4. Remove non-alphabetic tokens
corpus = [[word for word in doc if word.isalpha()] for doc in corpus]

# 5. Remove short words
corpus = [[word for word in doc if len(word) > 2] for doc in corpus]

# 6. Lemmatize the words
lemmatizer = WordNetLemmatizer()
corpus = [[lemmatizer.lemmatize(word) for word in doc] for doc in corpus]

# 7. Create dictionary and bag-of-words corpus
dictionary = Dictionary(corpus)

In [33]:
# Create a bag-of-words representation
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

print(f"Dictionary created with {len(dictionary)} unique tokens.")
print(f"Bag-of-words corpus contains {len(bow_corpus)} documents.")

Dictionary created with 5097 unique tokens.
Bag-of-words corpus contains 644875 documents.


In [34]:
# Apply LDA with Gensim
lda_model = LdaMulticore(bow_corpus, num_topics=7, id2word=dictionary, passes=10, workers=4)

# Print the topics found by LDA
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

Topic 0: 0.234*"tourism" + 0.056*"beach" + 0.050*"tourist" + 0.043*"mumbai" + 0.036*"solapur" + 0.025*"room" + 0.023*"ihm" + 0.022*"also" + 0.017*"indian" + 0.016*"culture"
Topic 1: 0.151*"policy" + 0.084*"cave" + 0.028*"rural" + 0.025*"known" + 0.022*"dam" + 0.021*"cuisine" + 0.019*"water" + 0.015*"buldhana" + 0.015*"ahmednagar" + 0.015*"dish"
Topic 2: 0.137*"application" + 0.073*"temple" + 0.069*"incentive" + 0.033*"forest" + 0.028*"waterfall" + 0.023*"ministry" + 0.022*"pune" + 0.021*"citizen" + 0.021*"art" + 0.016*"located"
Topic 3: 0.027*"hill" + 0.027*"charter" + 0.019*"hindu" + 0.019*"office" + 0.018*"travel" + 0.016*"cultural" + 0.015*"thane" + 0.015*"visit" + 0.015*"incredible" + 0.015*"palghar"
Topic 4: 0.068*"fort" + 0.046*"india" + 0.022*"booking" + 0.017*"festival" + 0.015*"visitor" + 0.015*"raigad" + 0.015*"popular" + 0.014*"including" + 0.014*"nashik" + 0.011*"bhandara"
Topic 5: 0.125*"maharashtra" + 0.072*"adventure" + 0.070*"wildlife" + 0.046*"medium" + 0.031*"map" + 0

# On full corpus

In [None]:
# Load the merged corpus from .pkl file
with open('/content/drive/MyDrive/WM_Project/Corpus_states/merged_corpus.pkl', 'rb') as file:
    corpus = pickle.load(file)

print(f"Loaded corpus with {len(corpus)} documents.")

Loaded corpus with 6594111 documents.


In [None]:
# Preprocessing steps
# 1. Lowercase text
corpus = [doc.lower() for doc in corpus]

# 2. Remove stop words
stop_words = set(stopwords.words('english'))
# Modified this line to keep the document as a string
corpus = [' '.join([word for word in doc.split() if word not in stop_words]) for doc in corpus]

# 3. Tokenize (if not already done)
corpus = [word_tokenize(doc) for doc in corpus]

# 4. Remove non-alphabetic tokens
corpus = [[word for word in doc if word.isalpha()] for doc in corpus]

# 5. Remove short words
corpus = [[word for word in doc if len(word) > 2] for doc in corpus]

# 6. Lemmatize the words
lemmatizer = WordNetLemmatizer()
corpus = [[lemmatizer.lemmatize(word) for word in doc] for doc in corpus]

# 7. Create dictionary and bag-of-words corpus
dictionary = Dictionary(corpus)

In [None]:
# Create a bag-of-words representation
bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

print(f"Dictionary created with {len(dictionary)} unique tokens.")
print(f"Bag-of-words corpus contains {len(bow_corpus)} documents.")

Dictionary created with 35254 unique tokens.
Bag-of-words corpus contains 6594111 documents.


In [None]:
from gensim.models import LdaMulticore

In [None]:
# Apply LDA with Gensim
lda_model = LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=10)

# Print the topics found by LDA
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")


  perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)


Topic 0: 0.077*"beach" + 0.075*"wildlife" + 0.040*"sanctuary" + 0.026*"festival" + 0.025*"cuisine" + 0.024*"event" + 0.023*"visitor" + 0.022*"popular" + 0.021*"history" + 0.020*"fair"
Topic 1: 0.058*"fort" + 0.058*"cave" + 0.040*"india" + 0.034*"contact" + 0.029*"museum" + 0.022*"home" + 0.021*"palace" + 0.021*"directorate" + 0.019*"room" + 0.019*"map"
Topic 2: 0.153*"policy" + 0.093*"temple" + 0.069*"adventure" + 0.048*"mumbai" + 0.042*"forest" + 0.031*"booking" + 0.027*"hill" + 0.025*"art" + 0.021*"water" + 0.019*"including"
Topic 3: 0.101*"maharashtra" + 0.041*"medium" + 0.033*"waterfall" + 0.031*"rural" + 0.019*"interest" + 0.017*"hindu" + 0.017*"station" + 0.016*"park" + 0.016*"tender" + 0.015*"maha"
Topic 4: 0.078*"tourism" + 0.046*"application" + 0.022*"incentive" + 0.016*"tourist" + 0.012*"solapur" + 0.009*"ministry" + 0.008*"citizen" + 0.008*"charter" + 0.008*"ihm" + 0.007*"known"


In [None]:
# Apply LDA with Gensim
lda_model = LdaMulticore(bow_corpus, num_topics=6, id2word=dictionary, passes=10, workers=4)

# Print the topics found by LDA
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

Topic 0: 0.062*"maharashtra" + 0.059*"beach" + 0.043*"museum" + 0.038*"hill" + 0.036*"district" + 0.035*"contact" + 0.029*"festival" + 0.028*"plan" + 0.027*"read" + 0.023*"forest"
Topic 1: 0.067*"adventure" + 0.055*"destination" + 0.052*"cave" + 0.047*"travel" + 0.043*"tourist" + 0.032*"heritage" + 0.026*"trip" + 0.026*"monument" + 0.023*"video" + 0.022*"ticket"
Topic 2: 0.094*"event" + 0.079*"download" + 0.079*"brochure" + 0.055*"home" + 0.039*"park" + 0.035*"thing" + 0.030*"form" + 0.030*"new" + 0.028*"opportunity" + 0.028*"latest"
Topic 3: 0.076*"policy" + 0.032*"india" + 0.025*"station" + 0.024*"cuisine" + 0.018*"solapur" + 0.016*"place" + 0.015*"wild" + 0.015*"photo" + 0.015*"spirituality" + 0.014*"sanctuary"
Topic 4: 0.172*"tourism" + 0.069*"application" + 0.047*"tour" + 0.035*"fort" + 0.032*"experience" + 0.027*"international" + 0.027*"domestic" + 0.025*"booking" + 0.024*"wildlife" + 0.021*"mumbai"
Topic 5: 0.067*"roadshows" + 0.063*"karnataka" + 0.055*"temple" + 0.044*"incentiv