**Content:**


1.   Basic clustering
2.   Basic clustering + POS tags and NER tags
3.   Topic Modeling LDA + evaluating with Coherence




In [None]:
from google.colab import drive


# Mount Google Drive at /content/drive so its files are reachable from this Colab session
drive.mount('/content/drive')

Mounted at /content/drive


**1) BASIC CLUSTERING - TEXTS**

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from nltk.corpus import stopwords
import nltk
# Download the NLTK stop-word corpus that stopwords.words('english') reads from
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Sample data (replace with your dataset)
documents = [
    "this is the first sentence",
    "this sentence is similar to the first sentence",
    "this is yet another sentence",
    "this is a different kind of sentence",
    "and this is a completely different topic",
    "but this one is similar to the previous topic",
    "and here is something unrelated"
]

# TF-IDF vectorization: one row per document, one column per term that is not an English stop word
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(documents)

# KMeans clustering (adjust the number of clusters to suit your data).
# n_init=1 means a single initialisation is tried, so the seed is fixed to make
# the cluster assignment reproducible from run to run.
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, random_state=42)
model.fit(X)

print("Top terms per cluster:")
# argsort is ascending; reversing every row ranks the terms by descending centroid weight
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
    # Show the cluster label and its top 10 terms on a single line
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]
    print("Cluster %d: %s" % (i, ' '.join(top_terms)))

print("\n")
print("Predictions for the documents:")
# X already holds the TF-IDF rows of `documents`, so predict on it directly
# instead of re-vectorizing every document one at a time
for doc, label in zip(documents, model.predict(X)):
    print(f"{doc} => Cluster {label}")



Top terms per cluster:
Cluster 0:
 topic
 unrelated
 something
 completely
 previous
 one
 different
 similar
 yet
 sentence

Cluster 1:
 sentence
 first
 kind
 yet
 another
 different
 similar
 unrelated
 topic
 something



Predictions for the documents:
this is the first sentence => Cluster 1
this sentence is similar to the first sentence => Cluster 1
this is yet another sentence => Cluster 1
this is a different kind of sentence => Cluster 1
and this is a completely different topic => Cluster 0
but this one is similar to the previous topic => Cluster 0
and here is something unrelated => Cluster 0


**2) CLUSTERING - TEXTS + POS TAGs, NER TAGs**

In [None]:
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import FeatureUnion
from nltk.corpus import stopwords
import nltk
# Ensure the NLTK stop-word corpus is present (nothing is re-downloaded when it is already up to date)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# ColumnTransformer routes each feature column (text / POS / NER) to its own vectorizer
from sklearn.compose import ColumnTransformer

# Initialize spaCy
nlp = spacy.load("en_core_web_sm")

def pos_tagger(doc):
    """Return the coarse part-of-speech tag (`token.pos_`) of every token spaCy finds in `doc`."""
    return [token.pos_ for token in nlp(doc)]

def ner_tagger(doc):
    """Return the label (`ent.label_`) of every named entity spaCy finds in `doc`."""
    return [ent.label_ for ent in nlp(doc).ents]

documents = [
    "this is the first sentence",
    "Barack Obama was a president.",
    "this is yet another sentence about New York",
    "Apple is a different kind of company",
    "and this is a completely different topic related to Microsoft",
    "but this one is similar to the previous topic about Amazon",
    "and here is something unrelated in 2022"
]

# Extract features: POS tags and NER tags, each joined into one space-separated string per document
pos_features = [' '.join(pos_tagger(doc)) for doc in documents]
ner_features = [' '.join(ner_tagger(doc)) for doc in documents]

# One row per document, one column per view of that document.
# Concatenating the three lists (documents + pos_features + ner_features) would
# instead produce 3x as many pseudo-documents and feed every vectorizer all of
# them, so words would leak into the POS/NER vocabularies and tags into the text one.
features = pd.DataFrame({
    'text': documents,
    'pos': pos_features,
    'ner': ner_features,
})

# Vectorize using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))

# A string column selector hands the vectorizer a 1-D column of strings, which is what it expects.
# token_pattern=None silences the "token_pattern will not be used" warning raised when a
# custom tokenizer is supplied; lowercase=False keeps the upper-case tag names intact.
combined_features = ColumnTransformer([
    ('text', tfidf_vectorizer, 'text'),
    ('pos', TfidfVectorizer(tokenizer=str.split, lowercase=False, token_pattern=None), 'pos'),
    ('ner', TfidfVectorizer(tokenizer=str.split, lowercase=False, token_pattern=None), 'ner')
])

X = combined_features.fit_transform(features)

# KMeans Clustering
# n_init=1 tries a single initialisation, so the seed is fixed for reproducible clusters
true_k = 3
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, random_state=42)
model.fit(X)

# Display results
print("Top terms per cluster:")
# argsort is ascending; reversing every row ranks the features by descending centroid weight
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = combined_features.get_feature_names_out()
for i in range(true_k):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:  # top 10 terms per cluster
        print(f" {terms[ind]}")
    print()

print("\nPredictions for the documents:")
# Row i of X is the combined text+POS+NER vector of documents[i]
for doc, label in zip(documents, model.predict(X)):
    print(f"{doc} => Cluster {label}")

Top terms per cluster:
Cluster 0:
 pos__is
 ner__is
 pos__this
 ner__this
 text__sentence
 ner__DATE
 text__person
 pos__ORDINAL
 pos__GPE
 pos__DATE

Cluster 1:
 ner__NOUN
 pos__NOUN
 text__noun
 pos__PROPN
 text__propn
 ner__PROPN
 ner__DET
 pos__DET
 text__det
 text__aux

Cluster 2:
 ner__ORG
 pos__ORG
 text__org
 ner__yet
 pos__AUX
 pos__DATE
 pos__CCONJ
 pos__Barack
 pos__Apple
 pos__Amazon


Predictions for the documents:
this is the first sentence => Cluster 0
Barack Obama was a president. => Cluster 0
this is yet another sentence about New York => Cluster 0
Apple is a different kind of company => Cluster 0
and this is a completely different topic related to Microsoft => Cluster 0
but this one is similar to the previous topic about Amazon => Cluster 0
and here is something unrelated in 2022 => Cluster 0




**3) LDA - TOPIC MODELING**

In [None]:
!pip install gensim nltk




In [None]:
!pip install pyLDAvis


Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy>=1.24.2 (from pyLDAvis)
  Downloading numpy-1.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
Collecting pandas>=2.0.0 (from pyLDAvis)
  Downloading pandas-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m64.8 MB/s[0m eta [36m0:00:00[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Collecting tzdata>=2022.1 (from pandas>=2.0.0->pyLDAvis)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [

In [None]:

import gensim
from gensim import corpora
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Download the NLTK stop-word list and WordNet data
nltk.download('stopwords')
nltk.download('wordnet')

# Load the 20 Newsgroups data; `remove` asks scikit-learn to drop headers, footers and quoted text
from sklearn.datasets import fetch_20newsgroups

newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
documents = newsgroups.data

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# 1. Load and Preprocess the Data
newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
documents = newsgroups.data

# Preprocessing tools: regex word tokenizer, WordNet lemmatizer and English stop-word set
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(doc):
    """Convert one raw document into a list of lemmatized tokens.

    The text is lower-cased and split on runs of word characters. Stop words
    and tokens of one or two characters are dropped; every remaining token is
    lemmatized.
    """
    kept = []
    for word in tokenizer.tokenize(doc.lower()):
        if word in stop_words or len(word) <= 2:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return kept

processed_docs = list(map(preprocess, documents))


  and should_run_async(code)


In [None]:

# 2. Prepare Data for LDA
dictionary = corpora.Dictionary(processed_docs)

# Prune the vocabulary before building the corpus: drop tokens that occur in
# fewer than 5 documents or in more than half of them. Without this, rare
# junk tokens can dominate an entire topic and the most common words blur
# the remaining ones.
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Bag-of-words representation: one list of (token_id, count) pairs per document
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]


  and should_run_async(code)


In [None]:

# 3. Apply LDA

from gensim.models import CoherenceModel

# Candidate topic counts to explore: 3, 8, 13 and 18 (start at 3, step 5, stop before 21)
topic_range = list(range(3, 21, 5))

# Lists to store the results; models[i] and coherences[i] belong to topic_range[i]
models = []
coherences = []

for num_topics in topic_range:
    # Apply LDA. The seed is fixed so the coherence scores, and therefore the
    # chosen model, are reproducible across runs.
    lda_model = gensim.models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=42,
    )
    models.append(lda_model)

    # Evaluate using c_v coherence, computed from the tokenized texts
    coherence_model = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary, coherence='c_v')
    coherence = coherence_model.get_coherence()
    coherences.append(coherence)

# Print the coherence scores
for num_topics, coherence in zip(topic_range, coherences):
    print(f"Number of Topics: {num_topics}, Coherence Score: {coherence:.4f}")

# Choose the model with the highest coherence (the first one wins on a tie)
best_coherence = max(coherences)
chosen_model_index = coherences.index(best_coherence)
chosen_model = models[chosen_model_index]
chosen_num_topics = topic_range[chosen_model_index]

print(f"\nBest Model has {chosen_num_topics} topics with a coherence score of {best_coherence:.4f}")

# Inspect Topics for the chosen model
topics = chosen_model.print_topics(num_words=10)
for topic in topics:
    print(topic)



  and should_run_async(code)


Number of Topics: 3, Coherence Score: 0.6001
Number of Topics: 8, Coherence Score: 0.6567
Number of Topics: 13, Coherence Score: 0.6551
Number of Topics: 18, Coherence Score: 0.6286

Best Model has 8 topics with a coherence score of 0.6567
(0, '0.088*"max" + 0.025*"g9v" + 0.017*"b8f" + 0.014*"a86" + 0.011*"1d9" + 0.011*"145" + 0.009*"34u" + 0.008*"bhj" + 0.007*"giz" + 0.007*"75u"')
(1, '0.008*"window" + 0.008*"drive" + 0.007*"system" + 0.006*"one" + 0.006*"use" + 0.006*"problem" + 0.006*"would" + 0.006*"thanks" + 0.006*"card" + 0.006*"know"')
(2, '0.008*"government" + 0.007*"state" + 0.006*"gun" + 0.006*"law" + 0.005*"right" + 0.004*"people" + 0.004*"armenian" + 0.004*"year" + 0.004*"president" + 0.004*"american"')
(3, '0.010*"edu" + 0.010*"space" + 0.005*"com" + 0.004*"nasa" + 0.004*"new" + 0.003*"university" + 0.003*"1993" + 0.003*"center" + 0.003*"satellite" + 0.003*"launch"')
(4, '0.010*"one" + 0.010*"would" + 0.009*"people" + 0.007*"god" + 0.006*"know" + 0.006*"think" + 0.006*"say