In [1]:
pip install numpy scipy scikit-learn gensim nltk pyLDAvis


Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.




[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Step 1: Data Collection
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Fetch the 20 newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all')
documents = newsgroups.data

In [4]:
# Step 2: Text Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [5]:
# Download stopwords and wordnet
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [7]:
def preprocess(text):
    # Remove punctuation, numbers, and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize text
    tokens = text.split()
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

In [8]:
# Preprocess all documents
preprocessed_docs = [preprocess(doc) for doc in documents]

In [None]:
# Step 3: Topic Modeling with LDA
from gensim import corpora
from gensim.models import LdaModel

# Tokenize preprocessed documents
tokenized_docs = [doc.split() for doc in preprocessed_docs]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(tokenized_docs)

# Filter out extreme values
dictionary.filter_extremes(no_below=15, no_above=0.5)

# Create a document-term matrix
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# Apply LDA model
lda_model = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15)

# Print the topics
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}")

# Visualize the topics using pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Prepare visualization
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_vis)

In [13]:
# Step 4: Word Embeddings and Similarity Measurement
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Train Word2Vec model
w2v_model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=2, workers=4)

# Get document vectors by averaging word vectors
def document_vector(doc):
    # Remove out-of-vocabulary words
    doc = [word for word in doc if word in w2v_model.wv.key_to_index]
    # Average the vectors of all words in the document
    return np.mean(w2v_model.wv[doc], axis=0)

# Compute document vectors
doc_vectors = np.array([document_vector(doc) for doc in tokenized_docs])

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(doc_vectors)

# Display similarity between first 5 documents
print(similarity_matrix[:5, :5])

[[1.0000001  0.5529127  0.70223314 0.54379404 0.5820057 ]
 [0.5529127  0.9999999  0.36349177 0.7936087  0.83862364]
 [0.70223314 0.36349177 1.         0.51533675 0.5179916 ]
 [0.54379404 0.7936087  0.51533675 1.0000001  0.8742099 ]
 [0.5820057  0.83862364 0.5179916  0.8742099  0.9999999 ]]


In [15]:
# Extracting the list of words in each topic
topic_words = lda_model.show_topics(num_topics=10, num_words=10, formatted=False)
topics = {}
for topic in topic_words:
    topic_id, words = topic
    topics[topic_id] = [word for word, _ in words]

print("List of Words in Each Topic:")
for topic_id, words in topics.items():
    print(f"Topic {topic_id}: {words}")



List of Words in Each Topic:
Topic 0: ['x', 'file', 'window', 'image', 'program', 'use', 'version', 'application', 'available', 'server']
Topic 1: ['space', 'armenian', 'new', 'turkish', 'center', 'year', 'earth', 'u', 'research', 'may']
Topic 2: ['israel', 'jew', 'israeli', 'arab', 'muslim', 'jewish', 'article', 'right', 'university', 'state']
Topic 3: ['people', 'would', 'one', 'gun', 'dont', 'said', 'u', 'think', 'right', 'know']
Topic 4: ['key', 'government', 'would', 'law', 'use', 'system', 'chip', 'public', 'one', 'u']
Topic 5: ['car', 'article', 'like', 'one', 'get', 'nntppostinghost', 'would', 'dont', 'im', 'good']
Topic 6: ['university', 'nntppostinghost', 'drive', 'system', 'thanks', 'would', 'know', 'email', 'card', 'computer']
Topic 7: ['god', 'one', 'people', 'would', 'say', 'christian', 'think', 'dont', 'know', 'article']
Topic 8: ['would', 'one', 'article', 'use', 'b', 'like', 'dont', 'get', 'time', 'also']
Topic 9: ['game', 'team', 'year', 'player', 'university', 'artic

In [1]:
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Convert LDA topic distribution for each document into a NumPy array
lda_vectors = np.array([lda_model.get_document_topics(doc, minimum_probability=0) for doc in corpus])

# Create a matrix of topic probabilities
lda_vectors = np.array([[topic_prob[1] for topic_prob in doc] for doc in lda_vectors])

# Apply t-SNE for dimensionality reduction
tsne_model = TSNE(n_components=2, random_state=42)
lda_tsne = tsne_model.fit_transform(lda_vectors)

# Plot t-SNE results
plt.figure(figsize=(10, 6))
plt.scatter(lda_tsne[:, 0], lda_tsne[:, 1], alpha=0.7, s=60)
plt.title('t-SNE Visualization of Documents (LDA Vectors)')
plt.show()


NameError: name 'corpus' is not defined

In [None]:
# Step 5: Comparison of Document Similarity
# Similarity using LDA vectors
lda_similarity_matrix = cosine_similarity(lda_vectors)

print("Cosine Similarity Matrix using LDA Vectors (first 5 documents):")
print(lda_similarity_matrix[:5, :5])

# Similarity using Word Embeddings
embedding_similarity_matrix = cosine_similarity(doc_vectors)

print("Cosine Similarity Matrix using Word Embeddings (first 5 documents):")
print(embedding_similarity_matrix[:5, :5])