<a href="https://colab.research.google.com/github/MonaFaghfouri/Topic_Modeling/blob/main/Topic_Modeling_Word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages named on the command line into the notebook runtime.
!pip install pandas openpyxl gensim scikit-learn matplotlib
# Upgrade and force-reinstall numpy, pandas, gensim and openpyxl.
# NOTE(review): presumably a workaround for a numpy/gensim binary mismatch in the
# hosted runtime; a runtime restart may be required afterwards — confirm.
!pip install --upgrade --force-reinstall numpy pandas gensim openpyxl

In [None]:
# 📁 1. Upload Excel File
from google.colab import files
import ast
import pandas as pd

# The result of files.upload() is iterated below to obtain the uploaded file name.
uploaded = files.upload()

# 📊 2. Read and Prepare Data
# Only the first uploaded file is read.
first_upload = next(iter(uploaded))
df = pd.read_excel(first_upload)

# Each 'Text' cell is parsed as a Python literal (expected: a list of tokens)
# and the tokens are joined into one space-separated document.
token_lists = df['Text'].astype(str).apply(ast.literal_eval)
df['lemmatized_tweet'] = token_lists.apply(' '.join)

# 🧠 3. Import Required Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
import numpy as np

# 🔍 4. TF-IDF Vectorization
# Unigrams and bigrams, capped at 2000 features.
corpus = df['lemmatized_tweet']
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=2000)
tfidf_matrix = vectorizer.fit_transform(corpus)

# 🔁 5. Grid Search to Optimize DBSCAN Parameters
# Every (eps, min_samples) pair is scored with the silhouette coefficient of
# its non-noise points. The best-scoring pair is stored in best_eps /
# best_min_samples; both stay None when no pair yields at least two clusters.
best_score = -1
best_eps = None
best_min_samples = None

print("🔍 Searching for the best amount of eps و min_samples ...")

for eps in np.arange(0.1, 1.2, 0.1):
    for min_samples in range(3, 15):
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(tfidf_matrix)

        # Points labelled -1 (noise) are excluded from scoring.
        mask = labels != -1
        if len(set(labels[mask])) < 2:
            # The silhouette score needs at least two clusters.
            continue

        # Only the scoring call is guarded. The search stays best-effort, but
        # the reason for a skipped combination is reported instead of being
        # swallowed, and KeyboardInterrupt is no longer caught, so the search
        # can be interrupted.
        try:
            score = silhouette_score(tfidf_matrix[mask], labels[mask])
        except Exception as err:
            print(f"eps={eps:.1f}, min_samples={min_samples}: skipped ({err})")
            continue

        print(f"eps={eps:.1f}, min_samples={min_samples}, silhouette={score:.4f}")
        if score > best_score:
            best_score = score
            best_eps = eps
            best_min_samples = min_samples

# ✅ 6. Final Clustering with Best Parameters
# best_eps / best_min_samples are still None when the grid search found no
# parameter pair with at least two clusters. Fail with a clear message instead
# of announcing a "best combination" and crashing inside DBSCAN.
if best_eps is None or best_min_samples is None:
    raise ValueError(
        "❌ No (eps, min_samples) combination produced at least two clusters."
    )

print("\n✅ The best combination was found:")
print(f"Best eps = {best_eps}, Best min_samples = {best_min_samples}, Silhouette Score = {best_score:.4f}")

dbscan_final = DBSCAN(eps=best_eps, min_samples=best_min_samples)
final_labels = dbscan_final.fit_predict(tfidf_matrix)

# 🧼 7. Filter Non-Noise Data
# Rows labelled -1 (noise) are dropped; the remaining rows are densified for
# the metrics below.
mask = final_labels != -1
filtered_labels = final_labels[mask]
tfidf_dense = tfidf_matrix[mask].toarray()

# 📏 8. Final Evaluation
# Both metrics are only computed when more than one cluster remains.
if len(set(filtered_labels)) > 1:
    silhouette_final = silhouette_score(tfidf_dense, filtered_labels)
    db_index = davies_bouldin_score(tfidf_dense, filtered_labels)
    print(f"\n📈 Final Silhouette Score: {silhouette_final:.4f}")
    print(f"📉 Davies-Bouldin Index: {db_index:.4f}")
else:
    print("⚠️ Clustering was not successful (only noise or a single cluster).")


In [None]:
# 📁 1. Upload Excel File
from google.colab import files
import ast
import pandas as pd

# The result of files.upload() is iterated below to obtain the uploaded file name.
uploaded = files.upload()

# 📊 2. Read and Prepare Data
# Only the first uploaded file is read.
first_upload = next(iter(uploaded))
df = pd.read_excel(first_upload)

# Each 'Text' cell is parsed as a Python literal (expected: a list of tokens)
# and the tokens are joined into one space-separated document.
token_lists = df['Text'].astype(str).apply(ast.literal_eval)
df['lemmatized_tweet'] = token_lists.apply(' '.join)

# 🧠 3. Import Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns
import numpy as np

# 🔍 4. TF-IDF Vectorization
# Unigrams and bigrams, capped at 2000 features.
corpus = df['lemmatized_tweet']
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=2000)
tfidf_matrix = vectorizer.fit_transform(corpus)

# 🧩 5. DBSCAN Clustering
# eps and min_samples are fixed for this run.
dbscan = DBSCAN(min_samples=3, eps=0.3)
labels = dbscan.fit_predict(tfidf_matrix)

# 🧼 6. Filter Non-Noise Data
# Rows labelled -1 (noise) are dropped; the rest keep their cluster id in a
# new 'cluster' column of an independent copy.
mask = labels != -1
filtered_df = df.loc[mask].copy()
filtered_df['cluster'] = labels[mask]

# 🧠 7. Extract Top Words Per Cluster
# For every cluster id (ascending) collect the five most frequent
# whitespace-separated tokens across that cluster's documents.
cluster_tokens = {}
for cluster_id in sorted(filtered_df['cluster'].unique()):
    in_cluster = filtered_df['cluster'] == cluster_id
    cluster_docs = filtered_df.loc[in_cluster, 'lemmatized_tweet']
    token_counts = Counter(' '.join(cluster_docs).split())
    cluster_tokens[cluster_id] = [token for token, _ in token_counts.most_common(5)]

# 🖨️ 8. Print Topics
print("\n🧠 Best Topics (with eps=0.3):")
for cluster_id, words in cluster_tokens.items():
    print(f"Topic {cluster_id}: {'، '.join(words)}")

# 📉 9. DBSCAN Cluster Visualization
print("\n📊 Generating Cluster Visualization...")

# Project the densified TF-IDF matrix onto two principal components.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(tfidf_matrix.toarray())

# Only non-noise points are drawn, coloured by their cluster label.
clustered_points = reduced_data[mask]
plt.figure(figsize=(7, 6))
plt.scatter(
    clustered_points[:, 0],
    clustered_points[:, 1],
    c=labels[mask],
    cmap='viridis',
    s=15,
)
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.title("DBSCAN Cluster Visualization")
plt.grid(True)

# The figure is written to disk and closed rather than shown inline.
plt.savefig("dbscan_clusters.png", dpi=300)
plt.close()

# 📊 10. Cluster Size Distribution (Readable)
plt.figure(figsize=(14, 6))  # enlarge the figure

# Number of non-noise points per cluster, ordered by cluster id.
non_noise_labels = pd.Series(labels[mask])
cluster_counts = non_noise_labels.value_counts().sort_index()
sns.barplot(x=cluster_counts.index, y=cluster_counts.values, width=0.7)

plt.title("Cluster Size Distribution", fontsize=14)
plt.xlabel("Cluster", fontsize=12)
plt.ylabel("Number of Data Points", fontsize=12)
plt.yticks(fontsize=10)
plt.xticks(rotation=90, fontsize=10)
plt.tight_layout()
plt.grid(axis='y')

# The figure is written to disk and closed rather than shown inline.
plt.savefig("cluster_sizes_readable.png", dpi=300)
plt.close()

# 📏 Silhouette Score
# Computed on non-noise points only, and only when more than one cluster exists.
non_noise_cluster_ids = labels[mask]
if len(set(non_noise_cluster_ids)) <= 1:
    print("\n⚠️ Not enough clusters to compute Silhouette Score.")
else:
    silhouette = silhouette_score(tfidf_matrix[mask], non_noise_cluster_ids)
    print(f"\n📈 Silhouette Score (excluding noise): {silhouette:.4f}")

# 📥 11. Download Saved Figures
# Both PNG files written by the plotting steps are sent to the browser.
for figure_path in ("dbscan_clusters.png", "cluster_sizes_readable.png"):
    files.download(figure_path)


In [None]:
# 1. Upload files
# Two files are expected: a word-list workbook whose file name contains "word"
# and a workbook with the texts (they are told apart by name further below).
from google.colab import files
uploaded = files.upload()

# 2. Libraries
import pandas as pd
import ast
import re
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np

# 3. Identify filtered word list file and text file
# The word list is the first upload whose name contains "word"
# (case-insensitive); the text file is the first remaining upload.
# Missing files raise a descriptive error instead of a bare IndexError.
file_names = list(uploaded.keys())

word_list_candidates = [f for f in file_names if 'word' in f.lower()]
if not word_list_candidates:
    raise ValueError(
        "❌ No uploaded file name contains 'word'; cannot identify the filtered word list file."
    )
sample_words_file = word_list_candidates[0]

text_candidates = [f for f in file_names if f != sample_words_file]
if not text_candidates:
    raise ValueError(
        "❌ Only the word list file was uploaded; a second file containing the texts is required."
    )
text_file = text_candidates[0]

# 4. Word cleaning function
# Matches every character that is neither a word character nor whitespace.
_NON_WORD_CHARS = re.compile(r'[^\w\s]')


def clean_word(word: str) -> str:
    """Normalize a token so it can be matched against the filtered word list.

    The token is lower-cased, every character that is neither a word
    character nor whitespace is removed, and surrounding whitespace is
    trimmed.

    Trimming happens after the punctuation is removed. Trimming first left
    stray whitespace behind whenever punctuation was separated from the word
    by a space (``"price !"`` became ``"price "``), which then failed the
    exact-match lookup in the word set.

    Args:
        word: The raw token.

    Returns:
        The normalized token; may be the empty string.
    """
    return _NON_WORD_CHARS.sub('', word.lower()).strip()

# 5. Read and clean filtered words
# The first column of the word-list workbook holds the words; blanks are
# dropped and every entry is normalized with clean_word before
# de-duplication.
df_words = pd.read_excel(sample_words_file)
economic_words = set(df_words.iloc[:, 0].dropna().astype(str).apply(clean_word))

# A cell containing only punctuation/whitespace cleans to ''. Leaving '' in the
# set would let every token that also cleans to '' pass the filter as an empty
# "word", so it is removed.
economic_words.discard('')

# 6. Read and process text file
# The second column of the text workbook is read; each cell is parsed as a
# Python literal and is expected to be a list of tokens.
df_texts = pd.read_excel(text_file)
texts_raw = df_texts.iloc[:, 1]

texts = []
skipped_rows = 0
for val in texts_raw:
    # Only the exceptions ast.literal_eval documents for malformed input are
    # caught, so unrelated failures (and KeyboardInterrupt) are not swallowed.
    try:
        parsed = ast.literal_eval(str(val))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        skipped_rows += 1
        continue

    if not isinstance(parsed, list):
        skipped_rows += 1
        continue

    # Clean each token once and keep only tokens present in the word list.
    # A non-string token makes clean_word fail; as before, such a row is
    # skipped as a whole, but it is now counted.
    try:
        cleaned = [w for w in map(clean_word, parsed) if w in economic_words]
    except (AttributeError, TypeError):
        skipped_rows += 1
        continue

    if cleaned:
        texts.append(cleaned)

if skipped_rows:
    print(f"⚠️ Skipped {skipped_rows} rows that could not be parsed into a token list or cleaned.")

print(f"📊 Number of documents containing filtered words: {len(texts)}")
if not texts:
    raise ValueError("❌ No documents contain the filtered words.")

# 7. Train Word2Vec model
print("🧠 Training Word2Vec model...")

# Hyper-parameters are collected in one place.
# NOTE(review): gensim's documentation says a fixed seed only gives fully
# reproducible results with a single worker thread — confirm whether
# workers=4 together with seed=42 is intended.
w2v_params = dict(vector_size=100, window=5, min_count=2, workers=4, sg=1, seed=42)
w2v_model = Word2Vec(sentences=texts, **w2v_params)
word_vectors = w2v_model.wv

vocab_size = len(word_vectors.index_to_key)
print(f"🔢 Vocabulary size: {vocab_size}")

# 8. Extract word vectors for clustering
# One row of X per vocabulary word, in the order of valid_words.
valid_words = list(word_vectors.index_to_key)
if not valid_words:
    raise ValueError("❌ The Word2Vec vocabulary is empty; nothing to cluster.")
X = np.array([word_vectors[word] for word in valid_words])

# 9. Cluster word vectors
n_clusters = 10  # You can adjust this
if len(valid_words) < n_clusters:
    # KMeans cannot fit more clusters than there are samples, so the cluster
    # count is capped at the vocabulary size instead of crashing.
    print(
        f"⚠️ Only {len(valid_words)} words in the vocabulary; "
        f"reducing the number of clusters from {n_clusters} to {len(valid_words)}."
    )
    n_clusters = len(valid_words)

print(f"🔍 Clustering words into {n_clusters} topics...")
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)

# 10. Show top words in each cluster
print("\n🧠 Word clusters based on Word2Vec similarity:")

# Every cluster id gets an entry up front, so clusters without words still print.
clustered_words = {cluster_id: [] for cluster_id in range(n_clusters)}
for position, cluster_id in enumerate(labels):
    clustered_words[cluster_id].append(valid_words[position])

# Only the first 15 words of each cluster (in vocabulary order) are shown.
for cluster_id, members in clustered_words.items():
    preview = ', '.join(members[:15])
    print(f"Cluster {cluster_id}: {preview}")
