In [5]:
import pandas as pd

# read the data
df = pd.read_csv("Data/instagramcommentdataset.csv")

# see the data
print(df.head())
print(df.columns)


                                                Text    Sentiment  \
0   Just finished an amazing workout! ðŸ’ª       ...   Positive     
1   Trying out a new recipe for dinner tonight.  ...   Neutral      
2   The new movie release is a must-watch!       ...   Positive     
3   Just published a new blog post. Check it out!...   Positive     
4   New year, new fitness goals! ðŸ’ª            ...   Positive     

          Timestamp            User     Platform  \
0  15-01-2023 15:45   FitnessFan      Instagram    
1  15-01-2023 19:55   ChefCook        Instagram    
2  16-01-2023 19:30   MovieBuff       Instagram    
3  17-01-2023 15:15   BloggerX        Instagram    
4  18-01-2023 18:00   FitJourney      Instagram    

                                     Hashtags  Retweets  Likes       Country  \
0   #Fitness #Workout                                20     40   USA           
1   #Cooking #Food                                   12     25    Australia    
2    #MovieNight #MustWatch     

In [6]:
from transformers import pipeline

sentiment_model = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment"  # یا "distilbert-base-uncased-finetuned-sst-2-english"
)


Device set to use cpu


In [7]:
sentiment_model("I really love this phone, it is amazing!")


[{'label': '5 stars', 'score': 0.9496434330940247}]

In [8]:
import math

texts = df['Text'].astype(str).tolist()
batch_size = 32

labels = []

for i in range(0, len(texts), batch_size):
    batch = texts[i:i+batch_size]
    preds = sentiment_model(batch)
    labels.extend(preds)

# تبدیل خروجی مدل به دو ستون: label و score
df['bert_sentiment_label'] = [x['label'] for x in labels]
df['bert_sentiment_score'] = [x['score'] for x in labels]

# نگاهی به چند سطر
df[['Text', 'bert_sentiment_label', 'bert_sentiment_score']].head()


Unnamed: 0,Text,bert_sentiment_label,bert_sentiment_score
0,Just finished an amazing workout! ðŸ’ª ...,5 stars,0.859115
1,Trying out a new recipe for dinner tonight. ...,4 stars,0.360886
2,The new movie release is a must-watch! ...,5 stars,0.861503
3,Just published a new blog post. Check it out!...,1 star,0.301508
4,"New year, new fitness goals! ðŸ’ª ...",5 stars,0.780261


In [9]:
def stars_to_polarity(label):
    # label مثل "1 star" یا "5 stars"
    n = int(label.split()[0])
    if n <= 2:
        return "negative"
    elif n == 3:
        return "neutral"
    else:
        return "positive"

df['bert_sentiment_polarity'] = df['bert_sentiment_label'].apply(stars_to_polarity)


In [10]:
df['bert_sentiment_polarity'].value_counts()


bert_sentiment_polarity
positive    216
negative     35
neutral       7
Name: count, dtype: int64

we want to find the best hashtag for ech comments

In [11]:
df['Hashtags'] = df['Hashtags'].fillna('').astype(str)
df['Hashtags_lower'] = df['Hashtags'].str.lower()


In [12]:
def filter_by_hashtag(df, hashtag):
    hashtag = hashtag.lower()
    mask = df['Hashtags_lower'].str.contains(rf'\b{hashtag}\b', na=False)
    return df[mask]

df_newyear = filter_by_hashtag(df, '#newyear')
print(df_newyear.head())


Empty DataFrame
Columns: [Text, Sentiment, Timestamp, User, Platform, Hashtags, Retweets, Likes, Country, Year, Month, Day, Hour, bert_sentiment_label, bert_sentiment_score, bert_sentiment_polarity, Hashtags_lower]
Index: []


In [13]:
print(df.head())

                                                Text    Sentiment  \
0   Just finished an amazing workout! ðŸ’ª       ...   Positive     
1   Trying out a new recipe for dinner tonight.  ...   Neutral      
2   The new movie release is a must-watch!       ...   Positive     
3   Just published a new blog post. Check it out!...   Positive     
4   New year, new fitness goals! ðŸ’ª            ...   Positive     

          Timestamp            User     Platform  \
0  15-01-2023 15:45   FitnessFan      Instagram    
1  15-01-2023 19:55   ChefCook        Instagram    
2  16-01-2023 19:30   MovieBuff       Instagram    
3  17-01-2023 15:15   BloggerX        Instagram    
4  18-01-2023 18:00   FitJourney      Instagram    

                                     Hashtags  Retweets  Likes       Country  \
0   #Fitness #Workout                                20     40   USA           
1   #Cooking #Food                                   12     25    Australia    
2    #MovieNight #MustWatch     

In [14]:
print("Sample Hashtags values:")
print(df['Hashtags'].head(20))

print("\nLowercased version:")
print(df['Hashtags_lower'].head(20))


Sample Hashtags values:
0          #Fitness #Workout                        
1          #Cooking #Food                           
2           #MovieNight #MustWatch                  
3           #Blogging #NewPost                      
4           #NewYear #FitnessGoals                  
5           #PetAdoption #FurryFriend               
6           #WinterBlues #Mood                      
7             #Productivity #WorkFromHome           
8            #Brunch #Friends                       
9           #Reading #QuietTime                     
10          #RoadTrip #ScenicViews                  
11          #Inspiration #Workshop                  
12            #Music #ConcertNight                  
13          #Gaming #Tournament                     
14             #Accomplished #Success               
15           #BookRelease #FavoriteAuthor           
16           #Cooking #SpecialDinner                
17           #BikeRide #ScenicTrails                
18     #Fear #Thriller

In [15]:
df_newyear = df[df['Hashtags'].str.contains('newyear', case=False, na=False)]
print(df_newyear[['Hashtags', 'Text']].head())


                                    Hashtags  \
4   #NewYear #FitnessGoals                     

                                                Text  
4   New year, new fitness goals! ðŸ’ª            ...  


In [16]:
def filter_by_hashtag(df, hashtag):
    # حذف # از اول برای انعطاف بیشتر: 'newyear' یا '#newyear' هر دو اوکی
    tag = hashtag.lower().lstrip('#')
    
    # مطمئن می‌شویم ستون Hashtags_lower را داریم
    if 'Hashtags_lower' not in df.columns:
        df['Hashtags'] = df['Hashtags'].fillna('').astype(str)
        df['Hashtags_lower'] = df['Hashtags'].str.lower()
    
    # همین که متن شامل newyear باشد کافی است
    mask = df['Hashtags_lower'].str.contains(tag, na=False)
    return df[mask]


In [17]:
df_newyear = filter_by_hashtag(df, '#newyear')
print(df_newyear[['Hashtags', 'Text']].head())


                                    Hashtags  \
4   #NewYear #FitnessGoals                     

                                                Text  
4   New year, new fitness goals! ðŸ’ª            ...  


In [18]:
df_newyear = filter_by_hashtag(df, 'newyear')

print(df_newyear[['Hashtags', 'Text']].head())

                                    Hashtags  \
4   #NewYear #FitnessGoals                     

                                                Text  
4   New year, new fitness goals! ðŸ’ª            ...  


find the sentiment for specific hashtags

In [19]:
df_newyear['bert_sentiment_polarity'].value_counts(normalize=True)


bert_sentiment_polarity
positive    1.0
Name: proportion, dtype: float64

which countries have this hashtag

In [20]:
df_newyear['Country'].value_counts().head(10)


Country
USA            1
Name: count, dtype: int64

when this hashtag was seen

In [21]:
df_newyear.groupby(['Year', 'Month']).size()


Year  Month
2023  1        1
dtype: int64

In [22]:
from keybert import KeyBERT

kw_model = KeyBERT(model='all-MiniLM-L6-v2')  # از sentence-transformers


In [23]:
text = "The camera quality of this phone is amazing, especially in low light."
keywords = kw_model.extract_keywords(
    text,
    keyphrase_ngram_range=(1, 2),  # تک‌کلمه‌ای و دوکلمه‌ای
    stop_words='english',
    top_n=5
)

print(keywords)


[('camera quality', 0.7083), ('quality phone', 0.6794), ('phone amazing', 0.5526), ('camera', 0.4626), ('low light', 0.4621)]


In [24]:
import re

def text_to_hashtags(text, top_n=5):
    if not isinstance(text, str) or text.strip() == '':
        return []

    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=top_n
    )

    tags = []
    for kw, score in keywords:
        # حذف کاراکترهای غیرحرفی/عددی
        kw_clean = re.sub(r'[^a-zA-Z0-9\s]', '', kw)
        if not kw_clean:
            continue
        # حذف فاصله و تبدیل به هشتگ
        tag = '#' + kw_clean.replace(' ', '').lower()
        tags.append(tag)

    # یکتا کردن ترتیب حفظ شود
    tags_unique = list(dict.fromkeys(tags))
    return tags_unique


In [25]:
text_to_hashtags("The camera quality of this phone is amazing, especially in low light.")
# مثال خروجی: ['#cameraquality', '#lowlight', '#phone']


['#cameraquality', '#qualityphone', '#phoneamazing', '#camera', '#lowlight']

In [26]:
df['suggested_hashtags_list'] = df['Text'].apply(text_to_hashtags)

# اگر بخوای به صورت رشته‌ی قابل‌خواندن داشته باشی:
df['suggested_hashtags'] = df['suggested_hashtags_list'].apply(lambda x: ' '.join(x))

df[['Text', 'suggested_hashtags']].head()


Unnamed: 0,Text,suggested_hashtags
0,Just finished an amazing workout! ðŸ’ª ...,#workout #amazingworkout #finishedamazing
1,Trying out a new recipe for dinner tonight. ...,#newrecipe #recipedinner #recipe #dinnertonigh...
2,The new movie release is a must-watch! ...,#newmovie #movierelease #releasewatch #movie #...
3,Just published a new blog post. Check it out!...,#newblog #blogpost #justpublished #blog #publi...
4,"New year, new fitness goals! ðŸ’ª ...",#newfitness #fitnessgoals #fitness #goals #new...


In [27]:
import itertools

def split_hashtags(s):
    if not isinstance(s, str):
        return []
    # جدا کردن بر اساس فاصله و کاما
    parts = re.split(r'[\s,]+', s)
    # فقط مواردی که با # شروع می‌شوند
    return [p.strip().lower() for p in parts if p.strip().startswith('#')]

all_tags = list(itertools.chain.from_iterable(df['Hashtags'].apply(split_hashtags)))
unique_tags = sorted(set(all_tags))

print("Number of unique hashtags:", len(unique_tags))


Number of unique hashtags: 411


In [28]:
from collections import Counter

counter = Counter(all_tags)
candidate_tags = [tag for tag, c in counter.most_common(5)]  # مثلاً ۲۰۰۰ هشتگ برتر


In [29]:
from sentence_transformers import SentenceTransformer
import numpy as np

st_model = SentenceTransformer('all-MiniLM-L6-v2')

# ۱) امبدینگ هشتگ‌ها (فقط یک‌بار)
hashtag_embeddings = st_model.encode(candidate_tags, normalize_embeddings=True)
hashtag_embeddings = np.array(hashtag_embeddings)


In [30]:
def recommend_hashtags_from_vocab(text, top_n=5):
    if not isinstance(text, str) or text.strip() == '':
        return []

    text_emb = st_model.encode([text], normalize_embeddings=True)[0]  # (dim,)
    # شباهت کسینوسی با ضرب داخلی چون نرمال کردیم
    sims = hashtag_embeddings @ text_emb  # shape: (num_tags,)

    # ایندکس‌های top_n
    top_idx = np.argsort(-sims)[:top_n]
    return [candidate_tags[i] for i in top_idx]


In [31]:
recommend_hashtags_from_vocab("The battery life of this phone is amazing", top_n=5)
# خروجی: مثلاً ['#battery', '#smartphone', '#iphone', ...] (بسته به داده‌هایت)


['#enthusiasm', '#curiosity', '#serenity', '#euphoria', '#confusion']

In [32]:
df['suggested_hashtags_vocab'] = df['Text'].apply(
    lambda x: ' '.join(recommend_hashtags_from_vocab(x, top_n=5))
)

df[['Text', 'suggested_hashtags_vocab']].head()


Unnamed: 0,Text,suggested_hashtags_vocab
0,Just finished an amazing workout! ðŸ’ª ...,#enthusiasm #curiosity #euphoria #confusion #s...
1,Trying out a new recipe for dinner tonight. ...,#confusion #enthusiasm #curiosity #euphoria #s...
2,The new movie release is a must-watch! ...,#serenity #confusion #enthusiasm #curiosity #e...
3,Just published a new blog post. Check it out!...,#serenity #euphoria #curiosity #enthusiasm #co...
4,"New year, new fitness goals! ðŸ’ª ...",#enthusiasm #curiosity #euphoria #confusion #s...


In [33]:
# حذف ردیف‌هایی که متن ندارند
df_topic = df[df['Text'].notna()].copy()

# لیست متن‌ها برای ورودی BERTopic
docs = df_topic['Text'].astype(str).tolist()
len(docs)


258

In [34]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# مدل امبدینگ (سریع و خوب برای انگلیسی)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

topic_model = BERTopic(
    embedding_model=embedding_model,   # می‌تونی None هم بگذاری و از پیش‌فرض استفاده کنی
    language="english",
    verbose=True
)

topics, probs = topic_model.fit_transform(docs)


2025-11-13 11:58:17,454 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

2025-11-13 11:58:21,628 - BERTopic - Embedding - Completed ✓
2025-11-13 11:58:21,632 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-13 11:58:40,958 - BERTopic - Dimensionality - Completed ✓
2025-11-13 11:58:40,962 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-13 11:58:40,989 - BERTopic - Cluster - Completed ✓
2025-11-13 11:58:40,999 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-13 11:58:41,043 - BERTopic - Representation - Completed ✓


In [None]:
df_topic['topic'] = topics


In [36]:
df = df.merge(
    df_topic[['Text', 'topic']],
    on='Text',
    how='left'
)


معمولاً ستون‌ها چیزی مثل این‌اند:

Topic → شماره‌ی topic (مثلاً 0،1،2،...)

Count → چندتا داکیومنت توی این topic

Name → چند کلمه‌ی نماینده‌ی اون topic

In [37]:
topic_info = topic_model.get_topic_info()
print(topic_info.head(10))


   Topic  Count                     Name  \
0     -1     39         -1_the_of_in_and   
1      0    110          0_the_of_new_in   
2      1     59       1_of_the_in_echoes   
3      2     38           2_the_of_by_in   
4      3     12  3_concert_at_of_harmony   

                                      Representation  \
0  [the, of, in, and, as, for, to, with, joy, trail]   
1    [the, of, new, in, for, and, to, with, on, art]   
2  [of, the, in, echoes, to, emotions, feeling, e...   
3  [the, of, by, in, sunset, colors, nature, canv...   
4  [concert, at, of, harmony, resonates, tribute,...   

                                 Representative_Docs  
0  [As the movie credits roll, the viewer experie...  
1  [In the realm of fashion, the designer unveils...  
2  [ Wrapped in the cloak of emotional numbness, ...  
3  [ Embraced by the hopeful dawn, a gardener sow...  
4  [Swaying to the reggae vibes of Bob Marley's t...  


In [38]:
topic_model.get_topic(3)


[('concert', 0.1537633934929388),
 ('at', 0.10331566222196363),
 ('of', 0.09756196233852563),
 ('harmony', 0.09645065456472295),
 ('resonates', 0.09645065456472295),
 ('tribute', 0.09645065456472295),
 ('to', 0.09435150245531224),
 ('soul', 0.08408239805879363),
 ('with', 0.0696581886298484),
 ('musicians', 0.06913769513386436)]

In [39]:
df_topic['topic'].value_counts()


topic
 0    110
 1     59
-1     39
 2     38
 3     12
Name: count, dtype: int64

In [40]:
df_topic['topic'].value_counts().sort_index()


topic
-1     39
 0    110
 1     59
 2     38
 3     12
Name: count, dtype: int64

In [41]:
likes_by_topic = df_topic.groupby('topic')['Likes'].mean().sort_values(ascending=False)
print(likes_by_topic.head(10))


topic
 3    55.833333
 2    49.263158
 0    47.736364
-1    44.769231
 1    35.491525
Name: Likes, dtype: float64


In [42]:
retweets_by_topic = df_topic.groupby('topic')['Retweets'].mean().sort_values(ascending=False)
print(retweets_by_topic.head(10))


topic
 3    28.000000
 2    24.605263
 0    23.936364
-1    22.538462
 1    17.762712
Name: Retweets, dtype: float64


In [43]:
topic_model.get_topic(topic_id)


NameError: name 'topic_id' is not defined

In [44]:
sent_by_topic = (
    df_topic
    .groupby(['topic', 'bert_sentiment_polarity'])
    .size()
    .unstack(fill_value=0)
)

print(sent_by_topic.head(10))


bert_sentiment_polarity  negative  neutral  positive
topic                                               
-1                              6        1        32
 0                              9        0       101
 1                             19        5        35
 2                              1        0        37
 3                              0        1        11


In [45]:
sent_by_topic_pct = sent_by_topic.div(sent_by_topic.sum(axis=1), axis=0)
print(sent_by_topic_pct.head(10))


bert_sentiment_polarity  negative   neutral  positive
topic                                                
-1                       0.153846  0.025641  0.820513
 0                       0.081818  0.000000  0.918182
 1                       0.322034  0.084746  0.593220
 2                       0.026316  0.000000  0.973684
 3                       0.000000  0.083333  0.916667


relationship between topics and hashtags

In [46]:
import re
import itertools

def split_hashtags(s):
    if not isinstance(s, str):
        return []
    parts = re.split(r'[\s,]+', s.strip())
    return [p for p in parts if p.startswith('#') and p != '#']

# ساخت یک نسخه‌ی explode شده
df_tags = df_topic.copy()
df_tags['hashtag_list'] = df_tags['Hashtags'].apply(split_hashtags)

df_exploded = df_tags.explode('hashtag_list')


In [47]:
top_tags_per_topic = (
    df_exploded
    .groupby(['topic', 'hashtag_list'])
    .size()
    .reset_index(name='count')
)

# مثلاً top 10 هشتگ برای topic شماره 3:
topic_id = 3
top_tags_topic3 = (
    top_tags_per_topic[top_tags_per_topic['topic'] == topic_id]
    .sort_values('count', ascending=False)
    .head(10)
)
print(top_tags_topic3)


     topic     hashtag_list  count
422      3         #Harmony      2
429      3    #MusicalUnity      2
415      3    #AdeleConcert      1
427      3           #Music      1
435      3       #TeenMusic      1
434      3  #SoulUpliftment      1
433      3    #SeniorVoices      1
432      3    #QueenTribute      1
431      3      #Positivity      1
430      3       #Nostalgia      1


In [48]:
# ============================================
#   FULL TOPIC ANALYSIS REPORT (ONE-BLOCK)
# ============================================

import pandas as pd
import re
from collections import Counter

# -----------------------------
# 1) استخراج کلمات کلیدی هر Topic
# -----------------------------

topic_keywords = {}
for topic_id in topic_model.get_topic_info()['Topic'].tolist():
    if topic_id == -1:
        continue  # حذف outlierها
    words = topic_model.get_topic(topic_id)
    if words is None:
        continue
    topic_keywords[topic_id] = ", ".join([w[0] for w in words[:5]])  # ۵ کلمه کلیدی برتر

# -----------------------------
# 2) میانگین لایک و ریتوییت
# -----------------------------

likes_by_topic = df_topic.groupby('topic')['Likes'].mean()
retweets_by_topic = df_topic.groupby('topic')['Retweets'].mean()

# -----------------------------
# 3) احساس غالب هر Topic
# -----------------------------

dominant_sentiment = (
    df_topic
    .groupby(['topic', 'bert_sentiment_polarity'])
    .size()
    .reset_index(name='count')
)

# تبدیل به احساس غالب
sent_dict = {}
for topic_id in dominant_sentiment['topic'].unique():
    temp = dominant_sentiment[dominant_sentiment['topic'] == topic_id]
    top_row = temp.sort_values('count', ascending=False).iloc[0]
    sent_dict[topic_id] = top_row['bert_sentiment_polarity']

# -----------------------------
# 4) پرتکرارترین هشتگ‌های هر Topic
# -----------------------------

# تبدیل Hashtags به لیست
def split_hashtags(s):
    if not isinstance(s, str):
        return []
    parts = re.split(r'[\s,]+', s.strip())
    return [p for p in parts if p.startswith('#') and len(p) > 1]

df_topic['hashtag_list'] = df_topic['Hashtags'].apply(split_hashtags)

# explode برای شمارش راحت‌تر
df_hash = df_topic.explode('hashtag_list')

# شمارش
top_hashtags = (
    df_hash.groupby(['topic','hashtag_list'])
    .size()
    .reset_index(name='count')
)

top3_tags_per_topic = {}
for topic_id in df_topic['topic'].unique():
    temp = top_hashtags[top_hashtags['topic'] == topic_id]
    if len(temp) == 0:
        top3_tags_per_topic[topic_id] = ""
    else:
        top3 = temp.sort_values('count', ascending=False)['hashtag_list'].head(3)
        top3_tags_per_topic[topic_id] = " ".join(top3.tolist())

# -----------------------------
# 5) ساخت جدول نهایی
# -----------------------------

report = []
for topic_id in sorted(df_topic['topic'].unique()):
    if topic_id == -1:
        continue
    report.append({
        "Topic": topic_id,
        "Top Keywords": topic_keywords.get(topic_id, ""),
        "Avg Likes": round(likes_by_topic.get(topic_id, 0), 2),
        "Avg Retweets": round(retweets_by_topic.get(topic_id, 0), 2),
        "Dominant Sentiment": sent_dict.get(topic_id, ""),
        "Top 3 Hashtags": top3_tags_per_topic.get(topic_id, "")
    })

report_df = pd.DataFrame(report)
report_df = report_df.sort_values("Avg Likes", ascending=False)  # مرتب‌سازی به دلخواه

print("\n===== FINAL TOPIC REPORT =====")
print(report_df)




===== FINAL TOPIC REPORT =====
   Topic                         Top Keywords  Avg Likes  Avg Retweets  \
3      3  concert, at, of, harmony, resonates      55.83         28.00   
2      2              the, of, by, in, sunset      49.26         24.61   
0      0                the, of, new, in, for      47.74         23.94   
1      1              of, the, in, echoes, to      35.49         17.76   

  Dominant Sentiment                        Top 3 Hashtags  
3           positive  #Harmony #MusicalUnity #AdeleConcert  
2           positive            #Serenity #Hopeful #Wonder  
0           positive          #Excitement #Surprise #Pride  
1           positive       #Confusion #Despair #Bitterness  


In [49]:
# ====================================================
#     AUTO TITLE GENERATOR FOR BERTopic Topics
# ====================================================

import pandas as pd

# 1) گرفتن کلمات کلیدی هر Topic
topic_info = topic_model.get_topic_info()
topic_ids = topic_info[topic_info['Topic'] != -1]['Topic'].tolist()

topic_keywords = {}
for topic_id in topic_ids:
    words = topic_model.get_topic(topic_id)
    if words is None:
        continue
    # فقط 3 تا 5 کلمه برتر
    top_words = [w[0] for w in words[:4]]
    topic_keywords[topic_id] = top_words

# 2) تولید عنوان انسانی با ترکیب کلمات
def make_title(words):
    if not words:
        return "Unknown Topic"
    # اگر فقط یک کلمه
    if len(words) == 1:
        return words[0].capitalize()
    # اگر دو کلمه
    if len(words) == 2:
        return f"{words[0].capitalize()} & {words[1].capitalize()}"
    # اگر چند کلمه → دو کلمه‌ی اصلی + یک توضیح
    return f"{words[0].capitalize()} / {words[1].capitalize()} ({words[2].capitalize()})"

topic_titles = {tid: make_title(words) for tid, words in topic_keywords.items()}

# 3) محاسبه میانگین لایک و ریتوییت
likes_by_topic = df_topic.groupby('topic')['Likes'].mean().round(2)
retweets_by_topic = df_topic.groupby('topic')['Retweets'].mean().round(2)

# 4) احساس غالب
sentiment_df = (
    df_topic
    .groupby(['topic', 'bert_sentiment_polarity'])
    .size()
    .reset_index(name='count')
)

dominant_sentiment = {}
for tid in topic_ids:
    temp = sentiment_df[sentiment_df['topic'] == tid]
    if len(temp) == 0:
        dominant_sentiment[tid] = "Unknown"
    else:
        dominant_sentiment[tid] = (
            temp.sort_values('count', ascending=False).iloc[0]['bert_sentiment_polarity']
        )

# 5) پرتکرارترین هشتگ‌های هر Topic
import re

def split_hashtags(s):
    if not isinstance(s, str):
        return []
    parts = re.split(r'[\s,]+', s.strip())
    return [p for p in parts if p.startswith('#')]

df_topic['hashtag_list'] = df_topic['Hashtags'].apply(split_hashtags)
df_hash = df_topic.explode('hashtag_list')

top_tags = (
    df_hash.groupby(['topic','hashtag_list'])
    .size()
    .reset_index(name='count')
)

top_3_tags = {}
for tid in topic_ids:
    temp = top_tags[top_tags['topic'] == tid]
    if len(temp) == 0:
        top_3_tags[tid] = ""
    else:
        tags = temp.sort_values('count', ascending=False)['hashtag_list'].head(3)
        top_3_tags[tid] = " ".join(tags.tolist())

# 6) ساخت جدول نهایی
report = []
for tid in topic_ids:
    report.append({
        "Topic ID": tid,
        "Auto Title": topic_titles.get(tid, ""),
        "Keywords": ", ".join(topic_keywords.get(tid, [])),
        "Avg Likes": likes_by_topic.get(tid, 0),
        "Avg Retweets": retweets_by_topic.get(tid, 0),
        "Dominant Sentiment": dominant_sentiment.get(tid, ""),
        "Top Hashtags": top_3_tags.get(tid, "")
    })

topic_report = pd.DataFrame(report)
topic_report = topic_report.sort_values("Avg Likes", ascending=False)

print("\n====== AUTO-GENERATED TOPIC REPORT ======")
print(topic_report)



   Topic ID         Auto Title                  Keywords  Avg Likes  \
3         3  Concert / At (Of)  concert, at, of, harmony      55.83   
2         2      The / Of (By)           the, of, by, in      49.26   
0         0     The / Of (New)          the, of, new, in      47.74   
1         1      Of / The (In)       of, the, in, echoes      35.49   

   Avg Retweets Dominant Sentiment                          Top Hashtags  
3         28.00           positive  #Harmony #MusicalUnity #AdeleConcert  
2         24.61           positive            #Serenity #Hopeful #Wonder  
0         23.94           positive          #Excitement #Surprise #Pride  
1         17.76           positive       #Confusion #Despair #Bitterness  


In [50]:
# =======================================================
# BETTER AUTO TITLES FOR TOPICS (Stopwords Removed)
# =======================================================

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk import pos_tag

# اگر دانلود نکردی:
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')

STOPWORDS = set(stopwords.words('english'))

def clean_keywords(words):
    cleaned = []
    for w in words:
        w_clean = re.sub(r'[^a-zA-Z]', '', w).lower()
        if len(w_clean) < 3: continue
        if w_clean in STOPWORDS: continue
        cleaned.append(w_clean)
    return cleaned

def select_main_words(words):
    # POS tagging
    tagged = pos_tag(words)
    # فقط noun و verb مهم
    filtered = [w for w, tag in tagged if tag.startswith('NN') or tag.startswith('VB')]
    return filtered if filtered else words

def make_better_title(words):
    if not words:
        return "Unknown Topic"
    words = clean_keywords(words)
    words = select_main_words(words)
    if not words:
        return "Unnamed Topic"

    if len(words) == 1:
        return words[0].capitalize()
    if len(words) == 2:
        return f"{words[0].capitalize()} & {words[1].capitalize()}"
    return f"{words[0].capitalize()} {words[1].capitalize()}"

# ------------------------------
# استخراج عناوین جدید
# ------------------------------

topic_titles_better = {}

for topic_id in topic_info['Topic'].tolist():
    if topic_id == -1:
        continue
    words_raw = topic_model.get_topic(topic_id)
    if words_raw is None:
        continue
    top_words = [w[0] for w in words_raw[:10]]
    topic_titles_better[topic_id] = make_better_title(top_words)

# نمایش نتایج
for tid, title in topic_titles_better.items():
    print(f"Topic {tid}: {title}")


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nastaran\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nastaran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nastaran\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Topic 0: Art
Topic 1: Echoes Emotions
Topic 2: Sunset Colors
Topic 3: Concert Harmony


In [51]:
print(df['Hashtags'].head())


0     #Fitness #Workout                        
1     #Cooking #Food                           
2      #MovieNight #MustWatch                  
3      #Blogging #NewPost                      
4      #NewYear #FitnessGoals                  
Name: Hashtags, dtype: object
