<a href="https://colab.research.google.com/github/Immortal-sage/hello-world/blob/master/Sugar_cosmetics_social_media_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

nltk.download('punkt')

# Read the descriptions from the file
file_path = '/content/Sugar_cosmetics_followers_Description.csv'  # path of the file
description_column = 'Description'  # the name of the description column

df = pd.read_csv(file_path)
descriptions = df[description_column].tolist()

# Clean and preprocess the descriptions
preprocessed_docs = []
for doc in descriptions:
    tokens = nltk.word_tokenize(doc.lower())
    preprocessed_docs.append(" ".join(tokens))

# Create the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_docs)

# Perform topic modeling using Latent Dirichlet Allocation (LDA)
num_topics = 5  # hyperparameter
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_model.fit(tfidf_matrix)

# Get the top words for each topic
feature_names = tfidf_vectorizer.get_feature_names_out()

def get_top_words(topic_idx, n_top_words):
    return [feature_names[i] for i in lda_model.components_[topic_idx].argsort()[:-n_top_words - 1:-1]]

# Extract the dominant topic for each description
df['dominant_topic'] = df[description_column].apply(lambda doc: lda_model.transform(tfidf_vectorizer.transform([doc])).argmax())

# Print the dominant topics and their top words
for topic_idx in range(num_topics):
    top_words = get_top_words(topic_idx, n_top_words=5)
    print(f"Cluster {topic_idx}: {', '.join(top_words)}")
    print(f"Number of descriptions: {len(df[df['dominant_topic'] == topic_idx])}")
    print("Sample Descriptions:")
    sample_descriptions = df[df['dominant_topic'] == topic_idx][description_column].head()
    for description in sample_descriptions:
        print(f"- {description}")
    print()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Cluster 0: xf0, x9f, xd8, xe2, xd9
Number of descriptions: 11376
Sample Descriptions:
- b''
- b''
- b''
- b''
- b"\xe2\x84\x93\xce\xb9\xce\xbd\xce\xb9\xce\xb7g \xd0\xbc\xd1\x83 \xe2\x88\x82\xd1\x8f\xd1\x94\xce\xb1\xd0\xbc\xe2\x9d\xa4\xef\xb8\x8f SuperBoy\xf0\x9f\x90\xac\xf0\x9f\x8c\xb8\xf0\x9f\x92\x98\xce\xb9 \xce\xb1\xd0\xbc self-em \xf0\x9f\x92\x98\xce\xb9 \xc6\x92\xd1\x94\xd1\x94\xe2\x84\x93 \xe2\x84\x93\xce\xb9\xd0\xba\xd1\x94 \xd0\xbc\xd1\x83 \xd1\x8f\xcf\x83\xe2\x84\x93\xd1\x94 \xce\xb9\xce\xb7 \xd0\xbc\xd1\x83 \xc6\x92\xce\xb1\xd0\xbc\xce\xb9\xe2\x84\x93\xd1\x83 \xce\xb9\xd1\x95 \xce\xb9\xd0\xbc\xcf\x81\xcf\x83\xd1\x8f\xd1\x82\xce\xb1\xce\xb7\xd1\x82, \xce\xb9 \xc6\x92\xd1\x94\xd1\x94\xe2\x84\x93 \xe2\x84\x93\xce\xb9\xd0\xba\xd1\x94 \xce\xb9 \xd0\xba\xce\xb9\xce\xb7\xe2\x88\x82 \xcf\x83\xc6\x92 \xcf\x81\xcf\x85\xd1\x82 \xd1\x82\xd0\xbd\xd1\x94 \xc6\x92\xcf\x85\xce\xb7 \xce\xb1\xce\xb7\xe2\x88\x82 \xd7\xa0\xcf\x83\xd1\x83 \xce\xb9\xce\xb7 \xce\xb9'\xd0\xbc \xce\xb7\xcf\x83\xd1\x8