<a target="_blank" href="https://colab.research.google.com/github/MohamedTababi-GitH/Social-Media-Analysis-Tool-Predicto/blob/feature/topic-modeling/topics_clustering.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
!pip install ctransformers[cuda] transformers sentence-transformers hdbscan emoji bertopic spacy nltk torch



In [2]:
!pip install --upgrade git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-f61bz6c0
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-f61bz6c0
  Resolved https://github.com/huggingface/transformers to commit baa3b22137d9d47097bd5a17736c0639ecf38e5b
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done


In [3]:
# Download the Q4_K_M-quantized Zephyr-7B-alpha GGUF file into the current
# directory as a regular file (symlinks into the cache are disabled).
# NOTE(review): `setup_topic_analyzer` loads the model by repository id, so it is
# unclear whether this local copy is what actually gets read - confirm.
!huggingface-cli download TheBloke/zephyr-7B-alpha-GGUF zephyr-7b-alpha.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False

zephyr-7b-alpha.Q4_K_M.gguf


In [7]:
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Union
import pandas as pd
import logging
import re
from tqdm import tqdm
from pathlib import Path


# Topics related
from bertopic import BERTopic
from bertopic.representation import TextGeneration
from transformers import AutoTokenizer, pipeline
from ctransformers import AutoModelForCausalLM
import torch
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN

# Text preprocessing
from nltk.corpus import stopwords
import nltk
import spacy
import torch
import emoji

# Database connection
import sqlite3

In [5]:
# Fetch the NLTK data used for stop-word filtering and tokenization.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
@dataclass
class ModelConfig:
    """Settings that ``TopicAnalyzer`` forwards to BERTopic and HDBSCAN."""
    # Passed to BERTopic as `embedding_model`.
    embedding_model: str
    # Passed to BERTopic as `min_topic_size`.
    min_topic_size: int
    # Passed to BERTopic as `nr_topics`.
    nr_topics: int
    # Passed to the HDBSCAN clusterer as `min_cluster_size`.
    min_cluster_size: int
    # Optional BERTopic representation model; None leaves BERTopic's default in place.
    representation_model: Optional[TextGeneration] = None
    # Passed to BERTopic as `calculate_probabilities`.
    calculate_probabilities: bool = True
    # Passed to BERTopic as `verbose`.
    verbose: bool = True

class TextPreprocessor:
    """Turn raw comment text into lowercase, lemmatized, stop-word-free strings.

    Processing happens in two steps: ``clean_text`` strips links, mentions,
    hashtags and every non-letter character, then ``tokenize_and_lemmatize``
    tokenizes with NLTK, drops English stop words and lemmatizes with spaCy.

    Args:
        batch_size: Number of rows handled per progress-bar step in
            ``process_series``.
    """

    # A link is an explicit http(s) scheme or a standalone ``www.`` host.
    # Matching a bare "www" anywhere in the text would also swallow ordinary
    # words such as "awwww".
    _URL_PATTERN = re.compile(r"https?://\S+|\bwww\.\S+", re.IGNORECASE)
    _MENTION_OR_HASHTAG_PATTERN = re.compile(r"@\w+|#\w+")
    _NON_LETTER_PATTERN = re.compile(r"[^a-zA-Z\s]")

    def __init__(self, batch_size: int = 1000):
        # Only lemmas are read from the spaCy docs, so NER and the parser are disabled.
        self.nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
        self.stop_words = set(stopwords.words('english'))
        self.batch_size = batch_size

    def clean_text(self, text: str) -> str:
        """Return ``text`` reduced to lowercase letters and whitespace.

        Emoji are first replaced by their textual names, then links,
        ``@mentions`` and ``#hashtags`` are removed, and finally every
        character that is not an ASCII letter or whitespace is deleted.

        Args:
            text: Raw comment. Anything that is not a ``str`` (for example a
                missing value) is treated as empty.

        Returns:
            The cleaned, lowercased text with surrounding whitespace stripped.
        """
        if not isinstance(text, str):
            return ""
        text = emoji.demojize(text)
        text = self._URL_PATTERN.sub("", text)
        text = self._MENTION_OR_HASHTAG_PATTERN.sub("", text)
        text = self._NON_LETTER_PATTERN.sub("", text)
        return text.lower().strip()

    def tokenize_and_lemmatize(self, text: str) -> str:
        """Tokenize ``text``, drop stop words and return the space-joined lemmas."""
        tokens = [word for word in nltk.word_tokenize(text)
                  if word not in self.stop_words]
        doc = self.nlp(" ".join(tokens))
        return " ".join(token.lemma_ for token in doc
                        if not token.is_punct and not token.is_space)

    def process_batch(self, texts: Union[List[str], pd.Series]) -> List[str]:
        """Clean and lemmatize every item of ``texts``, preserving order."""
        return [self.tokenize_and_lemmatize(self.clean_text(text))
                for text in texts]

    def process_series(self, series: pd.Series) -> pd.Series:
        """Process ``series`` in chunks of ``batch_size`` behind a progress bar.

        Returns:
            A new Series of processed strings that carries the index of
            ``series``.
        """
        # `.iloc` makes the chunking positional regardless of the index type.
        batches = [series.iloc[start:start + self.batch_size]
                   for start in range(0, len(series), self.batch_size)]
        processed: List[str] = []
        for batch in tqdm(batches, desc="Processing text"):
            processed.extend(self.process_batch(batch))
        return pd.Series(processed, index=series.index)


class TopicModelDatabase:
    """Read comments from, and write detected topics back to, a SQLite database.

    NOTE: ``table_name`` arguments are interpolated directly into the SQL text
    because identifiers cannot be bound as parameters. Pass only trusted table
    names.

    Args:
        db_path: Location of the SQLite database file.
    """

    def __init__(self, db_path: Path):
        self.db_path = db_path

    def connect(self):
        """Open and return a new ``sqlite3`` connection to ``db_path``."""
        return sqlite3.connect(self.db_path)

    def fetch_comments(self, table_name: str) -> pd.DataFrame:
        """Return the ``id`` and ``comment`` columns of ``table_name``."""
        conn = self.connect()
        try:
            return pd.read_sql_query(
                f"SELECT id, comment FROM {table_name}", conn)
        finally:
            # Using a sqlite3 connection as a context manager only commits or
            # rolls back; it never closes, so close explicitly.
            conn.close()

    def update_topics(self, df: pd.DataFrame, table_name: str):
        """Copy ``detected_topic`` from ``df`` into ``table_name``, matched on ``id``.

        The rows are staged in a scratch table named ``temp_<table_name>``
        which is removed again whether or not the update succeeds. Rows of
        ``table_name`` whose ``id`` does not appear in ``df`` are left alone.

        Args:
            df: Frame holding at least the columns ``id`` and ``detected_topic``.
            table_name: Table to update; it must have ``id`` and
                ``detected_topic`` columns.
        """
        temp_table = f"temp_{table_name}"
        conn = self.connect()
        try:
            try:
                df[['id', 'detected_topic']].to_sql(
                    temp_table, conn, if_exists='replace', index=False)

                # `with conn` commits the UPDATE on success and rolls it back on error.
                with conn:
                    conn.execute(f"""
                        UPDATE {table_name}
                        SET detected_topic = (
                            SELECT detected_topic
                            FROM {temp_table}
                            WHERE {table_name}.id = {temp_table}.id
                        )
                        WHERE EXISTS (
                            SELECT 1
                            FROM {temp_table}
                            WHERE {table_name}.id = {temp_table}.id
                        );
                    """)
            finally:
                # Runs after the UPDATE transaction has ended, so a rollback
                # cannot resurrect the scratch table.
                with conn:
                    conn.execute(f"DROP TABLE IF EXISTS {temp_table}")
        finally:
            conn.close()


class TopicAnalyzer:
    """Preprocess documents and fit a BERTopic model described by a ``ModelConfig``.

    Args:
        config: Settings forwarded to BERTopic and to its HDBSCAN clusterer.
    """

    def __init__(self, config: ModelConfig):
        self.config = config
        self.preprocessor = TextPreprocessor()
        self.logger = self._setup_logger()

        clusterer = HDBSCAN(
            min_cluster_size=config.min_cluster_size,
            metric='euclidean',
            cluster_selection_method='eom',
            prediction_data=True
        )
        # NOTE(review): BERTopic's documentation states that `min_topic_size`
        # is not used when a custom `hdbscan_model` is supplied - confirm
        # against the installed version.
        self.topic_model = BERTopic(
            embedding_model=config.embedding_model,
            min_topic_size=config.min_topic_size,
            nr_topics=config.nr_topics,
            hdbscan_model=clusterer,
            representation_model=config.representation_model,
            verbose=config.verbose,
            calculate_probabilities=config.calculate_probabilities
        )

    def _setup_logger(self) -> logging.Logger:
        """Return this module's logger, attaching one stream handler on first use."""
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        # This logger has its own handler; without this, environments whose root
        # logger also has a handler print every record twice.
        logger.propagate = False
        if not logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            ))
            logger.addHandler(handler)
        return logger

    def analyze(self, texts: Union[List[str], pd.Series]) -> Dict[str, Any]:
        """Preprocess ``texts`` and fit the topic model on the result.

        Args:
            texts: Raw documents, either a list of strings or a pandas Series.

        Returns:
            A dict with the keys ``topics`` and ``probabilities`` (the two
            values returned by ``fit_transform``), ``topic_info``,
            ``topic_model`` (the fitted model) and ``processed_texts`` (a
            Series when a Series was given, otherwise a list).

        Raises:
            Exception: Any error raised while preprocessing or fitting is
                logged and re-raised unchanged.
        """
        try:
            if isinstance(texts, pd.Series):
                self.logger.info("Processing pandas Series")
                processed_texts = self.preprocessor.process_series(texts)
            else:
                self.logger.info("Processing list of texts")
                processed_texts = self.preprocessor.process_batch(texts)

            self.logger.info("Fitting topic model")
            # Hand the model a plain list of strings so a Series' index never
            # comes into play inside BERTopic.
            documents = list(processed_texts)
            topics, probs = self.topic_model.fit_transform(documents)

            topic_info = self.topic_model.get_topic_info()
            self.logger.info(f"Found {len(topic_info)} topics")

            return {
                'topics': topics,
                'probabilities': probs,
                'topic_info': topic_info,
                'topic_model': self.topic_model,
                'processed_texts': processed_texts
            }

        except Exception as e:
            self.logger.error(f"Error in topic analysis: {str(e)}")
            raise

    def save_model(self, path: str):
        """Save the current topic model to ``path``."""
        self.topic_model.save(path)

    def load_model(self, path: str):
        """Replace the current topic model with the one stored at ``path``."""
        self.topic_model = BERTopic.load(path)

In [9]:
def setup_topic_analyzer(model_name: str = "TheBloke/zephyr-7B-alpha-GGUF",
                        prompt_template: Optional[str] = None,
                        model_file: str = "zephyr-7b-alpha.Q4_K_M.gguf",
                        gpu_layers: int = 50) -> TopicAnalyzer:
    """Build a ``TopicAnalyzer`` whose topics are labelled by a text-generation LLM.

    Args:
        model_name: Repository id of the GGUF model to load.
        prompt_template: Prompt handed to the representation model. It may
            contain the ``[DOCUMENTS]`` and ``[KEYWORDS]`` placeholders. When
            omitted, a built-in topic-labelling prompt is used.
        model_file: File inside ``model_name`` to load.
        gpu_layers: Number of model layers to offload to the GPU. Ignored
            (forced to 0) when CUDA is not available.

    Returns:
        A ``TopicAnalyzer`` configured with the generated representation model.
    """
    if prompt_template is None:
        prompt_template = "<|system|>You are a helpful assistant for labeling topics.</s>\n" \
                         "<|user|>\nTopic documents:\n[DOCUMENTS]\n" \
                         "Keywords: '[KEYWORDS]'\n" \
                         "Create a short topic label.</s><|assistant|>"

    cuda_available = torch.cuda.is_available()
    device = "cuda" if cuda_available else "cpu"

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        model_file=model_file,
        model_type="mistral",
        # Keep the loader consistent with the pipeline's device: asking for GPU
        # layers on a machine without CUDA would defeat the CPU fallback.
        gpu_layers=gpu_layers if cuda_available else 0,
        hf=True
    )

    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")

    generator = pipeline(
        model=model,
        tokenizer=tokenizer,
        task='text-generation',
        max_new_tokens=50,
        repetition_penalty=1.2,
        device=device
    )

    representation_model = TextGeneration(generator, prompt=prompt_template)

    config = ModelConfig(
        embedding_model='all-mpnet-base-v2',
        min_topic_size=250,
        nr_topics=30,
        min_cluster_size=300,
        representation_model=representation_model
    )

    return TopicAnalyzer(config)

In [10]:
# Build the analyzer with the default model and prompt; this loads the LLM and its tokenizer.
topic_analyzer = setup_topic_analyzer()

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

CTransformersModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Device set to use cuda


In [11]:
# Mount Google Drive at /content/drive (this cell only works inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
SAMPLE_SIZE = 50_000   # upper bound on the number of rows drawn from the CSV
RANDOM_SEED = 42       # fixed so that a re-run draws the same sample
PLACEHOLDER_COMMENTS = ['[deleted]', '[removed]']

filename = 'drive/MyDrive/Colab Notebooks/bigdata.csv'
comments_raw = pd.read_csv(filename)

# Cap the request at the file's size: `sample` raises when asked for more rows
# than the frame holds.
comments_sample = comments_raw.sample(
    n=min(SAMPLE_SIZE, len(comments_raw)), random_state=RANDOM_SEED)

# Drop placeholder comments, then renumber the remaining rows from 0.
df = (
    comments_sample[~comments_sample['comment'].isin(PLACEHOLDER_COMMENTS)]
    .reset_index(drop=True)
)

In [13]:
# Preprocess the comments and fit the topic model; `output` is the result dict returned by `analyze`.
output = topic_analyzer.analyze(df["comment"])

2024-12-04 16:55:39,498 - __main__ - INFO - Processing pandas Series
INFO:__main__:Processing pandas Series
Processing text: 100%|██████████| 50/50 [03:05<00:00,  3.72s/it]
2024-12-04 16:58:45,383 - __main__ - INFO - Fitting topic model
INFO:__main__:Fitting topic model
2024-12-04 16:58:45,401 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

2024-12-04 17:00:09,249 - BERTopic - Embedding - Completed ✓
2024-12-04 17:00:09,261 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-04 17:01:29,105 - BERTopic - Dimensionality - Completed ✓
2024-12-04 17:01:29,108 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-04 17:01:52,783 - BERTopic - Cluster - Completed ✓
2024-12-04 17:01:52,785 - BERTopic - Representation - Extracting topics from clusters using representation models.
 83%|████████▎ | 10/12 [04:07<01:13, 36.75s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 12/12 [05:17<00:00, 26.47s/it]
2024-12-04 17:07:11,280 - BERTopic - Representation - Completed ✓
2024-12-04 17:07:11,285 - BERTopic - Topic reduction - Reducing number of topics
2024-12-04 17:07:11,290 - BERTopic - Topic reduction - Reduced number of topics from 12 to 12
2024-12-04 17:07:11,490 - __main__ - INFO - Found 12 topics
INFO:_

In [17]:
# Display the topic summary table obtained from the model's `get_topic_info()`.
output['topic_info']

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,25990,-1_\n1>\n>\n>\n>\n>\n>\n>\n>\n>\n>\n>\n>\n>\n>...,[\n1>\n>\n>\n>\n>\n>\n>\n>\n>\n>\n>\n>\n>\n>\n...,[think hair color do not cause cancer seem lik...
1,0,5571,"0_\n""Delicious Food Recipes and Ideas""___","[\n""Delicious Food Recipes and Ideas"", , , , ,...","[look delicious, look delicious, look nice lik..."
2,1,442,"1_\n""Topics with Exceptional Appearance and Pr...","[\n""Topics with Exceptional Appearance and Pra...","[look good, look good, look good]"
3,2,633,"2_\n""Rfoodporn Belongs to FacesavoringFood Com...","[\n""Rfoodporn Belongs to FacesavoringFood Comm...","[rfoodporn, rfoodporn, belong rfoodporn opinio..."
4,3,1564,"3_\n""Negative Reddit Posts with Upvotes and Re...","[\n""Negative Reddit Posts with Upvotes and Rep...",[I m proud see post minute post likely go stra...
5,4,4244,"4_\n""Sports Fan Opinions on Player Performance...","[\n""Sports Fan Opinions on Player Performance ...",[take anything away mj think less sort ball fa...
6,5,3713,"5_\n""Gaming Habits and Preferences Over Time""___","[\n""Gaming Habits and Preferences Over Time"", ...",[kid grow I m go to money buy game want ill ne...
7,6,438,"6_\n""Sustainable Plastics and Renewable Energy...","[\n""Sustainable Plastics and Renewable Energy ...",[problem be not cannabis indoor farm energy so...
8,7,458,"7_\n""Smoking Cannabis and Its Legalization: A ...","[\n""Smoking Cannabis and Its Legalization: A L...",[week pregnant daily user prior get pregnant s...
9,8,769,8_\nTitle for each for the summary of list of ...,[\nTitle for each for the summary of list of w...,[actually never rid virus find tissue long inf...
