In [None]:
!pip install bertopic plotly gensim seaborn transformers datasets torch scikit-learn accelerate


Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━

In [None]:
import inspect
import os

import torch
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
import pandas as pd
import plotly.express as px
from bertopic import BERTopic
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
from google.colab import files

# Upload dataset manually in Colab
uploaded = files.upload()
if not uploaded:
    # Nothing came back from the upload widget (presumably the dialog was cancelled);
    # fail with a clear message instead of an IndexError on the empty mapping.
    raise ValueError("No file was uploaded; upload the labelled CSV and re-run this cell.")
csv_path = next(iter(uploaded))  # Get the uploaded file name (first key of the upload mapping)

# Load the CSV in one non-mutating chain:
#   - drop rows missing any of the columns used downstream (text, label, split marker)
#   - force subreddit names to strings so they encode consistently
df = (
    pd.read_csv(csv_path)
    .dropna(subset=["body", "subreddit", "split"])
    .assign(subreddit=lambda frame: frame["subreddit"].astype(str))
)


Saving final_labels.csv to final_labels (1).csv


In [None]:
# Turn subreddit names into integer class ids.
# The encoder is fitted first and kept around so ids can be mapped back to names later.
label_encoder = LabelEncoder().fit(df["subreddit"])
df["subreddit_label"] = label_encoder.transform(df["subreddit"])


In [None]:
# Split dataset based on the 'split' column (train/test)
train_df = df[df["split"] == "train"]
test_df = df[df["split"] == "test"]

# Fail early if the 'split' column does not contain the expected markers,
# instead of handing an empty dataset to the Trainer later on.
assert len(train_df) > 0, "no rows with split == 'train'"
assert len(test_df) > 0, "no rows with split == 'test'"

# Extract text and labels
train_texts, train_labels = train_df["body"], train_df["subreddit_label"]
test_texts, test_labels = test_df["body"], test_df["subreddit_label"]


def _to_hf_dataset(texts, labels):
    """Build a Hugging Face Dataset with a 'text' column and an integer 'label' column.

    The rows come from a filtered DataFrame, so their pandas index is not a plain
    0..n-1 range; `preserve_index=False` stops that index from being copied into
    the dataset as an extra column.
    """
    frame = pd.DataFrame({"text": texts, "label": labels.astype(int)})
    return Dataset.from_pandas(frame, preserve_index=False)


# Convert to Hugging Face dataset format
dataset = _to_hf_dataset(train_texts, train_labels)
test_dataset = _to_hf_dataset(test_texts, test_labels)


In [None]:
# Load pretrained DistilBERT model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Record the class-id <-> subreddit-name mapping in the model config so that it is
# saved together with the weights and predictions can be read as subreddit names.
# Keys/values are cast to plain int/str so the config stays JSON-serialisable.
id2label = {int(idx): str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}
num_labels = len(id2label)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
# Trailing ';' keeps the notebook from printing the full module tree as cell output.
model.to("cuda" if torch.cuda.is_available() else "cpu");


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of examples for DistilBERT.

    Args:
        examples: batch mapping with a "text" entry holding the raw strings.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens.

    No padding is applied here: padding inside a batched `map` would pad every
    example to the longest one in the whole map batch. Sequences are left at their
    own length so they can be padded per training batch by the data collator.
    NOTE(review): this relies on the Trainer being given the tokenizer (as it is in
    this notebook) so that its default padding collator is used.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Run the tokenizer over both splits in batched mode, train split first.
dataset, test_dataset = (
    split.map(tokenize_function, batched=True) for split in (dataset, test_dataset)
)


Map:   0%|          | 0/5254 [00:00<?, ? examples/s]

Map:   0%|          | 0/1301 [00:00<?, ? examples/s]

In [None]:
# Compute accuracy plus support-weighted precision / recall / F1
def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into classification metrics.

    Args:
        eval_pred: pair `(logits, labels)`; the predicted class is taken as the
            argmax over the last axis of `logits`.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1-score".
        Precision, recall and F1 are averaged with `average='weighted'`.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 scores a class that is never predicted as 0 explicitly,
    # instead of emitting an undefined-metric warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1-score": f1}


In [None]:
# Training arguments optimized for Colab

# transformers has renamed two keywords across releases
# (`evaluation_strategy` -> `eval_strategy`, Trainer's `tokenizer` -> `processing_class`).
# Pick whichever name the installed version accepts so this cell runs on either.
_training_args_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kw = "eval_strategy" if "eval_strategy" in _training_args_params else "evaluation_strategy"
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kw = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    per_device_train_batch_size=8,  # Reduce for Colab memory constraints
    per_device_eval_batch_size=8,
    num_train_epochs=15,
    weight_decay=0.01,
    logging_dir="./logs",
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    fp16=torch.cuda.is_available(),
    # Keep the checkpoint with the best evaluation F1 rather than whatever the last
    # epoch produced (validation loss climbs in later epochs while training loss keeps falling).
    # NOTE(review): the evaluation set here is the test split, so checkpoint selection
    # sees test data; carve out a validation split if an unbiased test score is needed.
    load_best_model_at_end=True,
    metric_for_best_model="f1-score",
    save_total_limit=2,  # one checkpoint per epoch would otherwise pile up on disk
    report_to="none",  # no experiment tracker: avoids the interactive wandb API-key prompt
    **{_eval_strategy_kw: "epoch"},  # evaluate once per epoch (must match save_strategy)
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_kw: tokenizer},
)

# Train model
trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1-score
1,3.6305,3.049701,0.202152,0.146669,0.202152,0.136384
2,2.9716,2.735346,0.286703,0.259209,0.286703,0.232646
3,2.5688,2.636422,0.309762,0.293311,0.309762,0.27123
4,1.7501,2.648577,0.320523,0.275366,0.320523,0.28844
5,1.4475,2.74371,0.332821,0.312083,0.332821,0.310206
6,1.2088,2.87911,0.338201,0.320308,0.338201,0.321132
7,0.7919,3.021492,0.32744,0.315808,0.32744,0.315338
8,0.6686,3.169441,0.332052,0.320432,0.332052,0.320418
9,0.5746,3.303297,0.341276,0.337657,0.341276,0.332204
10,0.4397,3.452964,0.338201,0.3327,0.338201,0.327894


TrainOutput(global_step=9855, training_loss=1.1070222000129812, metrics={'train_runtime': 1688.6048, 'train_samples_per_second': 46.672, 'train_steps_per_second': 5.836, 'total_flos': 1.04589320449536e+16, 'train_loss': 1.1070222000129812, 'epoch': 15.0})

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side in one
# directory so the pair can be reloaded together.
final_model_dir = "./final_model"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)


('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/vocab.txt',
 './final_model/added_tokens.json',
 './final_model/tokenizer.json')

In [None]:
# Topic modeling with BERTopic over every post body in `df` (train and test rows alike)
texts = df["body"].tolist()
topic_model = BERTopic()
fit_result = topic_model.fit_transform(texts)
topics = fit_result[0]  # per-document topic assignment; the second element is not used

# Bar charts of the top terms for the first 10 topics.
# (The variable keeps its historical name; this chart is per-topic term scores, not a time series.)
topic_evolution = topic_model.visualize_barchart(top_n_topics=10)
topic_evolution.show()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Topic Coherence Score (computed on a gensim LDA model fitted below, not on the BERTopic model)
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(texts)  # NOTE(review): not used by the LDA steps in this cell

# Whitespace-tokenize once and reuse the result for the dictionary, the
# bag-of-words corpus and the coherence model.
tokenized_texts = [text.split() for text in texts]
id2word = Dictionary(tokenized_texts)
corpus = [id2word.doc2bow(tokens) for tokens in tokenized_texts]

# Fixed random_state so the fitted topics (and the scores below) are repeatable across runs.
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=15, passes=20, random_state=42)
coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_texts, dictionary=id2word, coherence='c_v')
topic_coherence_score = coherence_model_lda.get_coherence()

# Perplexity Score
# NOTE(review): gensim documents log_perplexity as returning a per-word likelihood
# bound (a negative log-scale value), not the perplexity itself — confirm how this
# number should be reported.
perplexity_score = lda_model.log_perplexity(corpus)

print(f"Topic Coherence Score: {topic_coherence_score}")
print(f"Perplexity Score: {perplexity_score}")

Topic Coherence Score: 0.5460670470681979
Perplexity Score: -9.534512590084942
