# SCOPUS

In [None]:
%%capture
%pip install -U bertopic safetensors flash_attn pandas numpy

## Get the Data

In [None]:
%%capture

# Fetch the six yearly cleaned CSVs (2018-2023) from GitHub into the working directory.
# `-nc` (no-clobber) makes wget skip a file that is already present, so re-running the cell is cheap.
!wget -nc https://raw.githubusercontent.com/Songblabla/datasci/main/DSDE_Project-main/clean2023.csv
!wget -nc https://raw.githubusercontent.com/Songblabla/datasci/main/DSDE_Project-main/clean2022.csv
!wget -nc https://raw.githubusercontent.com/Songblabla/datasci/main/DSDE_Project-main/clean2021.csv
!wget -nc https://raw.githubusercontent.com/Songblabla/datasci/main/DSDE_Project-main/clean2020.csv
!wget -nc https://raw.githubusercontent.com/Songblabla/datasci/main/DSDE_Project-main/clean2019.csv
!wget -nc https://raw.githubusercontent.com/Songblabla/datasci/main/DSDE_Project-main/clean2018.csv

In [None]:
import os
# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [None]:
import pandas as pd
import numpy as np

# Load one cleaned CSV per publication year (files are fetched by the wget cell).
df2023 = pd.read_csv("clean2023.csv")
df2022 = pd.read_csv("clean2022.csv")
df2021 = pd.read_csv("clean2021.csv")
df2020 = pd.read_csv("clean2020.csv")
df2019 = pd.read_csv("clean2019.csv")
df2018 = pd.read_csv("clean2018.csv")

dfs = [df2023, df2022, df2021, df2020, df2019, df2018]
# ignore_index=True: every file is read with its own default 0..n-1 index, so a
# plain concat would leave duplicate row labels in `df`. A fresh index keeps
# df's labels equal to positions, i.e. aligned with `docs` and the topic lists.
df = pd.concat(dfs, ignore_index=True)

# One string per row, in the same order as `df`.
# NOTE(review): astype(str) turns a missing Title_Abstract into the literal
# string "nan"; confirm the cleaned files contain no missing values.
docs = df["Title_Abstract"].astype(str).tolist()

## Connect to Hugging Face

## Model Training

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face login for this notebook session.
# Presumably needed for the `push_to_hf_hub` call in the export section — confirm.
notebook_login()

### Medium Alibaba

In [None]:
# Filename prefix for the artifacts written by the export cells of this section.
MODEL_NAME = "BERTOPIC_MEDBLAST"

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Sentence encoder for this run.
# NOTE: trust_remote_code=True lets code from the model repository execute locally.
embedding_model = SentenceTransformer(
    "Alibaba-NLP/gte-large-en-v1.5",
    trust_remote_code=True,
)

# Embed the corpus up front so the same matrix can be passed to BERTopic and saved later.
embeddings = embedding_model.encode(
    docs,
    batch_size=32,
    show_progress_bar=True,
)

# Fit the topic model on the pre-computed embeddings.
topic_model = BERTopic(verbose=True, embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

In [None]:
# Per-topic overview table; the trailing expression makes the notebook display its first rows.
freq = topic_model.get_topic_info()
freq.head()

In [None]:
# Write the topic overview to "<MODEL_NAME>.csv".
freq.to_csv(f"{MODEL_NAME}.csv")

In [None]:
# Pair every document with the topic id the fitted model assigned to it and save the table.
result = pd.DataFrame({"Document": docs, "Topic": topic_model.topics_})
result.to_csv(f"{MODEL_NAME}-pred.csv")

In [None]:
# Persist the embedding matrix currently bound to `embeddings`.
np.save(f"{MODEL_NAME}-embedded.npy", embeddings)

In [None]:
# Save the fitted topic model under MODEL_NAME in safetensors format, including the c-TF-IDF data.
topic_model.save(f"{MODEL_NAME}", serialization="safetensors", save_ctfidf=True)

### Model Instruct

In [None]:
# Filename prefix for the artifacts written by the export cells of this section.
MODEL_NAME = "BERTOPIC_LANGCAST"

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

def get_detailed_instruct(task_description: str, query: str) -> str:
    """Return an instructed query string.

    The result has the form ``Instruct: <task_description>`` followed by a
    newline and ``Query: <query>``.
    """
    template = 'Instruct: {}\nQuery: {}'
    return template.format(task_description, query)

# Task description embedded into every instructed query below.
task = """
Create a system to summarize and classify documents from the Scopus dataset. Develop a model that can accurately categorize documents into types such as research articles, conference papers, and review papers. Additionally, implement a method to generate concise summaries of the documents to aid in quick comprehension and retrieval of key information.
"""
queries = [
    get_detailed_instruct(task, "Identify distinguishing features for each document type in the Scopus dataset."),
    get_detailed_instruct(task, "Implement a summarization technique to generate brief summaries of the documents."),
]

# Queries come first, documents after: the split below relies on this order.
input_texts = queries + docs

model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')

all_embeddings = model.encode(input_texts, batch_size=4, normalize_embeddings=True, show_progress_bar=True)

# Separate the query rows from the document rows. `embeddings` must hold one
# row per entry of `docs`: it is passed to BERTopic here and saved to disk by a
# later cell, and leaving the query rows in would shift every saved row by
# len(queries) relative to `docs`.
query_embeddings = all_embeddings[:len(queries)]
embeddings = all_embeddings[len(queries):]

topic_model = BERTopic(verbose=True, embedding_model=model)

# Fit on the documents only; the instructed queries are not part of the corpus.
topics, probs = topic_model.fit_transform(docs, embeddings)

In [None]:
# Per-topic overview table; the trailing expression makes the notebook display its first rows.
freq = topic_model.get_topic_info()
freq.head()

In [None]:
# Write the topic overview to "<MODEL_NAME>.csv".
freq.to_csv(f"{MODEL_NAME}.csv")

In [None]:
# Pair every document with the topic id the fitted model assigned to it and save the table.
result = pd.DataFrame({"Document": docs, "Topic": topic_model.topics_})
result.to_csv(f"{MODEL_NAME}-pred.csv")

In [None]:
# Persist the embedding matrix currently bound to `embeddings`.
np.save(f"{MODEL_NAME}-embedded.npy", embeddings)

### ALL MET BASE

In [None]:
# Filename prefix for the artifacts written by the export cells of this section.
MODEL_NAME = "BERTOPIC_MET_BASE"

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Sentence encoder for this run.
embedding_model = SentenceTransformer('all-mpnet-base-v2')

# Embed the corpus up front so the same matrix can be passed to BERTopic and saved later.
embeddings = embedding_model.encode(docs, show_progress_bar=True)

# Fit the topic model on the pre-computed embeddings.
topic_model = BERTopic(verbose=True, embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

In [None]:
# Per-topic overview table; the trailing expression makes the notebook display its first rows.
freq = topic_model.get_topic_info()
freq.head()

In [None]:
# Write the topic overview to "<MODEL_NAME>.csv".
freq.to_csv(f"{MODEL_NAME}.csv")

In [None]:
# Pair every document with the topic id the fitted model assigned to it and save the table.
result = pd.DataFrame({"Document": docs, "Topic": topic_model.topics_})
result.to_csv(f"{MODEL_NAME}-pred.csv")

In [None]:
# Persist the embedding matrix currently bound to `embeddings`.
np.save(f"{MODEL_NAME}-embedded.npy", embeddings)

### MINI LLM

In [None]:
# Filename prefix for the artifacts written by the export cells of this section.
MODEL_NAME = "BERTOPIC_MINI_RAILS"

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Sentence encoder for this run.
embedding_model = SentenceTransformer('llmrails/ember-v1')

# Embed the corpus up front so the same matrix can be passed to BERTopic and saved later.
embeddings = embedding_model.encode(
    docs,
    batch_size=4,
    show_progress_bar=True,
)

# Fit the topic model on the pre-computed embeddings.
topic_model = BERTopic(verbose=True, embedding_model=embedding_model)
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

In [None]:
# Per-topic overview table; the trailing expression makes the notebook display its first rows.
freq = topic_model.get_topic_info()
freq.head()

In [None]:
# Write the topic overview to "<MODEL_NAME>.csv".
freq.to_csv(f"{MODEL_NAME}.csv")

In [None]:
# Pair every document with the topic id the fitted model assigned to it and save the table.
result = pd.DataFrame({"Document": docs, "Topic": topic_model.topics_})
result.to_csv(f"{MODEL_NAME}-pred.csv")

In [None]:
# Persist the embedding matrix currently bound to `embeddings`.
np.save(f"{MODEL_NAME}-embedded.npy", embeddings)

### ALL MET + SUMMARIZER

* This configuration gave the best-balanced results.

In [None]:
# Filename prefix for the artifacts written by the export cells of this section.
MODEL_NAME = "BERTOPIC_MET_FLAN"

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Tokenizer and model are loaded from the same checkpoint so they stay in sync.
_FLAN_T5_CHECKPOINT = "google/flan-t5-large"

tokenizer = T5Tokenizer.from_pretrained(_FLAN_T5_CHECKPOINT)
# device_map="auto" leaves device placement of the weights to the library.
model = T5ForConditionalGeneration.from_pretrained(
    _FLAN_T5_CHECKPOINT,
    device_map="auto",
)

def summarization_model(text):
    """Generate a short topic description for ``text`` with the FLAN-T5 model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The text to describe (for example a topic's keywords joined
            into one string).

    Returns:
        The decoded generation as a plain string, without special tokens.
    """
    # Prompt on the text that was passed in, not on a fixed document.
    input_text = f"Identify unique topics in this Nature document given here: {text}."
    # Follow the device the model was placed on instead of assuming "cuda".
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

    outputs = model.generate(input_ids)
    # Drop special tokens (padding / end-of-sequence markers) so the label is clean text.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
from bertopic.representation._base import BaseRepresentation
from typing import List, Mapping, Tuple

class SummarizationRepresentation(BaseRepresentation):
    """Topic representation that replaces each topic's words with one generated summary.

    Args:
        summarization_model: Callable that takes a single string and returns
            the summary string for it.
        summarization_tokenizer: Stored on the instance; ``extract_topics``
            does not use it.
    """

    def __init__(self, summarization_model, summarization_tokenizer):
        self.summarization_model = summarization_model
        self.summarization_tokenizer = summarization_tokenizer

    def extract_topics(self, topic_model, documents, c_tf_idf, topics
                      ) -> Mapping[str, List[Tuple[str, float]]]:
        """Summarise the words of every topic.

        Args:
            topic_model: Unused here; part of the representation interface.
            documents: Unused here; part of the representation interface.
            c_tf_idf: Unused here; part of the representation interface.
            topics: Mapping of topic id to a sequence of entries whose first
                element is the word (the rest of each entry is ignored).

        Returns:
            Mapping of topic id to a one-element list ``[(summary, 1.0)]``.
        """
        updated_topics = {}
        for topic_id, words in topics.items():
            # Keep only the word of each entry and join them into one prompt string.
            keywords = " ".join(entry[0] for entry in words)
            # Use the callable injected through __init__, not a module-level global,
            # so the instance works with whatever model it was constructed with.
            summary = self.summarization_model(keywords)
            updated_topics[topic_id] = [(summary, 1.0)]
        return updated_topics

# Representation instance handed to BERTopic in the training cell of this section.
summarization = SummarizationRepresentation(summarization_model, tokenizer)

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import numpy as np
from bertopic.representation import PartOfSpeech, KeyBERTInspired, MaximalMarginalRelevance
from transformers import AutoModel, AutoTokenizer

# Seed NumPy's global random generator before any model work.
np.random.seed(42)

# Sentence encoder for this run.
embedding_model = SentenceTransformer('all-mpnet-base-v2')

# Two named topic representations: keyword-based, and keyword-based followed by the summariser.
representation_models = {
    "KeyBERTInspired": KeyBERTInspired(),
    "Summarization": [KeyBERTInspired(), summarization],
}

# Embed the corpus up front so the same matrix can be passed to BERTopic and saved later.
embeddings = embedding_model.encode(docs, show_progress_bar=True)

# Fit the topic model on the pre-computed embeddings.
topic_model = BERTopic(
    verbose=True,
    representation_model=representation_models,
    embedding_model=embedding_model,
)
topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

In [None]:
# Per-topic overview table; the trailing expression makes the notebook display its first rows.
freq = topic_model.get_topic_info()
freq.head()

In [None]:
# Write the topic overview to "<MODEL_NAME>.csv".
freq.to_csv(f"{MODEL_NAME}.csv")

In [None]:
# Pair every document with the topic id the fitted model assigned to it and save the table.
result = pd.DataFrame({"Document": docs, "Topic": topic_model.topics_})
result.to_csv(f"{MODEL_NAME}-pred.csv")

In [None]:
# Persist the embedding matrix currently bound to `embeddings`.
np.save(f"{MODEL_NAME}-embedded.npy", embeddings)

## Export Model

In [None]:
# Owner part of the Hub repo id. "username" looks like a placeholder — replace before running.
HUGGING_FACE_USERNAME = "username"
# NOTE(review): this rebinds MODEL_NAME to a value that matches none of the section
# names used earlier in the notebook (e.g. "BERTOPIC_MET_FLAN") — confirm the intended repo name.
MODEL_NAME = "BERTOPIC_MET_BLAST"

In [None]:
# Uploads whatever `topic_model` is bound to when this cell runs, i.e. the model
# from the most recently executed training cell, to "<username>/<MODEL_NAME>".
topic_model.push_to_hf_hub(
    repo_id=f"{HUGGING_FACE_USERNAME}/{MODEL_NAME}",
    save_ctfidf=True
)