In [1]:
#!pip install octis

In [2]:
import base64
import locale
import os
from io import BytesIO

import octis
import pandas as pd

In [3]:
# Build the TSV location from its components so the path separator is chosen
# by the host OS instead of being hard-coded as a Windows backslash.
path = os.path.join('..', 'data', 'clean', 'des_b64_wit_kag_0_of_48.tsv')

df = pd.read_table(path)

# Each 'b64_bytes' cell is base64 text: decode it and wrap the raw bytes in an
# in-memory file object.
images = df['b64_bytes'].apply(lambda x: BytesIO(base64.b64decode(x)))

image_list = images.to_list()

# One page description per row.
texts = df['context_page_description'].to_list()

In [4]:
# Fetch the dataset that OCTIS exposes under the name "20NewsGroup".
from octis.dataset.dataset import Dataset
dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")

In [5]:
# Train a 10-topic LDA model on `dataset`.
from octis.models.LDA import LDA
model = LDA(num_topics=10)
model_output = model.train_model(dataset)

In [6]:
# Score the LDA output with the TopicDiversity metric (configured with topk=10).
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
metric = TopicDiversity(topk=10)
topic_diversity_score = metric.score(model_output)

In [7]:
# Sanity check: encode every description to UTF-8 and decode it back.
# The decoded value is discarded, so the loop only matters if a call raises.
for text in texts:
    enc = text.encode('utf-8')
    enc.decode()

In [8]:
# Show the locale's preferred text encoding.
# NOTE: this rebinds `enc`, which the UTF-8 round-trip loop used for encoded bytes.
import locale
enc = locale.getpreferredencoding()
enc

'cp1252'

In [9]:
# Dump all descriptions into one text file, one description per line.
# NOTE(review): a description that itself contains a newline will occupy
# several lines of this file.
txt_file = os.path.join('..', 'myfile.txt')

# Resolve the platform's default text encoding once and name it explicitly, so
# the character filter below and the bytes on disk always agree. (Previously
# the filter was hard-coded to 'cp1252' while the file was written with
# whatever the default happened to be.)
file_encoding = locale.getpreferredencoding(False)

text = "\n".join(texts)
# Drop every character the target encoding cannot represent.
text = text.encode(file_encoding, 'ignore').decode(file_encoding, 'ignore')
with open(txt_file, mode='wt', encoding=file_encoding) as file:
    file.write(text)

# Read the file back with the same encoding to confirm that it decodes cleanly.
with open(txt_file, encoding=file_encoding) as f:
    content = f.read()


In [10]:
from octis.preprocessing.preprocessing import Preprocessing
import string

# Options for the OCTIS preprocessor. The lowercasing, punctuation removal,
# number removal, lemmatisation and spaCy stop-word removal flags are all off.
preprocessing_options = dict(
    lowercase=False,
    remove_punctuation=False,
    punctuation=string.punctuation,
    remove_numbers=False,
    lemmatize=False,
    language="english",
    split=False,
    verbose=True,
    save_original_indexes=True,
    remove_stopwords_spacy=False,
)
preprocessor = Preprocessing(**preprocessing_options)

In [11]:
# Preprocess the text file and save the resulting dataset to `output_folder`.
# NOTE: this rebinds `dataset`, which previously held the 20NewsGroup dataset.
output_folder = "custom_octis_dataset"
dataset = preprocessor.preprocess_dataset(documents_path=txt_file)
dataset.save(output_folder)

created vocab
18890
words filtering done


In [12]:
# Rewrite the saved vocabulary file from UTF-8 into the platform's default text
# encoding, dropping any character that encoding cannot represent.
# NOTE(review): presumably needed because the dataset loader opens this file
# without an explicit encoding - confirm against the OCTIS version in use.
# NOTE: re-running this cell without re-saving the dataset first may fail,
# because the file is then no longer UTF-8.
vocab = output_folder+'/vocabulary.txt'

# Name the target encoding explicitly so the filter and the written bytes
# always agree (previously: hard-coded 'cp1252' filter, implicit default write).
vocab_encoding = locale.getpreferredencoding(False)

with open(vocab, encoding='utf-8') as f:
    content = f.read()
content = content.encode(vocab_encoding, 'ignore').decode(vocab_encoding, 'ignore')
with open(vocab, mode='wt', encoding=vocab_encoding) as file:
    file.write(content)

In [13]:
# Load the dataset that was just saved to `output_folder` back from disk.
custom_dataset = Dataset()
custom_dataset.load_custom_dataset_from_folder(output_folder)

In [14]:
# Second LDA run, this time on the custom dataset.
model2 = LDA(num_topics=10)  # Create a 10-topic LDA model
model_output2 = model2.train_model(custom_dataset) # Train it on the custom dataset

In [15]:
# Topic diversity of the LDA model trained on the custom dataset.
score = metric.score(model_output2)
score

0.24

# OCTIS with the multimodal model

In [16]:
import sys

# Folder holding the project's local modules. Set the MODELING_DIR environment
# variable to point elsewhere; without it the original location is used.
MODELING_DIR = os.environ.get(
    'MODELING_DIR',
    'C:\\Users\\Martin\\Documents\\GitHub\\master\\modeling',
)

# Guard the append so that re-running this cell does not add duplicate entries.
if MODELING_DIR not in sys.path:
    sys.path.append(MODELING_DIR)


In [17]:
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

In [18]:
import multimodal
import c_tf_idf
import multimodalModel

In [46]:
class Trainer:
    """Train the multimodal topic model and score topic outputs with OCTIS metrics.

    Parameters
    ----------
    custom_dataset_folder : str
        Folder handed to ``Dataset.load_custom_dataset_from_folder``.
    params : dict
        Keyword arguments for ``multimodalModel.MultimodalModel``; kept on the
        instance as ``self.params``.
    top_k : int, default 10
        ``topk`` value passed to every metric.
    """

    def __init__(self, custom_dataset_folder, params, top_k = 10):
        self.model = None
        self.folder = custom_dataset_folder
        self.top_k = top_k
        # Load the dataset before building the metrics: the coherence metric
        # is constructed from the dataset's corpus.
        self.data = self.get_dataset()

        self.metrics = self.get_metrics()
        self.params = params

    def train(self, params=None):
        """Fit a new ``MultimodalModel`` and return its topics.

        Parameters
        ----------
        params : dict, optional
            Keyword arguments for ``multimodalModel.MultimodalModel``. When
            omitted, the ``params`` given to the constructor are used.

        Returns
        -------
        dict
            ``{"topics": [...]}`` with one list of entries per topic.
        """
        if params is None:
            params = self.params

        self.model = multimodalModel.MultimodalModel(**params)
        topics = self.model.fit_transform()

        all_words = [word for words in self.data.get_corpus() for word in words]
        # A set gives constant-time membership tests instead of scanning the
        # flat token list once per topic word.
        # NOTE: requires corpus tokens and topic entries to be hashable.
        known_words = set(all_words)

        # Any topic entry that is not in the corpus is replaced by the first
        # corpus token.
        # NOTE(review): the ``- 1`` assumes one of the distinct labels returned
        # by fit_transform (presumably an outlier label) has no topic of its
        # own - TODO confirm against MultimodalModel.
        bertopic_topics = [
            [
                word if word in known_words else all_words[0]
                for word in self.model.get_topic(i)
            ]
            for i in range(len(set(topics)) - 1)
        ]

        output_tm = {"topics": bertopic_topics}
        return output_tm

    def evaluate(self, model_output, verbose=True):
        """Score ``model_output`` with every configured metric.

        Parameters
        ----------
        model_output : dict
            Passed unchanged to each scorer's ``score`` method.
        verbose : bool, default True
            When true, print one ``name:score`` line per metric.

        Returns
        -------
        dict
            Metric name mapped to its score converted to ``float``.
        """
        results = {}
        for scorers, _ in self.metrics:
            for scorer, name in scorers:
                results[name] = float(scorer.score(model_output))
        if verbose:
            for metric, score in results.items():
                print(f'{metric}:{str(score)}')
        return results

    def get_dataset(self):
        """Load and return the OCTIS dataset stored in ``self.folder``."""
        data = Dataset()
        data.load_custom_dataset_from_folder(self.folder)
        return data

    def get_metrics(self):
        """Build the metric groups used by ``evaluate``.

        Returns
        -------
        list
            ``(scorers, group_label)`` pairs, where ``scorers`` is a list of
            ``(scorer, name)`` pairs.
        """
        npmi = Coherence(texts = self.data.get_corpus(), topk=self.top_k, measure="c_npmi")
        topic_diversity = TopicDiversity(topk=self.top_k)

        coherence = [(npmi, "npmi")]
        diversity = [(topic_diversity, "diversity")]

        metrics = [(coherence, "Coherence"), (diversity, "Diversity")]
        return metrics


    

In [20]:
from sentence_transformers import SentenceTransformer

In [21]:
embedding_model = SentenceTransformer("clip-ViT-B-32")

# Assemble the path from its components so the separator matches the host OS
# instead of being hard-coded as a Windows backslash.
path = os.path.join('..', 'data', 'clean', 'des_b64_wit_kag_0_of_48.tsv')

# NOTE: this rebinds `texts` and `images` from the data-loading cell.
texts, images = multimodal.get_image_and_text_from_file(path)

# Embed both the texts and the images with the same model.
text_embed = multimodal.get_embeddings_from_text(texts, embedding_model)
image_embed = multimodal.get_embeddings_from_images(images, embedding_model)

100%|██████████| 18/18 [00:11<00:00,  1.56it/s]
100%|██████████| 18/18 [00:17<00:00,  1.03it/s]


In [48]:
# Reuse the data path and the CLIP model created in the embeddings cell rather
# than repeating the path literal and loading the same model a second time.
params = {
    'path_to_data': path,
    'embedding_model': embedding_model,
    'precomputed_text_embeds': text_embed,
    'precomputed_image_embeds': image_embed,
}
folder = output_folder
trainer = Trainer(custom_dataset_folder = folder,  params = params)



In [49]:
# Fit the multimodal model with the parameters stored on the trainer.
output = trainer.train(trainer.params)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Martin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 16220/16220 [00:01<00:00, 11318.91it/s]
100%|██████████| 16220/16220 [00:00<00:00, 17329.08it/s]
100%|██████████| 16220/16220 [00:00<00:00, 27965.41it/s]
100%|██████████| 16220/16220 [00:06<00:00, 2557.55it/s]
100%|██████████| 16220/16220 [00:00<00:00, 18773.14it/s]
100%|██████████| 16220/16220 [00:00<00:00, 27679.18it/s]
100%|██████████| 16220/16220 [00:00<00:00, 53887.36it/s]
100%|██████████| 16220/16220 [00:01<00:00, 13471.78it/s]
100%|██████████| 16220/16220 [00:00<00:00, 22877.27it/s]
100%|██████████| 16220/16220 [00:01<00:00, 12014.83it/s]
100%|██████████| 16220/16220 [00:00<00:00, 225282.17it/s]


In [50]:
# Score the multimodal model's topics.
trainer.evaluate(output)

npmi:-0.19851895701409147
diversity:0.7777777777777778


{'npmi': -0.19851895701409147, 'diversity': 0.7777777777777778}

In [56]:
# Score the LDA topics from the custom dataset with the same metrics, for comparison.
trainer.evaluate(model_output2)

npmi:-0.035532260439734085
diversity:0.24


{'npmi': -0.035532260439734085, 'diversity': 0.24}

In [57]:
# Inspect the LDA topics (the output below is dominated by function words such as 'the' and 'of').
model_output2['topics']

[['the', 'of', 'a', 'is', 'in', 'was', 'and', 'The', 'Islands', 'as'],
 ['the', 'and', 'of', 'was', 'in', 'a', 'is', 'to', 'as', 'The'],
 ['of', 'the', 'in', 'is', 'a', 'to', 'was', 'and', 'It', 'The'],
 ['in', 'the', 'is', 'of', 'a', 'The', 'was', 'reserve', 'population', 'It'],
 ['and', 'the', 'is', 'Iowa', 'of', 'in', 'a', 'The', '92', 'Gorj'],
 ['the', 'is', 'a', 'in', 'of', 'and', 'was', 'It', 'as', 'to'],
 ['the', 'of', 'in', 'is', 'a', 'and', 'The', 'was', 'to', 'It'],
 ['the', 'and', 'of', 'in', 'a', 'was', 'is', 'to', 'by', 'as'],
 ['a', 'is', 'the', 'of', 'and', 'in', 'was', 'amateur', 'Cuban', 'light'],
 ['a',
  'is',
  'dwelling',
  'the',
  'in',
  'was',
  'of',
  'and',
  'medalist',
  'Willem']]

In [51]:
# Inspect the multimodal model's topics.
# NOTE(review): the repeated 'Scolopendra' entries are presumably the fallback
# that Trainer.train substitutes for topic words missing from the corpus - confirm.
output

{'topics': [['part',
   'township',
   'town',
   'area',
   'city',
   'united',
   'states',
   'population',
   'county',
   'census'],
  ['endemic',
   'bird',
   'plant',
   'Scolopendra',
   'found',
   'family',
   'native',
   'genus',
   'species',
   'known'],
  ['service',
   'class',
   'world',
   'states',
   'united',
   'Scolopendra',
   'ship',
   'built',
   'war',
   'navy'],
  ['population',
   'area',
   'church',
   'building',
   'south',
   'district',
   'station',
   'located',
   'city',
   'county'],
  ['division',
   'guard',
   'city',
   'force',
   'district',
   'national',
   'asteroid',
   'order',
   'municipality',
   'army'],
  ['football',
   'league',
   'national',
   'plays',
   'played',
   'basketball',
   'team',
   'professional',
   'player',
   'footballer'],
  ['festival',
   'fought',
   'Scolopendra',
   'Scolopendra',
   'saint',
   'marriage',
   'countess',
   'holy',
   'Scolopendra',
   'la'],
  ['member',
   'albums',
   'music',