<a href="https://colab.research.google.com/github/NicolaGabriele/MastodonContentCompliance/blob/main/topic_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!pip install bertopic

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [34]:
import os
from bertopic import BERTopic
import json
from tqdm import tqdm

# Directory that holds one `<instance>.json` results file per instance.
RESULTS_HOME = '/kaggle/input/instance-json/results/results'

# Saved BERTopic model; process_batch() uses it to label posts via transform().
topic_model = BERTopic.load("MaartenGr/BERTopic_Wikipedia")

# Fresh English BERTopic model; process_batch2() fits it via fit_transform().
model = BERTopic(language="english")

In [35]:
instances_file_path = '/kaggle/input/instance-json/instances.jsonl'

# Upper bound on how many instances are kept for the analysis cells below.
MAX_INSTANCES = 100

# Parse the JSON Lines file: one instance description per line.
instances = []
with open(instances_file_path, 'r') as file:
    for line in file:
        # A blank line is not valid JSON; skip it instead of crashing in json.loads.
        if not line.strip():
            continue
        instances.append(json.loads(line))

# No instance is excluded at the moment. To drop one (e.g. "mastodon.social"),
# filter on inst["instance"] here.
filtered_instances = list(instances)

# Keep, in file order, the first MAX_INSTANCES instances that have a results
# file in RESULTS_HOME. Entries are file names of the form "<instance>.json".
valid_instances = []
for instance in filtered_instances:
    results_file_name = f"{instance['instance']}.json"
    if os.path.isfile(os.path.join(RESULTS_HOME, results_file_name)):
        valid_instances.append(results_file_name)
        if len(valid_instances) == MAX_INSTANCES:
            break

print(valid_instances)

['mastodon.social.json', 'good.news.json', 'pawoo.net.json', 'test.pawoo.org.json', 'baraag.net.json', 'mstdn.social.json', 'pravda.me.json', 'mastodon.online.json', 'mastodon.world.json', 'mas.to.json', 'kids.0px.io.json', 'techhub.social.json', 'universeodon.com.json', 'mastodonapp.uk.json', 'masto.ai.json', 'mastodon.uno.json', 'mastodon.sdf.org.json', 'infosec.exchange.json', 'c.im.json', 'fosstodon.org.json', 'brighteon.social.json', 'dev.brighteon.social.json', 'hachyderm.io.json', 'mastodon.top.json', 'mstdn.party.json', 'social.vivaldi.net.json', 'mastodon.nl.json', 'ohai.social.json', 'fedibird.com.json', 'sfba.social.json', 'mamot.fr.json', 'mastodon.art.json', 'mindly.social.json', 'mast.lat.json', 'tkz.one.json', 'mstdn.ca.json', 'mastodon.gamedev.place.json', 'toot.community.json', 'piaille.fr.json', 'mastodon.scot.json', 'mastodon.xyz.json', 'aethy.com.json', 'mastodon.au.json', 'ravenation.club.json', 'ioc.exchange.json', 'planet.moe.json', 'det.social.json', 'aus.social

In [36]:
def process_batch(posts_batch):
    """Return one topic label per post in `posts_batch`.

    The posts are passed to `topic_model.transform`; every topic id it returns
    is looked up in `topic_model.topic_labels_`.
    """
    assigned_ids, _ = topic_model.transform(posts_batch)
    labels = []
    for topic_id in assigned_ids:
        labels.append(topic_model.topic_labels_[topic_id])
    return labels

def process_batch2(posts_batch):
    """Fit the global `model` on `posts_batch` through `fit_transform`.

    The topic assignments and probabilities returned by the call are discarded;
    this function itself returns None.
    """
    _assigned_topics, _probabilities = model.fit_transform(posts_batch)

In [37]:
def process_topics():
    """Label each instance's English posts and measure tag/topic alignment.

    For every file name in `valid_instances`:

    1. load the instance JSON from `RESULTS_HOME`;
    2. collect the text and the tag names of the records whose language is 'en';
    3. get one topic label per post from `process_batch`, working in batches and
       caching the labels in `/kaggle/working/fullgas_topics_<instance_name>.txt`;
    4. count the tags that occur as a substring of at least one topic label;
    5. write the counts to `/kaggle/working/count_results<instance_name>.txt`
       and print them.

    When an instance has no tags, the total is reported as 1 so that the ratio
    is 0 instead of raising a division by zero.
    """
    for instance_name in tqdm(valid_instances, desc="Processing Instance"):
        # Close the file as soon as it is parsed instead of leaking the handle.
        with open(os.path.join(RESULTS_HOME, instance_name), 'r') as instance_file:
            instance = json.load(instance_file)

        english_records = [
            post
            for post in tqdm(instance['records'], desc='Pull post')
            if post['language'] == 'en'
        ]
        posts = [post['text'] for post in english_records]
        tags = [
            tag['name']
            for post in tqdm(english_records, desc='Processing tags')
            for tag in post['tags']
        ]

        batch_size = 10000
        batches = [posts[i:i + batch_size] for i in range(0, len(posts), batch_size)]

        topic_path = os.path.join('/kaggle/working', f'fullgas_topics_{instance_name}.txt')

        all_topics = None
        if os.path.exists(topic_path):
            # If the file exists, load the topics from it.
            with open(topic_path, 'r') as f:
                cached_topics = [line.strip() for line in f]
            # The cache holds one label per post. A different line count means
            # the file was left behind by an interrupted run (or belongs to
            # other data), so it must not be trusted.
            if len(cached_topics) == len(posts):
                all_topics = cached_topics
                print(f'Topics loaded from {topic_path}')
            else:
                print(f'Ignoring incomplete topic cache {topic_path}')

        if all_topics is None:
            # No usable cache: generate the topics and save them to the file.
            # Mode 'w' truncates any stale partial file instead of appending to it.
            all_topics = []
            with open(topic_path, 'w') as f:
                for batch in tqdm(batches, desc='Processing batches'):
                    topics = process_batch(batch)
                    all_topics.extend(topics)
                    f.writelines(f'{topic}\n' for topic in topics)

        aligned_count = 0
        if tags:
            total_count = len(tags)
            # Many posts share a label and many tags repeat: scan each distinct
            # label once per distinct tag. The resulting counts are the same as
            # checking every tag against every post's label.
            unique_topics = set(all_topics)
            tag_is_aligned = {}
            for tag in tqdm(tags, desc='Checking tag alignment'):
                if tag not in tag_is_aligned:
                    tag_is_aligned[tag] = any(tag in topic for topic in unique_topics)
                if tag_is_aligned[tag]:
                    aligned_count += 1
        else:
            # Placeholder denominator so the ratio below is 0 rather than an error.
            total_count = 1

        ratio = aligned_count / total_count

        output_path = os.path.join('/kaggle/working', f'count_results{instance_name}.txt')
        with open(output_path, 'w') as file:
            file.write(f'Aligned Count: {aligned_count}\n')
            file.write(f'Total Count: {total_count}\n')
            file.write(f'Alignment / Total : {ratio:.4f}\n')

        print(aligned_count)
        print(total_count)
        print(ratio)

In [38]:
def fit_topic():
    """Run `process_batch2` over the English posts of every valid instance.

    Each file named in `valid_instances` is loaded from `RESULTS_HOME`, the text
    of its records whose language is 'en' is split into batches of 10000 posts,
    and every batch is handed to `process_batch2`.

    NOTE(review): `process_batch2` calls `model.fit_transform` once per batch.
    Presumably each call refits `model` from scratch, so only the last batch
    would shape the final model -- confirm this is intended.
    """
    for instance_name in tqdm(valid_instances, desc="Processing Instance"):
        # Close the file as soon as it is parsed instead of leaking the handle.
        with open(os.path.join(RESULTS_HOME, instance_name), 'r') as instance_file:
            instance = json.load(instance_file)
        posts = [
            post['text']
            for post in tqdm(instance['records'], desc='Pull post')
            if post['language'] == 'en'
        ]

        batch_size = 10000
        batches = [posts[i:i + batch_size] for i in range(0, len(posts), batch_size)]

        for batch in tqdm(batches, desc='Processing batches'):
            process_batch2(batch)

In [None]:
# Run the fitting pass over every instance listed in `valid_instances`.
fit_topic()

Processing Instance:   0%|          | 0/100 [00:00<?, ?it/s]
Pull post:   0%|          | 0/400000 [00:00<?, ?it/s][A
Pull post:  25%|██▍       | 99804/400000 [00:00<00:00, 997950.54it/s][A
Pull post: 100%|██████████| 400000/400000 [00:00<00:00, 1346213.77it/s][A

Processing batches:   0%|          | 0/29 [00:00<?, ?it/s][A2024-07-04 11:37:33,691 - BERTopic - Embedding - Transforming documents to embeddings.
2024-07-04 11:37:41,395 - BERTopic - Embedding - Completed ✓
2024-07-04 11:37:41,396 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-07-04 11:38:08,117 - BERTopic - Dimensionality - Completed ✓
2024-07-04 11:38:08,118 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-07-04 11:38:08,535 - BERTopic - Cluster - Completed ✓

Processing batches:   3%|▎         | 1/29 [00:35<16:42, 35.80s/it][A2024-07-04 11:38:09,493 - BERTopic - Embedding - Transforming documents to embeddings.
2024-07-04 11:38:15,632 - BERTopic - Embedding - Compl

In [24]:
# Show the topic visualisation of `model`.
# NOTE(review): presumably requires `model` to have been fitted first (fit_topic()
# does that through process_batch2) -- run that cell before this one.
model.visualize_topics()

In [None]:
def tot_post():
    """Print how many English posts belong to instances that have tagged posts.

    Every file in `RESULTS_HOME` is loaded. The English posts of an instance
    (records whose language is 'en') are added to the total only when at least
    one of those posts carries a tag.
    """
    total_posts = 0
    for instance_name in tqdm(os.listdir(RESULTS_HOME)):
        # Close the file as soon as it is parsed instead of leaking the handle.
        with open(os.path.join(RESULTS_HOME, instance_name), 'r') as instance_file:
            instance = json.load(instance_file)
        english_records = [post for post in instance['records'] if post['language'] == 'en']
        # Only the existence of a tag matters here, so stop at the first one found.
        if any(post['tags'] for post in english_records):
            total_posts += len(english_records)

    print(total_posts)
# Total number of posts to analyse: 18387780

In [None]:
#!rm -r /kaggle/working
#!zip -r scores.zip /kaggle/working