In [1]:
import numpy as np
import pandas as pd
from copy import deepcopy
import random
from bertopic import BERTopic
from umap import UMAP


In [2]:
## Global settings
import projpath
import numpy as np
import random
import os

# Alternative absolute location of the project tree (kept for reference):
#PROJPATH = "/mount/arbeitsdaten14/projekte/sfb-732/d8/share//Demographics/"
PROJPATH = "../../"

# os.path.join is used instead of string concatenation: PROJPATH already ends
# with "/", so appending "/original-data" produced a doubled separator
# ("../..//original-data").
ORG_DATA = os.path.join(PROJPATH, "original-data")
INT_DATA = os.path.join(PROJPATH, "intermediate-data")

## Settings for this run
DATASET = "pan13"  # one of "pan13" / "blog"
LANG = "en"  # can be replaced with "es" / "de" / "fr" / "it" / "nl"
SEED = 272419
CLASSIFIER = "lr"

## Make the run deterministic
np.random.seed(SEED)
random.seed(SEED)
# NOTE: CPython reads PYTHONHASHSEED at interpreter start-up, so this
# assignment cannot change string hashing in the already-running kernel; it is
# only inherited by processes launched afterwards. Code in this notebook must
# therefore not rely on set/dict-of-string iteration order being stable.
os.environ['PYTHONHASHSEED'] = str(SEED)

In [3]:
import torch
import datasets

# Locations of the cleaned train/test JSON-lines files for the selected
# dataset and language.
train_file = "%s/%s/%s/train/%s.train.clean.select.jsonlist" % (INT_DATA, DATASET, LANG, LANG)
test_file = "%s/%s/%s/test/%s.test.clean.jsonlist" % (INT_DATA, DATASET, LANG, LANG)

split_files = {"train": train_file, "test": test_file}
dataset = datasets.load_dataset("json", data_files=split_files)

# Concatenate the texts of the same author: every row's "texts" list is joined
# with newlines into a single "text" string.
for split_name in list(dataset):
    split = dataset[split_name]
    joined_texts = ["\n".join(parts) for parts in split["texts"]]
    dataset[split_name] = split.add_column(name="text", column=joined_texts)

# Drop the list column that is no longer needed, then expose the gender
# column under the name "label".
dataset = dataset.remove_columns("texts").rename_column("gender", "label")

dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'age', 'id', 'text'],
        num_rows: 75900
    })
    test: Dataset({
        features: ['label', 'age', 'id', 'text'],
        num_rows: 25359
    })
})

In [4]:
# Map every distinct training label to an integer id and back.
# The labels are sorted before numbering: iterating a bare set of strings
# follows hash order, which can differ from one interpreter run to the next,
# so the ids would otherwise not be reproducible. key=str keeps the sort
# well-defined even if a label is not a string.
label2id = {
    label: idx
    for idx, label in enumerate(sorted(set(dataset["train"]["label"]), key=str))
}
id2label = {idx: label for label, idx in label2id.items()}


def label_to_id(row):
    """Replace ``row["label"]`` with its integer id from ``label2id``.

    Intended as the callback for ``Dataset.map`` (see the commented-out calls
    below). The row is modified in place and returned.

    Raises:
        KeyError: if the row's label is not a key of ``label2id``.
    """
    row["label"] = label2id[row["label"]]
    return row


# The id conversion is currently disabled, so both splits keep their original
# label values.
train_data = dataset["train"]  # .map(label_to_id)
test_data = dataset["test"]  # .map(label_to_id)

In [5]:
# Build a pandas DataFrame from the training split; the cells below read its
# 'text', 'label', 'age' and 'id' columns.
td = pd.DataFrame(train_data)

In [6]:
# Preview the first rows of the training frame.
td.head()

Unnamed: 0,label,age,id,text
0,male,10s,b971b9f2ffb0b03f73db08b29af41bf5,"Of course, you may be asking yourself this iss..."
1,male,10s,55800a6d716981d91db59604e420cd59,Do you run a small business that involves the ...
2,male,10s,3ca677690dd114dcb53d62d6daeada4c,For the past several months news reports happe...
3,male,10s,8aaccf631fe2ffcf2df291fa4b0ee3f8,The ordinary human being overpays by at least ...
4,male,10s,626ee40d833706488fc2ea859857c718,It truly is undeniable that rest room is virtu...


In [7]:
from sentence_transformers import SentenceTransformer

# Encode all training texts once, up front, so the resulting vectors can be
# passed to the topic model instead of being recomputed during fitting.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

embeddings = embedding_model.encode(
    td["text"].tolist(),
    show_progress_bar=True,
)


Batches:   0%|          | 0/2372 [00:00<?, ?it/s]

In [8]:
from umap import UMAP

# Dimensionality-reduction component handed to BERTopic (5 output components,
# cosine metric).
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    # NOTE(review): fixed at 42, independently of the notebook-level SEED.
    random_state=42,
)


In [9]:
from hdbscan import HDBSCAN

# Clustering component handed to BERTopic; clusters need at least 50 members.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer handed to BERTopic: English stop words removed,
# terms must occur in at least 2 documents, unigrams and bigrams are counted.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)


In [11]:
from bertopic.representation import KeyBERTInspired
# Additional KeyBERT-inspired topic representation, registered under the
# "KeyBERT" key so it shows up next to the default representation.
keybert_model = KeyBERTInspired()
representation_model = {"KeyBERT": keybert_model}


In [12]:
from bertopic import BERTopic

# Assemble the BERTopic pipeline from the components configured in the
# preceding cells.
topic_model = BERTopic(
    # Pipeline components
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
    # Hyperparameters
    top_n_words=10,
    nr_topics=100,
    calculate_probabilities=True,
    verbose=True,
)

In [13]:
# Fit the topic model on the training texts, passing in the pre-computed
# embeddings.
topics, probs = topic_model.fit_transform(
    documents=td["text"].tolist(),
    embeddings=embeddings,
)


2024-03-14 22:21:07,752 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-14 22:22:22,037 - BERTopic - Dimensionality - Completed ✓
2024-03-14 22:22:22,043 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before

In [15]:
# get_topic_info() returns a pandas DataFrame with one row per topic (see the
# table displayed by the next cell), not a list of dictionaries.
topic_info = topic_model.get_topic_info()

In [16]:
# Display the per-topic overview table.
topic_info

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,27682,-1_link_time_just_make,"[link, time, just, make, like, people, need, u...","[marketing, business, products, company, servi...",[There is a correct way along with a incorrect...
1,0,4171,0_web_site_marketing_website,"[web, site, marketing, website, business, inte...","[internet marketing, seo, web hosting, adverti...",[As an alternative to outsourcing your online ...
2,1,2780,1_hi_im_friends_love,"[hi, im, friends, love, chat, hello, com, pan,...","[hi, hello, hey, dear, wanna chat, meet, chat,...","[HI WHERE R U ALL????? ;, hi there everyone, h..."
3,2,2543,2_shoes_handbags_bags_boots,"[shoes, handbags, bags, boots, vuitton, nike, ...","[louis vuitton, handbags, handbag, purses, pur...",[Louis Vuitton handbags are the most used bran...
4,3,2112,3_love_god_life_jesus,"[love, god, life, jesus, heart, christ, lord, ...","[salvation, faith, bible, prayer, spirit, sin,...",[Contemplative Spirituality: Dancing With Demo...
...,...,...,...,...,...,...
95,94,56,94_bird_parrot_birds_parrots,"[bird, parrot, birds, parrots, seeds, feeders,...","[parrot diet, bird food, feeding birds, bird s...",[This actually is so taking into consideration...
96,95,55,95_granite_countertops_marble_countertop,"[granite, countertops, marble, countertop, sto...","[granite countertops, countertops granite, gra...",[To guarantee that your granite counter tops a...
97,96,55,96_que_en_la_por,"[que, en, la, por, el, es, para, se, estoy, tu]","[cuando, en el, que se, como, es, yahoo es, en...",[clave zero está durante Marketing ni en Perso...
98,97,54,97_funeral_memorial_cremation_casket,"[funeral, memorial, cremation, casket, decease...","[funeral plans, funeral plan, funeral service,...",[Funeral plans are a great way to prepare for ...


In [23]:
# Wrap the per-document topic assignments in a one-column DataFrame.
# Only the first column is kept (as an independent copy) so this cell can be
# re-run safely: once the later cells have renamed the column and added `id`,
# `topics` already has two columns, and without the slice the following
# `topics.columns = ["topics"]` assignment would fail with a length mismatch.
topics = pd.DataFrame(topics).iloc[:, :1].copy()

In [24]:
# Name the frame's single column "topics" (the assignment requires exactly one column).
topics.columns = ["topics"]

In [25]:
# Attach the `id` column from `td`; pandas aligns the two on their row index.
topics['id'] = td['id']

In [216]:
# Tabular view of the probabilities returned by fit_transform
# (presumably one row per document and one column per topic - TODO confirm).
probs_df = pd.DataFrame(probs)
# Row-wise maximum, i.e. the highest probability found in each row. The Series
# is assigned directly; wrapping it in a one-column DataFrame first was
# redundant and relied on pandas' frame-to-single-column assignment path.
probs_df['main percentage'] = probs_df.max(axis=1)

In [19]:
# get_topic_info() already returns a pandas DataFrame (one row per topic), not
# a list of dictionaries.
topic_info = topic_model.get_topic_info()

# Wrap that table in a new DataFrame object under the name used for the CSV export.
df_topic = pd.DataFrame(topic_info)


In [29]:
# Save the topic overview table as CSV in the working directory
# (with to_csv's defaults the row index is written as the first column).
df_topic.to_csv('topic_en_age_balance.csv')

In [14]:
# Topic representations per gender (the 'label' column).
# NOTE(review): later cells assign other breakdowns to this same variable name.
topics_per_class = topic_model.topics_per_class(
    td['text'].to_list(),
    classes=td['label'].to_list(),
)

2it [01:17, 38.96s/it]


In [14]:
# Topic representations per age group (the 'age' column).
# NOTE(review): this reuses the variable name also used for the other breakdowns.
topics_per_class = topic_model.topics_per_class(
    td['text'].to_list(),
    classes=td['age'].to_list(),
)

3it [01:20, 26.91s/it]


In [16]:
# Combined class key of the form "<label>_<age>", e.g. "male_10s".
td['gender_age'] = td['label'].str.cat(td['age'], sep='_')

In [18]:
# Topic representations per combined gender/age class ('gender_age' column).
topics_per_class = topic_model.topics_per_class(
    td['text'].to_list(),
    classes=td['gender_age'].to_list(),
)

6it [01:32, 15.48s/it]


In [16]:

# Compute the per-gender breakdown in this cell instead of reading the shared
# `topics_per_class` variable: that name is reassigned by the age and
# gender-age cells, so on a top-to-bottom run it no longer holds the gender
# result by the time this figure is drawn. The extra call costs a repeat of
# the topics_per_class computation but makes the figure independent of cell
# execution order.
topics_per_gender = topic_model.topics_per_class(
    td['text'].to_list(),
    classes=td['label'].to_list(),
)

title = "Top Frequent Topics by Gender (PAN13-EN)"
tv = topic_model.visualize_topics_per_class(topics_per_gender, title=title)
# Save the interactive figure as a standalone HTML file.
tv.write_html("en_topic_per_class.html")

In [15]:

# Compute the per-age breakdown in this cell instead of reading the shared
# `topics_per_class` variable: that name is reassigned by the gender-age cell,
# so on a top-to-bottom run it no longer holds the age result by the time this
# figure is drawn. The extra call costs a repeat of the topics_per_class
# computation but makes the figure independent of cell execution order.
topics_per_age = topic_model.topics_per_class(
    td['text'].to_list(),
    classes=td['age'].to_list(),
)

title = "Top Frequent Topics by Age (PAN13-EN)"
tv = topic_model.visualize_topics_per_class(topics_per_age, title=title)
# Save the interactive figure as a standalone HTML file.
tv.write_html("age-en_topic_per_class.html")

In [19]:

# Plot the breakdown currently held in `topics_per_class` under the
# gender-age title and save it as a standalone HTML file.
title = "Top Frequent Topics by Gender-Age (PAN13-EN)"
tv = topic_model.visualize_topics_per_class(
    topics_per_class,
    title=title,
)
tv.write_html("g_a-en_topic_per_class.html")