# NST Preprocessing Experiment (NPE):
## Top2Vec model from lem+TFIDF dataset

Generating a Topic-to-vector (Top2Vec) model from the lemmatized + TF-IDF-cleaned dataset.

In [1]:
# Imports
import random
import os
import multiprocessing as mp
from top2vec import Top2Vec

from src.utils.file_management import (
    ROOT_PATH,
    load_subtitles,
    write_topics_file)

from src.utils.tables_and_plots import (
    display_n_wordclouds,
    create_wordcloud
)

NST_SAMPLE_SIZE = 1000
EMBEDDING_MODEL = "distiluse"
PIPELINE = "lem_tfidf"

In [2]:
# Load preprocessed CSV file
folder_path_data = r"nst_preprocessing_experiment/preprocessed_data"
file_name_data = f"npe_{NST_SAMPLE_SIZE}_{PIPELINE}"

subtitled_programs = load_subtitles(folder_path_data, file_name_data)

In [3]:
example_program_id = random.choice(list(subtitled_programs.keys()))
example_program_slice = 5
subtitled_programs[example_program_id][:example_program_slice]

['hjertelig', 'velkommen', 'sommeråpent', 'vikingtid', 'kåre']

In [4]:
# Concatenating tokens to long strings/documents
for (program_id, subtitles) in subtitled_programs.items():
    subtitled_programs[program_id] = " ".join(subtitles)

In [5]:
# Creating/loading model from dataset

folder_path_model = r"nst_preprocessing_experiment/models/top2vec"
file_name_model = f"npe_{NST_SAMPLE_SIZE}_{PIPELINE}_top2vec_{EMBEDDING_MODEL}"
file_path = os.path.join(ROOT_PATH, folder_path_model, PIPELINE, file_name_model)

top2vec = None

if os.path.exists(file_path):
    top2vec = Top2Vec.load(file_path)
else:
    if EMBEDDING_MODEL == "distiluse":
        embedding = "distiluse-base-multilingual-cased"
    top2vec = Top2Vec(documents=list(subtitled_programs.values()), embedding_model=embedding, min_count=5, speed="fast-learn", workers=mp.cpu_count() - 2)
    top2vec.save(file_path)

In [6]:
num_topics = top2vec.get_num_topics()
num_topics

7

In [7]:
topic_sizes, _ = top2vec.get_topic_sizes()
topic_sizes

array([328, 326, 162,  62,  43,  40,  38], dtype=int64)

In [8]:
topic_words, word_scores, topic_nums = top2vec.get_topics(num_topics)

In [9]:
topic_word_scores = [dict(zip(topic_words[i][:10], word_scores[i])) for i in topic_nums]

In [10]:
for i in range(5):
    print("\n", i)
    print(topic_word_scores[i])
    


 0
{'oooooh': 0.2431274, 'hahaha': 0.22307909, 'haha': 0.22011882, 'hahahe': 0.2180962, 'ooh': 0.21294086, 'ooo': 0.21280807, 'oh': 0.2107629, 'uff': 0.20840901, 'mmm': 0.20784535, 'aha': 0.20371282}

 1
{'skattepolitikk': 0.15280613, 'fuck': 0.14839816, 'fylkespolitiker': 0.1478839, 'politireforme': 0.14088814, 'oooooh': 0.14087343, 'storpolitikk': 0.13537058, 'politikk': 0.13256164, 'politisk': 0.13180478, 'uff': 0.13116929, 'hahaha': 0.12797157}

 2
{'flekkefjord': 0.17402978, 'hjeltefjorden': 0.17355998, 'oooooh': 0.17256936, 'nordfjord': 0.16683225, 'takke': 0.16014099, 'snø': 0.15832841, 'fjellvegg': 0.15705876, 'oslofjorden': 0.15450689, 'ooo': 0.15227655, 'takknemlig': 0.15129536}

 3
{'fotballag': 0.19771329, 'gullmedalje': 0.19519584, 'fotballkamp': 0.1941242, 'fotball': 0.18953875, 'fotballe': 0.1883981, 'fotballvm': 0.17854145, 'verdenscup': 0.17768171, 'sølvmedalje': 0.17474395, 'fotballbane': 0.17177646, 'bronsemedalje': 0.17167394}

 4
{'russernes': 0.32719892, 'russisk

In [11]:
display_n_wordclouds(topic_word_scores, "Top2Vec: " + PIPELINE, num_topics, dpi=200, width=1600, height=1600)

TypeError: display_n_wordclouds() got an unexpected keyword argument 'width'

In [12]:
folder_path_word_cloud = f"nst_preprocessing_experiment/results/word_clouds/top2vec/{PIPELINE}"

for i in range(num_topics):
    topic_wordcloud = create_wordcloud(topic_word_scores[i], width=1600, height=1600)
    topic_wordcloud.to_file(os.path.join(ROOT_PATH, folder_path_word_cloud, str(file_name_model + f"_{i}.png")))

In [14]:
# Save topic words of model to file

topic_words_numbered = [(i, list(topic_words[i])) for i in range(num_topics)]

folder_path_topics = r"nst_preprocessing_experiment/results/topics/"
file_name_topics = file_name_data

write_topics_file(folder_path_topics, file_name_topics, topic_words_numbered, model="top2vec", top2vec_embedding=EMBEDDING_MODEL)