## Loading data

In [65]:
import re
import pandas as pd

# Prepare data
path_pkl = os.path.join("data", "test_df.pkl")
data = pd.read_pickle(path_pkl) 

print(len(data))
data.dropna(inplace=True)
print(len(data))

#data1 = data[:5000] 
#data2 = data[416537:]

#print(len(data1))
#print(len(data2))

#data = pd.concat([data1, data2])

print(len(data))

data = data[data["language"] == "ru"]

print(len(data))

data = data[data["confidence"] > 0.9]

print(len(data))

# make text column lower case
data.document = data['document'].map(lambda document: document.lower() if isinstance(document,str) else document)

# make lists
dates = data.date.to_list()
docs = data.document.to_list()
subreddits = data.sub_reddit.to_list()
language = data.language.to_list()
type = data.type.to_list()

421537
421537
421537
1713
1713


In [66]:
data = data[data["language"] == "ru"]

len(data)


1713

In [67]:
data

Unnamed: 0,document,sub_reddit,date,type,language,confidence
12,песня посвящается боевым гусям украины,ukraina,2022-03-23,submission,ru,0.992579
18,кто что знает или слышал про эту контору?,ukraina,2022-03-23,submission,ru,0.995329
20,"здравствуй редит, мне 12 лет я живу в украине,...",ukraina,2022-03-23,submission,ru,0.994000
33,"#9: восток, центр, запад. хроники вторжения ро...",ukraina,2022-03-23,submission,ru,0.990433
45,#8: харьков. хроники вторжения россии в украин...,ukraina,2022-03-23,submission,ru,0.996689
...,...,...,...,...,...,...
449235,die russian soldiers,ukraina,2022-03-07,comment,ru,0.665647
449236,die russian warships!,ukraina,2022-03-07,comment,ru,0.457248
449238,die russian tanks!!!!,ukraina,2022-03-07,comment,ru,0.560113
449264,""" вежливые люди""",ukraina,2022-03-07,comment,ru,0.995561


### Checking if it works

In [68]:
pd.DataFrame.head(data)

data[:10]
data[9990:]

Unnamed: 0,document,sub_reddit,date,type,language,confidence


In [69]:
print(dates[-1])
print(docs[-1])

2022-03-07
so do your part and you can text a random russian right here


you can copy this text and send it to a random russian

уважаемые россияне, ваши сми подвергаются цензуре. кремль лжет. тысячи ваших солдат и украинских братьев гибнут на украине. узнайте правду в свободном интернете и в приложении telegram. время свергнуть диктатора путина!

which translates to:

dear russians, your media is being censored. the kremlin is lying. thousands of your soldiers and ukrainian brothers are dying in ukraine. find out the truth on the free web and on the telegram app. time to overthrow dictator putin!


In [70]:
#If this outputs anything, there is a problem. Something that should be a string is not...
for i in docs: 
    test = isinstance(i, str)
    if test == False: 
        print (test)

## Training embeddings

In [71]:
import random
random.seed(29)

#Husk at installe sentence_transformers via terminalen
from sentence_transformers import SentenceTransformer

# Prepare embeddings
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=False)


## Fitting the model

In [72]:
#Kør kun hvis chunken nedenfor ikke virker
from umap import UMAP
umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0, metric='cosine', random_state = 29)

In [73]:
#Skal virke for at få GPU'en til at køre hurtigere, men der er problemer med at loade cuml pakken

#from cuml.manifold import UMAP
#from cuml.cluster import HDBSCAN

# Create instances of GPU-accelerated UMAP and HDBSCAN
#umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0, metric='cosine', random_state = 29)
#hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True)

In [74]:


from bertopic import BERTopic

#Kør hvis GPU opsætning IKKE virker
topic_model = BERTopic(language = "multilingual", verbose = True, calculate_probabilities = False, umap_model=umap_model)
#Kør hvis GPU opsætning virker
#topic_model = BERTopic(language = "multilingual", verbose = True, calculate_probabilities = False, umap_model=umap_model, hdbscan_model=hdbscan_model)


topics, probs = topic_model.fit_transform(docs, embeddings)


2022-12-16 12:52:17,615 - BERTopic - Reduced dimensionality
2022-12-16 12:52:17,666 - BERTopic - Clustered reduced embeddings


### Removing stopwords 

In [75]:
topic_model.get_topic_info()[0:11]

Unnamed: 0,Topic,Count,Name
0,-1,160,-1_it_the_not_of
1,0,1015,0_не_что_на_слава
2,1,180,1_площадь_не_что_на
3,2,82,2_it_not_that_lol
4,3,80,3_fuck_russia_putin_russian
5,4,32,4_he_she_hot_so
6,5,29,5_stabbot_twitter_stabbot_crop_dustin1776
7,6,26,6_missile_peaceful_russian_thermobaric
8,7,24,7_is_the_media_tweet
9,8,20,8_the_of_russian_9861


In [76]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")
#vectorizer_model = CountVectorizer(stop_words="english")

topic_model.update_topics(docs, vectorizer_model=vectorizer_model)

### Investigating the model

In [77]:
topic_model.get_topic_info()[0:11]

Unnamed: 0,Topic,Count,Name
0,-1,160,-1_don_hell_russian_context
1,0,1015,0_не_что_на_это
2,1,180,1_площадь_не_что_на
3,2,82,2_lol_don_good_news
4,3,80,3_fuck_russia_putin_fuck russia
5,4,32,4_hot_hot babe_babe_smiling
6,5,29,5_stabbot_stabbot stabbot_stabbot stabbot stab...
7,6,26,6_missile_russian_peaceful_kyiv podil
8,7,24,7_media_tweet_russian_soldier
9,8,20,8_russian_9861_рф_federation


### Saving and loading models

In [78]:
#topic_model.save("models/model_medium")

In [79]:
#from bertopic import BERTopic

#topic_model = BERTopic.load("models/model_medium")

## Visualizations

In [80]:
#Tager lang tid at køre, når vi har så mange små topics
#topic_model.visualize_topics()

### Topics over time

### Function for visualizing topics over time

In [81]:
import pandas as pd
from typing import List
import plotly.graph_objects as go
from sklearn.preprocessing import normalize


def visualize_topics_over_time(topic_model,
                               topics_over_time: pd.DataFrame,
                               top_n_topics: int = None,
                               topics: List[int] = None,
                               normalize_frequency: bool = False,
                               custom_labels: bool = False,
                               width: int = 1250,
                               height: int = 450) -> go.Figure:
    """ Visualize topics over time
    Arguments:
        topic_model: A fitted BERTopic instance.
        topics_over_time: The topics you would like to be visualized with the
                          corresponding topic representation
        top_n_topics: To visualize the most frequent topics instead of all
        topics: Select which topics you would like to be visualized
        normalize_frequency: Whether to normalize each topic's frequency individually
        custom_labels: Whether to use custom topic labels that were defined using 
                       `topic_model.set_topic_labels`.
        width: The width of the figure.
        height: The height of the figure.
    Returns:
        A plotly.graph_objects.Figure including all traces
    Examples:
    To visualize the topics over time, simply run:
    ```python
    topics_over_time = topic_model.topics_over_time(docs, timestamps)
    topic_model.visualize_topics_over_time(topics_over_time)
    ```
    Or if you want to save the resulting figure:
    ```python
    fig = topic_model.visualize_topics_over_time(topics_over_time)
    fig.write_html("path/to/file.html")
    ```
    <iframe src="../../getting_started/visualization/trump.html"
    style="width:1000px; height: 680px; border: 0px;""></iframe>
    """
    colors = ['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000']

    # Select topics based on top_n and topics args
    freq_df = topic_model.get_topic_freq()
    freq_df = freq_df.loc[freq_df.Topic != -1, :]
    if topics is not None:
        selected_topics = list(topics)
    elif top_n_topics is not None:
        selected_topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        selected_topics = sorted(freq_df.Topic.to_list())

    # Prepare data
    if topic_model.custom_labels_ is not None and custom_labels:
        topic_names = {key: topic_model.custom_labels_[key + topic_model._outliers] for key, _ in topic_model.topic_labels_.items()}
    else:
        topic_names = {key: value[:40] + "..." if len(value) > 40 else value
                       for key, value in topic_model.topic_labels_.items()}
    topics_over_time["Name"] = topics_over_time.Topic.map(topic_names)
    data = topics_over_time.loc[topics_over_time.Topic.isin(selected_topics), :].sort_values(["Topic", "Timestamp"])

    # Add traces
    fig = go.Figure()
    for index, topic in enumerate(data.Topic.unique()):
        trace_data = data.loc[data.Topic == topic, :]
        topic_name = trace_data.Name.values[0]
        words = trace_data.Words.values
        if normalize_frequency:
            y = normalize(trace_data.Frequency.values.reshape(1, -1))[0]
        else:
            y = trace_data.Frequency
        fig.add_trace(go.Scatter(x=trace_data.Timestamp, y=y,
                                 mode='lines',
                                 marker_color=colors[index % 20],
                                 hoverinfo="text",
                                 name=topic_name,
                                 hovertext=[f'<b>Topic {topic}</b><br>Words: {word}' for word in words]))

    # Styling of the visualization
    fig.update_xaxes(showgrid=True)
    fig.update_yaxes(showgrid=True)
    fig.update_layout(
        yaxis_title="Normalized Frequency" if normalize_frequency else "Frequency",
        title={
            'text': "<b>Topics over Time",
            'y': .95,
            'x': 0.40,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        template="simple_white",
        width=width,
        height=height,
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
        legend=dict(
            title="<b>Global Topic Representation",
        )
    )
    return fig

In [82]:
# Topics over time
topics_over_time = topic_model.topics_over_time(docs, dates, nr_bins=20)
#topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

visualize_topics_over_time(topic_model, topics_over_time, top_n_topics=20)




# Visualize topics over time with the updated colors
#visualize_topics_over_time(model, topics_over_time)

5it [00:00,  6.76it/s]


## Topics pr class

In [83]:
# Topics per class
#topics_per_class = topic_model.topics_per_class(docs, classes = subreddits)

language = data.language.to_list()
type = data.type.to_list()

topics_per_class = topic_model.topics_per_class(docs, classes = language)




1it [00:00,  7.99it/s]


In [84]:
topic_model.visualize_topics_per_class(topics_per_class, top_n_topics=10)
