In [None]:
import pandas as pd
import ast
import pickle

# Hierarchical Stochastic Block Model

This is the only notebook that we ran on Google Colab. The rest of the code was run locally in order to comply with GDPR requirements, but since fitting the hSBM is computationally very demanding, it was only feasible to run this notebook on Google Colab. 

To mitigate data protection concerns that arise when uploading and processing sensitive personal data (such as tweets and user names) on Google Colab, we took the following precautions: We only uploaded a dataset containing preprocessed, tokenized and lemmatized tweet texts without further metadata (i.e. these datasets did not contain a user handle or tweet creation date). Moreover, all user mentions were removed from the tweet texts during the preprocessing. Since these lists of lemmas do not contain any personally identifiable information, we argue that these datasets are not covered by the data protection rules laid out in the GDPR and that we can therefore safely upload and process them on Google Drive and Google Colab.

Once the hSBM had been fitted, we saved and downloaded the models. All further investigation of the results, in which we, for example, read the original tweet texts in order to validate the model results, was conducted on our local computers and in accordance with the rules specified by the GDPR and the UCPH data protection guidelines.

In [None]:
# Mount Google Drive so that files stored there can be read from this notebook
from google.colab import drive

gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

In [None]:
# Path of the lemmatized tweet CSV on the mounted Google Drive
data = '/content/gdrive/MyDrive/Colab Notebooks/de_lemma.csv'

# Read the CSV into a dataframe
df = pd.read_csv(filepath_or_buffer=data)

In [None]:
# function to turn the tokenized list into a readable format
def string_list(text):
    """Turn the string representation of a token list into an actual list.

    The lemma column is read from a CSV file, so each list of lemmas arrives
    as a string such as "['a', 'b']". The string is parsed with
    ``ast.literal_eval``, which only evaluates Python literals.

    Parameters
    ----------
    text : str or list
        String representation of a list, or a list that was already parsed.

    Returns
    -------
    The value parsed by ``ast.literal_eval`` (a list when ``text`` encodes a
    list). A list passed in is returned unchanged.

    Raises
    ------
    ValueError, SyntaxError
        If ``text`` cannot be parsed as a Python literal.
    """
    # Already parsed: happens when the cell that applies this function to the
    # dataframe column is run a second time. Passing a list to
    # ast.literal_eval would raise ValueError, so hand it back as is.
    if isinstance(text, list):
        return text

    # we transform the string representation of the list into an actual list
    return ast.literal_eval(text)

In [None]:
# apply function to all relevant columns
df['lemma_no_mention'] = df['lemma_no_mention'].apply(string_list)

# display dataframe
print(df.shape)

In [None]:
# install dependencies
!pip install -q condacolab

import condacolab
condacolab.install()

import condacolab

! conda config --add channels conda-forge
! conda config --add channels ostrokach-forge
! conda config --add channels pkgw-forge
 
! conda install gtk3 
! conda install pygobject graph-tool cairo
! conda install -c conda-forge graph-tool 
! git clone https://github.com/martingerlach/hSBM_Topicmodel.git

In [None]:
# import the packages we just installed
import graph_tool.all as gt
from hSBM_Topicmodel.sbmtm import sbmtm

# Run the HSBM model

We ran the models on the three datasets with the following specifications:

* Danish dataset: Since the Danish dataset is so small, the model is run on the full sample. The ``n_min`` parameter was set to 0.
* German dataset: We randomly subsampled 20,000 tweets and set the ``n_min`` parameter to 2.
* Polish dataset: We randomly subsampled 20,000 tweets and set the ``n_min`` parameter to 2.

In [None]:
## the sampling step is only relevant for the Polish and the German dataset

# draw a reproducible random subsample of 20,000 rows (fixed random_state)
sample = df.sample(random_state=3, n=20000)

# pull the lemma lists of the sampled rows out into a plain Python list
texts = sample.loc[:, 'lemma_no_mention'].to_list()

# sanity check: number of sampled texts
print(len(texts))

In [None]:
# instantiate the topic model class
model = sbmtm()

# fix the seed of graph-tool's random number generator --> same results on every run
gt.seed_rng(40)

# label every document with its position in `texts`, formatted as a string
doc_labels = [str(idx) for idx in range(len(texts))]

# build the graph from the texts
# for German and Polish, we specify n_min = 5; since the Danish corpus is so small we do not specify it in the Danish case
# NOTE(review): the markdown description above states n_min = 2 for the subsampled datasets,
# while this call and the saved file name use 5 - confirm which value was actually used
model.make_graph(texts, documents=doc_labels, n_min=5)

In [None]:
# fitting the model
# Operates on the graph built by model.make_graph(...) in the previous cell.
# NOTE(review): presumably the long-running step (the introduction describes
# the hSBM as very compute-heavy) - confirm expected runtime before re-running.
model.fit()

In [None]:
# save the model to disk
# NOTE(review): the file name presumably encodes the run configuration
# (German data, 20k-tweet sample, n_min = 5) - adjust it when changing the run.
filename = 'de_hsbm_sample20_nmin5.sav'

# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the bare open() call used before never closed the handle.
# Pickle files can execute arbitrary code when loaded, so only load this file
# back from a trusted location.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)