#### Imports

In [1]:
import json
import nltk
from nltk.collocations import *
import datetime as dt
import locale
import spacy
from tqdm import tqdm
import pprint
import pandas as pd
from ast import literal_eval
import os
import gensim
import gensim.corpora as corpora
from pprint import pprint
from tqdm import tqdm 


In [2]:
# Download the large German spaCy pipeline (de_core_news_lg); it is loaded
# further down in this notebook via spacy.load("de_core_news_lg", ...).
!python -m spacy download de_core_news_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de-core-news-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.5.0/de_core_news_lg-3.5.0-py3-none-any.whl (567.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.8/567.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-lg
Successfully installed de-core-news-lg-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_lg')


In [3]:
#NOTE: default version doesntwork in colab
!pip install pyLDAvis==2.1.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyLDAvis==2.1.2
  Downloading pyLDAvis-2.1.2.tar.gz (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting funcy
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (setup.py) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97736 sha256=066686b6410d4e4f07a97cc7407c91db9bbaa45f65133b639647ac9c6d345bb8
  Stored in directory: /root/.cache/pip/wheels/d9/93/d6/16c95da19c32f037fd75135ea152d0df37254c25cd1a8b4b6c
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-2.1.2


#### Read Data

In [4]:
# connect with google drive
# Mounts Google Drive at /content/drive so the data files become reachable
# from the notebook; google.colab is only available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
#change cwd
%cd drive/MyDrive/Work/Frontline/data
#%cd /content/drive/MyDrive/data/

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


In [6]:
# paths
# Directory, relative to the working directory set by the %cd cell, whose
# .csv files are read and concatenated in the loading cell below.
FILTERED_PATH="filtered_4_26"

In [10]:
# Load every CSV file found in FILTERED_PATH and combine them into one frame.
dfs = []

# os.listdir returns entries in arbitrary order; sort the names so the row
# order of the combined frame (and any positional slice taken from it later)
# is the same on every run.
csv_names = sorted(name for name in os.listdir(FILTERED_PATH) if name.endswith(".csv"))

# loop through files
for filename in tqdm(csv_names):
    file_path = os.path.join(FILTERED_PATH, filename)
    # The first CSV column is used as the index. The "text" column is stored
    # as a Python literal and parsed back with literal_eval
    # (presumably a list of strings -- verify against the CSV writer).
    dfs.append(pd.read_csv(file_path, index_col=0, converters={"text": literal_eval}))

# combine files in df (fresh 0..n-1 index)
df = pd.concat(dfs, ignore_index=True)

100%|██████████| 207/207 [00:15<00:00, 13.76it/s]


### Topic Modelling


#### Prepare Data


In [11]:
# custom module
import preprocessing

In [12]:
# Load model
# Loads the German pipeline with the 'ner', 'parser' and 'tagger' components
# disabled.
# NOTE(review): 'ner' is disabled here, yet preprocessing.preprocess is later
# called with remove_ent=True. If that option relies on entities produced by
# the pipeline it would have nothing to remove -- confirm against the
# preprocessing module.
spacy_mod = spacy.load("de_core_news_lg", disable=['ner', 'parser', 'tagger'])

In [13]:
# read custom stopwords

# Read the whitespace-separated custom stopwords. The context manager closes
# the file handle, and the encoding is explicit so non-ASCII characters
# (e.g. umlauts) are decoded the same way on every platform.
with open("custom_stopwords.txt", encoding="utf-8") as stopword_file:
    custom_stop_words = stopword_file.read().split()

# add custom stopwords to model
spacy_mod.Defaults.stop_words.update(custom_stop_words)

In [16]:
# Work on the first 10000 rows (all columns) of the combined frame only.
smaller_df = df.head(10000)

In [18]:
# convert corpus to language object
# Each row's "text" value is joined into a single string with no separator.
# NOTE(review): if the items are sentences/paragraphs without trailing
# whitespace, words at the item boundaries get fused -- confirm against the
# CSV format.
raw_texts = ("".join(parts) for parts in smaller_df["text"])

# spacy_mod.pipe processes the texts in batches instead of one call per row;
# passing total lets tqdm show a real progress bar rather than a bare counter.
spacy_lang = list(tqdm(spacy_mod.pipe(raw_texts), total=len(smaller_df)))

10000it [05:01, 33.16it/s]


In [19]:
# Run the custom preprocessing step (with remove_ent=True) over every parsed
# document and collect the results in order.
spacy_cleaned = [
    preprocessing.preprocess(parsed_doc, remove_ent=True)
    for parsed_doc in tqdm(spacy_lang)
]

100%|██████████| 10000/10000 [00:05<00:00, 1974.04it/s]


#### Topic Analysis

In [None]:
import pyLDAvis
import pyLDAvis.gensim
import pickle  

In [20]:
# The preprocessed documents are the input texts for the topic model.
texts = spacy_cleaned

# Dictionary built from the preprocessed documents.
id2word = corpora.Dictionary(texts)

# Bag-of-words representation of every document, produced by the dictionary.
corpus = [id2word.doc2bow(tokens) for tokens in texts]

In [21]:
# number of topics
num_topics = 5
# Fixed seed for the model's random initialisation so the topics are
# repeatable between runs. NOTE(review): with the multicore trainer, worker
# scheduling may still cause small run-to-run differences.
LDA_SEED = 42
# Build LDA model
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=LDA_SEED,
)
# Print the key words for each topic
pprint(lda_model.print_topics())
# Apply the trained model to the corpus (doc_lda is not used in the cells
# visible here).
doc_lda = lda_model[corpus]



[(0,
  '0.009*"gewalt" + 0.008*"frauen" + 0.005*"kinder" + 0.004*"häuslicher" + '
  '0.004*"polizei" + 0.004*"menschen" + 0.004*"frau" + 0.004*"mann" + '
  '0.004*"opfer" + 0.003*"telefon"'),
 (1,
  '0.020*"telefon" + 0.014*"gewalt" + 0.012*"frauen" + 0.006*"häuslicher" + '
  '0.006*"bereitschaftsdienst" + 0.004*"beratung" + 0.004*"prozent" + '
  '0.004*"kinder" + 0.004*"polizei" + 0.004*"montag"'),
 (2,
  '0.030*"telefon" + 0.016*"bereitschaftsdienst" + 0.016*"gewalt" + '
  '0.012*"frauen" + 0.009*"häuslicher" + 0.006*"do" + 0.006*"montag" + '
  '0.006*"jugendtelefon" + 0.006*"beratung" + 0.005*"interventionsstelle"'),
 (3,
  '0.033*"telefon" + 0.017*"mi" + 0.016*"bereitschaftsdienst" + 0.015*"gewalt" '
  '+ 0.014*"do" + 0.008*"frauen" + 0.008*"häuslicher" + 0.006*"montag" + '
  '0.006*"jugendtelefon" + 0.006*"polizei"'),
 (4,
  '0.014*"telefon" + 0.010*"gewalt" + 0.009*"frauen" + 0.004*"frau" + '
  '0.004*"kinder" + 0.004*"menschen" + 0.004*"beratung" + 0.003*"donnerstag" + '
  '0.00

In [22]:
# Switch pyLDAvis to notebook output.
pyLDAvis.enable_notebook()

# Prepare the visualisation from the model, corpus and dictionary; as the
# last expression of the cell it is displayed as the cell output.
lda_vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
lda_vis

  head(R).drop('saliency', 1)


END OF CODE