# Analysis of topics that can be extracted from documents with Top2Vec

### Imports

In [1]:
# One-time setup for a fresh Colab runtime: uncomment the line below to install Top2Vec.
# The install log kept under this cell shows top2vec 1.0.27 being resolved.
#!pip install top2vec

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting top2vec
  Downloading top2vec-1.0.27-py3-none-any.whl (25 kB)
Collecting umap-learn>=0.5.1
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 4.4 MB/s 
Collecting gensim>=4.0.0
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 74.1 MB/s 
[?25hCollecting hdbscan>=0.8.27
  Downloading hdbscan-0.8.29.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 16.3 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pynndescent>=0.5
  Downloading pynndescent-0.5.8.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 54.4 MB/s 
Building wheels for collected packages: hdbscan, umap-learn, pynndes

In [4]:
from google.colab import drive
import pandas as pd

In [7]:
from top2vec import Top2Vec

### Reading Dataset

In [6]:
# Mount Google Drive into the Colab filesystem and move into the notebooks folder.
drive.mount('/content/drive')
%cd /content/drive/MyDrive/"Colab Notebooks"
# NOTE(review): `username` is not referenced by any cell visible here — confirm it is still needed.
username = "IsaacOlguin"
repository =  "AutomatedTraumaDetectionInGCT"

# `{repository}` is interpolated from the Python variable above, so the working
# directory becomes the repository checkout; the listing is a sanity check.
%cd {repository}
%ls -a
%pwd

# The relative path is resolved against the working directory set by the %cd
# calls above; the corpus file uses ';' as its field separator.
df = pd.read_csv('data/genocide-transcript-corpus-v0.1.csv', delimiter=';')
print(df.head(5))

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks
/content/drive/MyDrive/Colab Notebooks/AutomatedTraumaDetectionInGCT
'01_Dataset analysis.ipynb'           [0m[01;34mdata[0m/
 02_K_means_clustering.ipynb          [01;34m.git[0m/
 03_TextSegmenter.ipynb               LICENSE
 04_TextSegmenterAndCleaning.py       [01;34mluima_sbd[0m/
 05_AnalysisTopicsDocsTop2Vec.ipynb   README.md
                                           paragraph  label  tribunal  \
0  ýý ý Kingdom of Cam bodia Nation Religion King...      0         1   
1  pag e Questioning by Mr Kar Savuth commenc es ...      0         1   
2  decision on the removal of witnesses from the ...      0         1   
3  The defence agreed with the Chambers proposal ...      0         1   
4  therefore be in a position to compare them wit...      0         1   

     witness                             document      case        date  
0  Vann Nath  E1_39.1_TR001_20090629_Final_EN_Pub  Case 001  29.06.2009  
1  Vann 

In [9]:
# Materialise the "paragraph" column as a plain Python list, one entry per row.
list_paragraphs_documents = list(df["paragraph"])

In [13]:
# Peek at the first three paragraphs to check what the raw text looks like.
list_paragraphs_documents[:3]

['ýý ý Kingdom of Cam bodia Nation Religion King Royaume du Cambodge Nation Religion Roi GgÁCMnMuRmHvisamBaØkñúgtulakark ð Trial Chamber Chambre de première instance TRANSCRIPT OF TRIAL PROCEEDINGS  KAING GUEK EAV ﬁDUCHﬂ PUBLIC Case File Nº   ECCC TC June H Trial Day Before the Judges NIL Nonn Presiding Silvia CARTWRIGHT YA Sokhan JeanMarc LAVERGNE THOU Mony YOU Ottara Reserve Claudia FENZ Reserve Trial Chamber Greffiers Legal Officers DUCH Phary SE Kolvuthy LIM SuyHong Natacha WEXELSRISER Matteo CRIPPA For the Office of the CoProsecutors William SMITH YET Chakriya PICH Sambath Zachery LAMPEL PAK Chanlino The Accused KAING Guek Eav Lawyers for the Accused KAR Savuth MariePaule CANIZARES Heleyn UÑAC Lawyers for the Civil Parties KIM Mengkhy KONG Pisey TY Srinna HONG Kimsuon MOCH Sovannary YUNG Phanit Silke STUDZINSKY Alain WERNER For Court Management Section KAUV Keoratanak CmúCa Extraordinary Chambers in the Courts of Cambodia Chambres Extraordinaires au sein des Tribunaux Cambodgiens 

In [14]:
# Train Top2Vec on every paragraph of the corpus.
model = Top2Vec(
    documents=list_paragraphs_documents,
    speed="learn",
    workers=8,
)

2022-11-09 12:04:10,614 - top2vec - INFO - Pre-processing documents for training
INFO:top2vec:Pre-processing documents for training
2022-11-09 12:04:11,656 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2022-11-09 12:04:30,871 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2022-11-09 12:04:44,936 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2022-11-09 12:04:44,976 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics


In [15]:
# Number of topics the trained model discovered; as the last expression of the
# cell, the value is shown as the cell output.
model.get_num_topics()

6

In [16]:
# Get topic sizes (number of docs most similar to each topic).
# The call returns two parallel sequences: the sizes and the topic numbers they belong to.
topic_sizes, topic_nums = model.get_topic_sizes()
print(f"Topic sizes {topic_sizes}")
print(f"Topic nums {topic_nums}")

Topic sizes [340 292 286 237 186 134]
Topic nums [0 1 2 3 4 5]


In [17]:
# Get Topics
# Ask the model how many topics it holds instead of hard-coding the count seen
# in one particular run: Top2Vec rejects a num_topics larger than the number of
# topics it found, and that number may change whenever the model is retrained.
topic_words, word_scores, topic_nums = model.get_topics(model.get_num_topics())

In [18]:
# Dump the raw results of get_topics(): the words per topic, the score of each
# word, and the topic numbers they belong to.
print(f"Topic words {topic_words}")
print(f"Word scores {word_scores}")
print(f"Topic nums {topic_nums}")

Topic words [['chambers' 'tc' 'eav' 'guek' 'eccc' 'extraordinary' 'kaing' 'courts'
  'cambodia' 'angkar' 'however' 'prisoners' 'guards' 'mey' 'workshop'
  'life' 'actually' 'regime' 'rouge' 'then' 'my' 'walked' 'case' 'walk'
  'suffering' 'eccctc' 'working' 'phnom' 'uncle' 'myself' 'rice' 'work'
  'lawyer' 'arrested' 'cooperative' 'khmer' 'water' 'detained' 'sent'
  'interrogated' 'kampuchea' 'big' 'eat' 'penh' 'evacuated' 'honour'
  'worked' 'almost' 'sometimes' 'later']
 ['milosevic' 'kla' 'youre' 'ive' 'im' 'march' 'true' 'own' 'pec' 'cant'
  'albanian' 'youve' 'sentence' 'interpretation' 'kosovo' 'didnt' 'dont'
  'albanians' 'thats' 'help' 'everything' 'answer' 'evidence' 'right'
  'its' 'ryneveld' 'things' 'interpreter' 'accused' 'villages' 'says'
  'asking' 'judge' 'police' 'else' 'very' 'ago' 'serbian' 'htm' 'clear'
  'heard' 'too' 'it' 'icty' 'mitrovica' 'event' 'say' 'attacks'
  'statement' 'possible']
 ['reporter' 'young' 'official' 'bureau' 'interahamwes' 'marilyn'
  'commun

In [20]:
# Search topics
# Rank the topics against the keyword "interrogated". The number of topics is
# read from the model rather than hard-coded, because Top2Vec rejects a
# num_topics larger than the number of topics it found and that number may
# change whenever the model is retrained.
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(
    keywords=["interrogated"],
    num_topics=model.get_num_topics(),
)

print(f"Topic words {topic_words}")
print(f"Word scores {word_scores}")
print(f"Topic scores {topic_scores}")
print(f"Topic nums {topic_nums}")

Topic words [array(['chambers', 'tc', 'eav', 'guek', 'eccc', 'extraordinary', 'kaing',
       'courts', 'cambodia', 'angkar', 'however', 'prisoners', 'guards',
       'mey', 'workshop', 'life', 'actually', 'regime', 'rouge', 'then',
       'my', 'walked', 'case', 'walk', 'suffering', 'eccctc', 'working',
       'phnom', 'uncle', 'myself', 'rice', 'work', 'lawyer', 'arrested',
       'cooperative', 'khmer', 'water', 'detained', 'sent',
       'interrogated', 'kampuchea', 'big', 'eat', 'penh', 'evacuated',
       'honour', 'worked', 'almost', 'sometimes', 'later'], dtype='<U14'), array(['von', 'ethnic', 'type', 'icty', 'muslim', 'htm', 'httpswww',
       'luka', 'jna', 'camp', 'brcko', 'uniforms', 'camouflage',
       'recognise', 'these', 'muslims', 'serb', 'men', 'police',
       'weapons', 'wore', 'yes', 'omarska', 'see', 'serbs', 'wearing',
       'man', 'it', 'prijedor', 'villages', 'did', 'tadic', 'brigade',
       'vlasenica', 'hollis', 'war', 'held', 'dusko', 'civilian',
       '

In [21]:
# Draw one word cloud per topic. `topic_nums` is whatever the most recently
# executed cell assigned (the search_topics cell above, in the recorded
# execution order), so the clouds follow that ordering.
for topic in topic_nums:
    model.generate_topic_wordcloud(topic)

Output hidden; open in https://colab.research.google.com to view.

#### Paragraphs that belong just to violent cases

In [29]:
# Restrict the corpus to the paragraphs whose label equals 1 and retrain on that
# subset. This rebinds `list_paragraphs_documents` and `model`, replacing the
# values produced by the earlier cells.
list_paragraphs_documents = list(df.loc[df["label"] == 1, "paragraph"])
print(f"Num of paragraphs is {len(list_paragraphs_documents)}")

model = Top2Vec(
    documents=list_paragraphs_documents,
    speed="learn",
    workers=8,
    min_count=5,
)

# How many topics the retrained model discovered.
num_detected_topics = model.get_num_topics()
print(f'Num detected topics {num_detected_topics}')

# Per-topic document counts together with the matching topic numbers.
topic_sizes, topic_nums = model.get_topic_sizes()
print(f"Topic sizes {topic_sizes}")
print(f"Topic nums {topic_nums}")

# Words (and their scores) for every detected topic.
topic_words, word_scores, topic_nums = model.get_topics(num_detected_topics)
print(f"Topic words {topic_words}")

2022-11-09 12:23:35,855 - top2vec - INFO - Pre-processing documents for training
INFO:top2vec:Pre-processing documents for training


Num of paragraphs is 529


2022-11-09 12:23:36,645 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2022-11-09 12:23:50,732 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2022-11-09 12:23:54,544 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2022-11-09 12:23:54,564 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics


Num detected topics 5
Topic sizes [187 179  80  49  34]
Topic nums [0 1 2 3 4]
Topic words [['il' 'reporter' 'official' 'bureau' 'communal' 'commune' 'interahamwes'
  'sally' 'marilyn' 'burgomaster' 'kohn' 'pursue' 'ictr' 'going'
  'akayesu' 'tutsis' 'rex' 'ictrchamber' 'hiding' 'tiangaye' 'saying'
  'interahamwe' 'tutsi' 'lear' 'pardon' 'young' 'hutu' 'marry' 'ia'
  'killing' 'qo' 'teachers' 'diagram' 'place' 'killings' 'didna'
  'tharcisse' 'inkotanyis' 'below' 'persons' 'refuge' 'kill' 'bave' 'who'
  'taba' 'talked' 'married' 'exactly' 'hot' 'rape']
 ['extraordinary' 'courts' 'case' 'cambodia' 'chambers' 'trial' 'kaing'
  'guek' 'eccc' 'eav' 'tc' 'rouge' 'khmer' 'sometime' 'eccctc' 'then'
  'walked' 'january' 'although' 'separate' 'my' 'sick' 'inflicted'
  'scarf' 'so' 'presume' 'ration' 'lawyer' 'angkar' 'tuol' 'penh' 'phnom'
  'scared' 'shackled' 'detained' 'already' 'collect' 'thirsty' 'duch'
  'sleng' 'regarding' 'blindfolded' 'could' 'experience' 'prisoners'
  'pig' 'rice' 'tow

In [30]:
# Rank all detected topics against the keyword "killed", then draw a word cloud
# for each topic in the order returned by the search.
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(
    keywords=["killed"],
    num_topics=num_detected_topics,
)

for topic in topic_nums:
    model.generate_topic_wordcloud(topic)

Output hidden; open in https://colab.research.google.com to view.

In [31]:
# Fetch five documents for topic 3 and print each one with its id and score.
documents, document_scores, document_ids = model.search_documents_by_topic(
    topic_num=3,
    num_docs=5,
)
for doc_id, score, doc in zip(document_ids, document_scores, documents):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

Document: 194, Score: 0.7177969217300415
-----------
never saw von  IT httpswww icty orgxcasestadictransen IT htm Page Q Who was it that administered these beatings to you A Zoran Obrenovic he was the one who beat most among others called Viskovic Garic and Vukavic  Q Those people you have just named who are they are they persons who beat others or are they victims who were beaten A These were people who were beating other people Q To your knowledge what was their ethnic group A They were of Serb nationality Q What type of uniform did they wear A Obrenovic Garic and Viskovic wore camouflage uniforms Q How many times did these beatings occur while you were being held here at the police Station they did not happen at all A Sometimes twice sometimes once and they were very lucky days when Q What injuries did you sustain as a result of those beatings A Those beatings I had five broken teeth and part of my leg and then on both of my hands I have marks from a knife Q When you were taken from