In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import string
from nltk.corpus import stopwords
import json
import glob
import re

In [2]:
def load_data(file):
    with open (file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return (data)

def write_data(file, data):
    with open (file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

In [3]:
def remove_stops(text, stops):
    text = re.sub(r"AC\/\d{1,4}\/\d{1,4}", "", text)
    words = text.split()
    final = []
    for word in words:
        if word not in stops:
            final.append(word)
    final = " ".join(final)
    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join([i for i in final if not i.isdigit()])
    while "  " in final:
        final = final.replace("  ", " ")
    return (final)

In [4]:
def clean_docs(docs):
    stops = stopwords.words("english")
    months = load_data("data/months.json")
    stops = stops+months
    final = []
    for doc in docs:
        clean_doc = remove_stops(doc, stops)
        final.append(clean_doc)
    return (final)

In [5]:
descriptions = load_data("data/trc_dn.json")["descriptions"]
names = load_data("data/trc_dn.json")["names"]

In [6]:
descriptions

["An ANCYL member who was shot and severely injured by SAP members at Lephoi, Bethulie, Orange Free State (OFS) on 17 April 1991. Police opened fire on a gathering at an ANC supporter's house following a dispute between two neighbours, one of whom was linked to the ANC and the other to the SAP and a councillor.",
 'A member of the SADF who was severely injured in a landmine explosion in Messina, Transvaal, on 5 May 1987.',
 'A member of QIBLA who disappeared in September 1988 after fleeing the country for political reasons. He had been detained several times before he left South Africa.',
 'A COSAS supporter who was kicked and beaten with batons and rifle-butts by members of the Ciskei Police during protests against the Ciskei government at Zwelitsha and Mdantsane, Ciskei, in September 1985.',
 'Was shot and blinded in one eye by members of the SAP in Athlone, Cape Town, in August 1976, after the Soweto uprising.',
 'Was shot and injured by members of the SAP in Robertson, Cape, on 11 

In [7]:
names

['AARON, Thabo Simon',
 'ABBOTT, Montaigne',
 'ABDUL WAHAB, Zakier',
 'ABRAHAM, Nzaliseko Christopher',
 'ABRAHAMS, Achmat Fardiel',
 'ABRAHAMS, Annalene Mildred',
 'ABRAHAMS, Ashraf',
 'ABRAHAMS, Derrek',
 "ABRAHAMS, John (aka 'Gaika')",
 'ABRAHAMS, Moegsien',
 'ABRAHAMS, Rashid',
 'ABRAHAMS, Toyer',
 'ACHHURST, EM',
 'ACKERMAN, David Jacobus',
 'ACKERMAN, Marita',
 'ACKERMANN, Gerhardus Oliver',
 'ADAM, Alfred Mawonga',
 'ADAM, Alfred Mawonga',
 'ADAMS,  Zwelinzima Sidwell',
 'ADAMS, Koos',
 'ADAMS, Magadien',
 'ADAMS, Noel',
 'ADAMS, Sandra Joyce',
 'ADONIS,  Jacques Ferdinand',
 'ADONIS, Motlalepule Sunnyboy Slovo',
 'ADONIS, Sandra Noreen',
 'ADOONS, Phineus Zenzile',
 'ADRIAANSE, Noel John',
 'AFRICANDER, Sipho Victor',
 'AFRIKA, Anna',
 'AFRIKA, Dick',
 'AFRIKA, Jan',
 'AFRIKA, Pieter',
 'AGGETT, Joyce',
 'AGGETT, Neil Hudson',
 'ALA, Wezeka Getrude',
 'ALA, Wezeka Getrude',
 'ALBERT, Nombuyiselo Francis',
 'ALEXANDER, Tammas Edward',
 'ALFRED, Nomthandazo Amelia',
 'ALLAM, Erne

In [8]:
cleaned_docs = clean_docs(descriptions)

In [None]:
vectorizer = TfidfVectorizer(
                                lowercase=True,
                                max_features=100, ## Build a vocabulary that considers only top 100 features ordered by term freq in the corpus
                                max_df=0.8, ## If a word occurs in 80% of docs it is going to be ignored (too frequent)
                                min_df=5, ## If a word occurs in less than 5 docs, it is going to be ignored
                                ngram_range = (1,3),
                                stop_words = "english"

                            )

In [10]:
vectors = vectorizer.fit_transform(cleaned_docs)

In [11]:
vectors

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 251325 stored elements and shape (21747, 100)>

In [12]:
feature_names = vectorizer.get_feature_names_out()

In [13]:
feature_names

array(['allegedly', 'amnesty', 'anc', 'anc supporter',
       'anc supporter shot', 'anc supporters', 'area', 'arrested',
       'arson', 'arson attack', 'assaulted', 'attack', 'attacked',
       'attacks', 'beaten', 'bophuthatswana', 'branch', 'burnt',
       'burnt ifp', 'burnt ifp supporters', 'cape', 'chief', 'conflict',
       'conflict area', 'dead', 'death', 'destroyed', 'detained', 'died',
       'durban', 'empangeni', 'empangeni natal', 'family', 'granted',
       'granted amnesty', 'home', 'home burnt', 'house', 'house burnt',
       'ifp', 'ifp supporter', 'ifp supporters', 'injured', 'inkatha',
       'inkatha supporters', 'johannesburg', 'killed', 'kwamashu',
       'kwamashu durban', 'kwazulu', 'kwazulu near',
       'kwazulu near durban', 'lost', 'member', 'members', 'members sap',
       'mk', 'mk operatives', 'mr', 'named', 'natal', 'ndwedwe',
       'ndwedwe kwazulu', 'ndwedwe kwazulu near', 'near', 'near durban',
       'ongoing', 'operatives', 'people', 'people kill

In [14]:
dense = vectors.todense()
denselist = dense.tolist()




TFIDF gives you the importance of a specific term in a sentence by giving weight > 0. You can get those terms or features

In [16]:
denselist

[[0.0,
  0.0,
  0.35997263491995035,
  0.0,
  0.0,
  0.31417904033665145,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.26764577237075254,
  0.0,
  0.0,
  0.0,
  0.0,
  0.24755257133949066,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.28809076545860496,
  0.20314225251689294,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.27507572242905154,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.5249891606885219,
  0.0,
  0.0,
  0.3047379292177127,
  0.0,
  0.19595009171387526,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.17885669660417403,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,

In [17]:
all_keywords = []

for description in denselist:
    x=0
    keywords = []
    for word in description:
        if word > 0:
            keywords.append(feature_names[x])
        x=x+1
    all_keywords.append(keywords)
print (descriptions[0])
print (all_keywords[0])

An ANCYL member who was shot and severely injured by SAP members at Lephoi, Bethulie, Orange Free State (OFS) on 17 April 1991. Police opened fire on a gathering at an ANC supporter's house following a dispute between two neighbours, one of whom was linked to the ANC and the other to the SAP and a councillor.
['anc', 'anc supporters', 'house', 'injured', 'member', 'members', 'police', 'sap', 'severely', 'shot', 'supporters']


In [None]:
true_k = 20

model = KMeans(n_clusters=true_k, init="k-means++", max_iter=100, n_init=1)

model.fit(vectors)

## orders centroids and terms

order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()




In [19]:
order_centroids

array([[83, 24, 82, ..., 18, 15,  4],
       [42, 84, 82, ..., 57, 19, 18],
       [47, 48, 29, ..., 30, 31, 15],
       ...,
       [56, 57, 67, ...,  9,  8,  4],
       [55, 77, 54, ..., 62, 19, 18],
       [14, 81, 80, ..., 18, 19,  4]], shape=(20, 100))

In [23]:
terms

array(['allegedly', 'amnesty', 'anc', 'anc supporter',
       'anc supporter shot', 'anc supporters', 'area', 'arrested',
       'arson', 'arson attack', 'assaulted', 'attack', 'attacked',
       'attacks', 'beaten', 'bophuthatswana', 'branch', 'burnt',
       'burnt ifp', 'burnt ifp supporters', 'cape', 'chief', 'conflict',
       'conflict area', 'dead', 'death', 'destroyed', 'detained', 'died',
       'durban', 'empangeni', 'empangeni natal', 'family', 'granted',
       'granted amnesty', 'home', 'home burnt', 'house', 'house burnt',
       'ifp', 'ifp supporter', 'ifp supporters', 'injured', 'inkatha',
       'inkatha supporters', 'johannesburg', 'killed', 'kwamashu',
       'kwamashu durban', 'kwazulu', 'kwazulu near',
       'kwazulu near durban', 'lost', 'member', 'members', 'members sap',
       'mk', 'mk operatives', 'mr', 'named', 'natal', 'ndwedwe',
       'ndwedwe kwazulu', 'ndwedwe kwazulu near', 'near', 'near durban',
       'ongoing', 'operatives', 'people', 'people kill

In [27]:
with open ("data/trc_results.txt", "w", encoding="utf-8") as f:
    for i in range(true_k):
        print(f"Cluster {i}")
        f.write(f"Cluster {i}")
        f.write("\n")
        for ind in order_centroids[i, :10]:
            ### get top 10 words for each cluster
            # print(ind)
            print(' %s' % terms[ind],)
            f.write (' %s' % terms[ind],)
            f.write("\n")
        f.write("\n")
        f.write("\n")

Cluster 0
 shot dead
 dead
 shot
 member
 transvaal
 members
 ifp
 named
 anc
 police
Cluster 1
 injured
 shot injured
 shot
 members
 people
 member
 killed
 attack
 transvaal
 severely
Cluster 2
 kwamashu
 kwamashu durban
 durban
 near
 conflict
 supporters
 burnt
 political conflict
 political
 anc
Cluster 3
 burnt
 burnt ifp supporters
 burnt ifp
 house
 ifp
 supporter house
 ifp supporters
 supporters
 house burnt
 anc
Cluster 4
 death
 stabbed
 attack
 transvaal
 anc
 member
 killed
 home
 members
 burnt
Cluster 5
 ifp
 supporters
 anc
 natal
 ifp supporters
 anc supporters
 near
 conflict
 supporter
 kwazulu
Cluster 6
 anc supporter shot
 supporter shot
 anc supporter
 anc
 supporter
 shot
 injured
 shot injured
 ifp
 natal
Cluster 7
 bophuthatswana
 police
 members
 beaten
 severely
 severely beaten
 arrested
 tvl
 assaulted
 chief
Cluster 8
 inkatha
 inkatha supporters
 udf
 supporters
 near
 kwazulu
 supporter
 kwazulu near
 durban
 near durban
Cluster 9
 police
 detained
 as

Each sentence is broken down into only specific words using TFIDF importance and then all those words in the corpus are given to Kmeans for clustering. The outcome is each sentence is assigned to 1 cluster. 

One drawback of Kmeans is that it forces a sentence to belong to 1 topic/Cluster, but that is not the case, one sentence could be speaking about multiple topics.