# Incident clustering

This notebook clusters incidents together using a very naive K-means on top of TF-IDF vectorization.
It is simple, but works well to group related notifications and pinpoint the top offenders in your alerting system.

Tune `KMEANS_CLUSTERS` below to get fewer or more clusters.

As a prerequisite, you will need to run `poetry run oncall-analysis pagerduty incidents-log --start-date 2023-01-01` to get the data for analysis



In [None]:
import re
%matplotlib inline

import pandas as pd
from lets_plot import *
LetsPlot.setup_html()


## Get the data from PagerDuty

In [None]:
# Load the incident log produced by the `oncall-analysis pagerduty incidents-log` command.
# `created_at` has to be parsed as a datetime here: read_csv keeps it as plain
# strings by default, and the `.dt` accessor used below only exists on
# datetime-like columns.
df = pd.read_csv("data/incident_log.csv", parse_dates=["created_at"])

# NOTE(review): the column is named `week` but holds the calendar day
# (YYYY-MM-DD) the incident was created on — confirm the intended granularity.
df = df.assign(week=df.created_at.dt.strftime('%Y-%m-%d')).sort_values('week')

# `df` is already ordered by `week`, so it is displayed as-is.
df

## Use K-means to cluster incidents together

- https://jakevdp.github.io/PythonDataScienceHandbook/05.11-k-means.html
- `KMEANS_CLUSTERS` (set in the cell below, default `10`) controls how many clusters are produced

In [None]:
# Number of clusters K-means splits the incidents into; tune it to get a
# coarser (lower) or finer (higher) grouping.
KMEANS_CLUSTERS: int = 10

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.cluster import KMeans

# 
# TF-IDF vectorization
#

import re

# Compiled once; matches any run of the ASCII digits 0-9.
_DIGIT_RUN = re.compile(r'[0-9]+')


def preprocess(title):
    """Return ``title`` with every run of ASCII digits removed.

    Presumably used so that titles differing only in numbers (ids, counts,
    dates) produce the same tokens — confirm against the data.

    Args:
        title: incident title as a string.

    Returns:
        The title with all digit runs replaced by the empty string.
    """
    return _DIGIT_RUN.sub('', title)

# Split titles on runs of word characters, which drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
tf_idf_vect = TfidfVectorizer(lowercase=True,
                              stop_words='english',
                              ngram_range=(1, 1),
                              tokenizer=tokenizer.tokenize)
# Digits are stripped from each title (see `preprocess`) before vectorizing.
vectorized = tf_idf_vect.fit_transform(df.title.map(preprocess))

# K-means starts from randomly chosen centroids. Without a fixed seed the
# cluster ids (and possibly the grouping itself) change on every run, which
# makes any reference to a specific cluster number meaningless after a
# re-run. `n_init` is pinned as well because its default differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=KMEANS_CLUSTERS, n_init=10, random_state=42)
kmeans.fit(vectorized)

# One cluster id per incident, in the same row order as `df`.
labels = kmeans.labels_

clustered = df.assign(cluster=labels)

# Per-cluster count of non-null values in every column, indexed by cluster id.
counts = clustered.groupby('cluster', as_index=True).aggregate('count')

clustered

## Get some common terms for clustered data and group counts for each cluster

In [None]:
## Rank the vocabulary of every cluster by centroid weight
# Vocabulary learned by the vectorizer; position i is the term for feature i.
terms = tf_idf_vect.get_feature_names_out()

# argsort is ascending, so each row is reversed to put the
# highest-weighted feature indices first.
centroid_weights = kmeans.cluster_centers_
order_centroids = centroid_weights.argsort(axis=1)[:, ::-1]

def getterms(cluster_index, top_n=10):
    """Return the highest-weighted terms of one cluster as a single string.

    Reads the notebook-level ``order_centroids`` (per-cluster feature indices,
    highest centroid weight first) and ``terms`` (the vectorizer vocabulary).

    Args:
        cluster_index: row of ``order_centroids`` to describe.
        top_n: how many leading terms to include. This is deliberately
            independent of the number of clusters, so tuning the cluster
            count does not change how many terms are shown.

    Returns:
        The selected terms joined by single spaces, highest weight first.
    """
    top_term_indices = order_centroids[cluster_index, :top_n]
    return " ".join(terms[ind] for ind in top_term_indices)

# Show long term strings and every row when displaying the summary.
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', None)

# One description string per cluster id.
cluster_ids = range(KMEANS_CLUSTERS)
common_terms = list(map(getterms, cluster_ids))
termsdf = pd.DataFrame({'cluster': cluster_ids, 'terms': common_terms})

# Attach the per-cluster counts (joined on the row index) and put the
# clusters with the largest `id` count first.
joined = termsdf.join(counts).sort_values(by='id', ascending=False)

joined

In [None]:
# Inspect the incidents assigned to a single cluster (here: cluster 2).
clustered.loc[clustered['cluster'] == 2]

## Write clusters to data/clustered.xlsx to enjoy Excel's filtering and pivoting capabilities

In [None]:
## Write the clustered incidents to data/clustered.xlsx file and enjoy reviews with your team
# `created_at` is exported as formatted text. Note the directive case:
# %M is minutes, whereas %m is the month number.
(
    clustered[['title', 'description', 'created_at', 'cluster']]
        .assign(created_at=lambda incidents: incidents.created_at.dt.strftime('%Y-%m-%d %H:%M:%S'))
        .to_excel('data/clustered.xlsx')
)