# Crawl Data Analysis: Clustering

This notebook analyses the output of the HDBSCAN clustering algorithm with various parameters.

@ Petr Hanzl: Analysis for TFIDF is omitted, because of the same reason as mentioned in Feature Processing.ipynb notebook.

## Read clustering output

In [3]:
import pandas as pd

directory = '/home/xhanpet/segments-output/'

import pickle5 as pickle

pb = pickle.PickleBuffer(b"foo")
data = pickle.dumps(pb, protocol=5)
assert pickle.loads(data) == b"foo"


def read_output(output_file, label_file):
    print(directory + label_file)
    return (pd.read_csv(directory + output_file, sep='\s+', header=None, names=['cluster', 'count']),
            pd.read_pickle(directory + label_file))
(output_10_bow_euc, label_10_bow_euc) = read_output('output_10_bow_euclidean', 'label_10_bow_euclidean')
(output_5_bow_euc, label_5_bow_euc) = read_output('output_5_bow_euclidean', 'label_5_bow_euclidean')
(output_10_bow_man, label_10_bow_man) = read_output('output_10_bow_manhattan', 'label_10_bow_manhattan')
(output_5_bow_man, label_5_bow_man) = read_output('output_5_bow_manhattan', 'label_5_bow_manhattan')

/home/xhanpet/segments-output/label_10_bow_euclidean
/home/xhanpet/segments-output/label_5_bow_euclidean
/home/xhanpet/segments-output/label_10_bow_manhattan
/home/xhanpet/segments-output/label_5_bow_manhattan


## Number of clusters in each

In [9]:
output_10_bow_euc.shape

(89, 2)

In [10]:
output_5_bow_euc.shape

(178, 2)

In [11]:
output_10_bow_man.shape

(88, 2)

In [12]:
output_5_bow_man.shape

(167, 2)

## Size of noise cluster in each

In [13]:
output_10_bow_euc[output_10_bow_euc['cluster'] == -1]['count']

Series([], Name: count, dtype: object)

In [14]:
output_5_bow_euc[output_5_bow_euc['cluster'] == -1]['count']

Series([], Name: count, dtype: object)

In [15]:
output_10_bow_man[output_10_bow_man['cluster'] == -1]['count']

Series([], Name: count, dtype: object)

In [16]:
output_5_bow_man[output_5_bow_man['cluster'] == -1]['count']

Series([], Name: count, dtype: object)

## Attach the cluster labels to the segments

In [18]:
import json
from tqdm import tqdm
import numpy as np

hostname = []
inner_text = []
inner_text_processed = []

with_rows = np.loadtxt(directory + 'rows_with_elements_bow.arr')

counter = 0
with open(directory + 'segments_unique.json') as f:
    for line in tqdm(f):
        seg = json.loads(line)
        
        if counter in with_rows:
            hostname.append(seg['hostname'])
            inner_text.append(seg['inner_text'])  
            inner_text_processed.append(seg['inner_text_processed'])  
            
        counter += 1

frame = pd.DataFrame({'hostname': hostname, 
                      'inner_text': inner_text, 
                      'inner_text_processed': inner_text_processed,

                      'cluster_10_bow_euc': label_10_bow_euc.values,
                      'cluster_5_bow_euc': label_5_bow_euc.values,
                      'cluster_10_bow_man': label_10_bow_man.values,
                      'cluster_5_bow_man': label_5_bow_man.values,
                     })


18365it [00:00, 41605.66it/s]


In [19]:
frame.shape

(13398, 7)

In [20]:
frame.loc[:, frame.columns != 'inner_text_processed'].to_csv(directory + 'clusters.csv', encoding='utf-8', index=False)

frame.to_csv(directory + 'clusters_with_processed_text.csv', encoding='utf-8', index=False)

In [21]:
frame.to_pickle(directory + 'clusters_with_processed_text.pickle')