# Crawl Data Analysis: Clustering

This notebook analyses the output of the HDBSCAN clustering algorithm with various parameters.

## Read clustering output

In [1]:
import pandas as pd

directory = '/mnt/5tb/dark-patterns-output/'

def read_output(output_file, label_file):
    return (pd.read_csv(directory + output_file, sep='\s+', header=None, names=['cluster', 'count']),
            pd.read_pickle(directory + label_file))

(output_10_bow_euc, label_10_bow_euc) = read_output('output_10_bow_euclidean', 'label_10_bow_euclidean')
(output_5_bow_euc, label_5_bow_euc) = read_output('output_5_bow_euclidean', 'label_5_bow_euclidean')
(output_10_bow_man, label_10_bow_man) = read_output('output_10_bow_manhattan', 'label_10_bow_manhattan')
(output_5_bow_man, label_5_bow_man) = read_output('output_5_bow_manhattan', 'label_5_bow_manhattan')

(output_10_tfidf_euc, label_10_tfidf_euc) = read_output('output_10_tfidf_euclidean', 'label_10_tfidf_euclidean')
(output_5_tfidf_euc, label_5_tfidf_euc) = read_output('output_5_tfidf_euclidean', 'label_5_tfidf_euclidean')
(output_10_tfidf_man, label_10_tfidf_man) = read_output('output_10_tfidf_manhattan', 'label_10_tfidf_manhattan')
(output_5_tfidf_man, label_5_tfidf_man) = read_output('output_5_tfidf_manhattan', 'label_5_tfidf_manhattan')

## Number of clusters in each

In [2]:
output_10_bow_euc.shape

(10277, 2)

In [3]:
output_5_bow_euc.shape

(22895, 2)

In [4]:
output_10_bow_man.shape

(9974, 2)

In [5]:
output_5_bow_man.shape

(22081, 2)

In [6]:
output_10_tfidf_euc.shape

(7914, 2)

In [7]:
output_5_tfidf_euc.shape

(24824, 2)

In [8]:
output_10_tfidf_man.shape

(7493, 2)

In [9]:
output_5_tfidf_man.shape

(23466, 2)

## Size of noise cluster in each

In [10]:
output_10_bow_euc[output_10_bow_euc['cluster'] == -1]['count']

1    256573
Name: count, dtype: int64

In [11]:
output_5_bow_euc[output_5_bow_euc['cluster'] == -1]['count']

1    207168
Name: count, dtype: int64

In [12]:
output_10_bow_man[output_10_bow_man['cluster'] == -1]['count']

1    263511
Name: count, dtype: int64

In [13]:
output_5_bow_man[output_5_bow_man['cluster'] == -1]['count']

1    215928
Name: count, dtype: int64

In [14]:
output_10_tfidf_euc[output_10_tfidf_euc['cluster'] == -1]['count']

0    991177
Name: count, dtype: int64

In [15]:
output_5_tfidf_euc[output_5_tfidf_euc['cluster'] == -1]['count']

0    896145
Name: count, dtype: int64

In [16]:
output_10_tfidf_man[output_10_tfidf_man['cluster'] == -1]['count']

0    998743
Name: count, dtype: int64

In [17]:
output_5_tfidf_man[output_5_tfidf_man['cluster'] == -1]['count']

0    908278
Name: count, dtype: int64

## Attach the cluster labels to the segments

In [19]:
import json
from tqdm import tqdm
import numpy as np

hostname = []
inner_text = []
inner_text_processed = []

with_rows = np.loadtxt(directory + 'rows_with_elements_bow.arr')

counter = 0
with open(directory + 'segments_unique.json') as f:
    for line in tqdm(f):
        seg = json.loads(line)
        
        if counter in with_rows:
            hostname.append(seg['hostname'])
            inner_text.append(seg['inner_text'])  
            inner_text_processed.append(seg['inner_text_processed'])  
            
        counter += 1

frame = pd.DataFrame({'hostname': hostname, 
                      'inner_text': inner_text, 
                      'inner_text_processed': inner_text_processed,

                      'cluster_10_bow_euc': label_10_bow_euc.values,
                      'cluster_5_bow_euc': label_5_bow_euc.values,
                      'cluster_10_bow_man': label_10_bow_man.values,
                      'cluster_5_bow_man': label_5_bow_man.values,

                      'cluster_10_tfidf_euc': label_10_tfidf_euc.values,
                      'cluster_5_tfidf_euc': label_5_tfidf_euc.values,
                      'cluster_10_tfidf_man': label_10_tfidf_man.values,
                      'cluster_5_tfidf_man': label_5_tfidf_man.values})


1287426it [14:00, 1531.14it/s]


In [20]:
frame.shape

(1240588, 11)

In [27]:
frame.loc[:, frame.columns != 'inner_text_processed'].to_csv(directory + 'clusters.csv', encoding='utf-8', index=False)

frame.to_csv(directory + 'clusters_with_processed_text.csv', encoding='utf-8', index=False)

In [28]:
frame.to_pickle(directory + 'clusters_with_processed_text.pickle')