# Using DBSCAN as clustering method

In [2]:
import numpy as np
import pandas as pd
import re
from sklearn import preprocessing
from scipy.sparse import csr_matrix
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from matplotlib import pyplot as plt

## Test

In [3]:
# Ordered state vocabulary: the position of a name in this list fixes its
# row/column position in the per-session transition matrix.
states = [
    "INITIAL",
    "login",
    "View_Items",
    "home",
    "logout",
    "View_Items_quantity",
    "Add_to_Cart",
    "shoppingcart",
    "remove",
    "deferorder",
    "purchasecart",
    "inventory",
    "sellinventory",
    "clearcart",
    "cancelorder",
    "$",
]

In [4]:
# Hand-written sample sessions: session id -> ordered list of requests.
# Runs of repeated requests are spelled with list repetition instead of one
# literal per line; the resulting lists are unchanged.
sessions = {
    'HZKS0-WG8pZr0eCsZlBAP5Xm': ['login'] + ['View_Items'] * 14 + ['home', 'logout'],
    '5tPgZbHdK2Zp+heFBs8HsMkx': (
        ['login']
        + ['View_Items_quantity', 'Add_to_Cart'] * 5
        + ['shoppingcart', 'remove'] * 4
        + ['deferorder', 'home', 'logout']
    ),
    'RU2oCVNdpWEM0-2x7I5OjPbZ': (
        ['login']
        + ['View_Items_quantity', 'Add_to_Cart'] * 2
        + ['purchasecart', 'home', 'logout']
    ),
    'kG4g0E5mqwRYcsQOCfj+7wG7': (
        ['login'] + ['inventory'] * 2 + ['sellinventory'] * 3 + ['home', 'logout']
    ),
    '8ocO6WP3QaFpBvkooS5INPwe': (
        ['login'] + ['inventory'] * 2 + ['sellinventory'] * 6 + ['home', 'logout']
    ),
    'WOTZQBwSCnI+DfDQ-2cS7Mgp': (
        ['login'] + ['inventory'] * 2 + ['sellinventory'] * 4 + ['home', 'logout']
    ),
    'e4bMe1HfiUlmvUMmPJU4y1B4': (
        ['login'] + ['inventory'] * 2 + ['sellinventory'] * 3 + ['home', 'logout']
    ),
    'MEzDpcnm1MQ9GFGox7uP4Ep-': ['login'] + ['View_Items'] * 14 + ['home', 'logout'],
}

### Markov chain & sparse matrix

In [4]:
def _session_vector(requests, states, state_index):
    """Return one session's row-normalised transition matrix, flattened row by row."""
    size = len(states)
    counts = np.zeros((size, size), dtype=np.float64)
    outgoing = np.zeros(size, dtype=np.float64)

    for source, target in zip(requests, requests[1:]):
        row = state_index.get(source)
        if row is None:
            # The source request is not part of `states`: it has no row.
            continue
        # Every outgoing transition counts towards the row total, even when the
        # target is not part of `states` and therefore has no column.
        outgoing[row] += 1
        col = state_index.get(target)
        if col is not None:
            counts[row, col] += 1

    # Turn counts into probabilities; rows without outgoing transitions stay 0.
    visited = outgoing > 0
    counts[visited] /= outgoing[visited, np.newaxis]
    return np.round(counts, 2).ravel()


def transition_matrix(sessions, states):
    """Encode every session as a flattened first-order transition matrix.

    For each session the transitions between consecutive requests are counted,
    each row is normalised to probabilities, the values are rounded to two
    decimals and the ``len(states) x len(states)`` matrix is flattened.

    Parameters
    ----------
    sessions : dict
        Maps a session id to the ordered sequence of requests of that session.
    states : sequence of str
        State vocabulary; the position of a state fixes its row/column.

    Returns
    -------
    scipy.sparse.csr_matrix
        One row per session (in the iteration order of ``sessions``) and
        ``len(states) ** 2`` columns. An empty ``sessions`` gives a matrix
        with zero rows.

    Notes
    -----
    Requests that do not occur in ``states`` get no row or column, so every
    session vector has the same length. Sessions with fewer than two requests
    yield an all-zero row.
    """
    state_index = {state: position for position, state in enumerate(states)}

    vectors = [
        _session_vector(requests, states, state_index)
        for requests in sessions.values()
    ]

    if not vectors:
        return csr_matrix((0, len(states) ** 2), dtype=np.float64)

    # Build the sparse matrix once instead of once per session.
    return csr_matrix(np.vstack(vectors))

#m = transition_matrix(sessions, states)

In [4]:
# Build the feature matrix for the hand-written sample sessions in this cell;
# previously `m` was only available if it had been created in an earlier
# kernel state, so a fresh run failed with a NameError.
m = transition_matrix(sessions, states)
X = m

# Cluster the session vectors and show the label of every session, the size
# of each label group (-1 is DBSCAN's noise label) and the estimator settings.
clustering = DBSCAN(eps=2, min_samples=2).fit(X)
labels = clustering.labels_
print(labels)
print(np.unique(labels, return_counts=True))
print(clustering)

[ 0 -1 -1  0  0  0  0  0]
(array([-1,  0], dtype=int64), array([2, 6], dtype=int64))
DBSCAN(algorithm='auto', eps=2, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=2, n_jobs=None, p=None)


## Backpropagate after clustering

In [5]:
# Input data: directory of the raw files and the session log inside it.
PATH = "../../data/raw/"
sessions_file = PATH + 'sessions.dat'

In [6]:
def session_request_dict(sessions_file):
    """Parse a session log into ``{session id: [request, ...]}``.

    Every line of ``sessions_file`` is one session. The key is the leading
    run of characters up to the first ``.``; the value is the list of all
    double-quoted substrings on that line, in order of appearance.
    """
    with open(sessions_file) as log:
        return {
            re.search('([^.]+)', line).group(): re.findall('\"(.*?)\"', line)
            for line in log
        }

In [7]:
data = session_request_dict(sessions_file)

# Two windows of up to 1000 sessions each (in file order); positions 500-999
# belong to both windows.
set_1 = dict(list(data.items())[0:1000])
set_2 = dict(list(data.items())[500:1500])

In [8]:
#Dict_Cluster
def cluster_dict(labels, X_):
    """Pair every row of ``X_`` with its cluster label.

    Returns a flat list of single-entry dicts ``{label: dense_row}``, grouped
    by label in the order given by ``np.unique(labels)``. ``X_`` must support
    boolean-mask row selection and ``.toarray()`` (e.g. a CSR matrix).
    """
    return [
        {cluster_label: dense_row}
        for cluster_label in np.unique(labels)
        for dense_row in X_[labels == cluster_label].toarray()
    ]

In [9]:
# First window: feature matrix -> DBSCAN fit -> labels -> labelled points.
X_1 = transition_matrix(set_1, states)
clustering_1 = DBSCAN(eps=1.5, min_samples=10).fit(X_1)
labels_1 = clustering_1.labels_
cluster_dict_1 = cluster_dict(labels_1, X_1)

# Second window, same pipeline and same DBSCAN parameters.
X_2 = transition_matrix(set_2, states)
clustering_2 = DBSCAN(eps=1.5, min_samples=10).fit(X_2)
labels_2 = clustering_2.labels_
cluster_dict_2 = cluster_dict(labels_2, X_2)

# Cluster labels and their sizes per window.
print(np.unique(labels_1, return_counts=True))
print(np.unique(labels_2, return_counts=True))

(array([0, 1, 2], dtype=int64), array([485, 261, 254], dtype=int64))
(array([0, 1, 2], dtype=int64), array([283, 481, 236], dtype=int64))


To do:
- Das Array mit den Labels herausziehen und den Ablauf dynamisch gestalten: `list_cluster` iteriert über die Cluster und erstellt die Listen (`first_list` usw.). Diese sollten dann an eine Funktion weitergeleitet werden, die die Berechnungen dynamisch durchführt.

In [10]:
def list_cluster(cluster_dict_):
    """Summarise every cluster by the mean over all values of all its points.

    Parameters
    ----------
    cluster_dict_ : list of dict
        Single-entry dicts ``{label: point}`` as produced by ``cluster_dict``;
        ``point`` is a 1-D array (or list) of numbers.

    Returns
    -------
    list of list
        One ``[mean]`` entry per label found in ``cluster_dict_``, ordered by
        ascending label. The mean is a single scalar taken over every value of
        every point of that cluster. An empty input gives an empty list.

    Notes
    -----
    The labels are taken from ``cluster_dict_`` itself rather than from
    notebook globals, and points are matched by their actual label (not by
    position), so a noise label such as ``-1`` gets its own entry.
    """
    grouped = {}
    for entry in cluster_dict_:
        for label, point in entry.items():
            grouped.setdefault(label, []).append(np.asarray(point).tolist())

    return [[np.mean(grouped[label])] for label in sorted(grouped)]

# Per-cluster summaries of both windows; only the first window is printed here.
first_list, second_list = list_cluster(cluster_dict_1), list_cluster(cluster_dict_2)
print(first_list)


[[0.01171875], [0.022493414750957855], [0.015748031496062992]]


In [11]:
# Per-cluster summaries of the second window, printed for comparison with
# those of the first window.
print(second_list)

[[0.01577683303886926], [0.01171875], [0.022608746027542374]]


Kreuztest Verschiebung der Cluster

Source:
    
https://stackoverflow.com/questions/18237479/dbscan-in-scikit-learn-of-python-save-the-cluster-points-in-an-array


In [15]:
"""
TODO: Durchlauf, check if liste != dann die "alte" liste benutzen. Dann müssen diese punkte gar nicht mehr neu genommen werden
"""

from tqdm import tqdm

mylist = [first_list, second_list]
old_min_points = {}
cluster_mean_history={}

p
for index, value in enumerate(first_list):
    
    min_point_dict = {}
    value_subtraction = []
    second_list_tmp_dict = {}
    
    for second_index, second_value in enumerate(second_list):
        
        second_list_tmp_dict[second_index]=second_value
        
        min_point_dict[abs(np.array(value)-np.array(second_value))[0]]=[index, second_index]
        
        value_subtraction.append(abs(np.array(value)-np.array(second_value)))
        

    old_min_points[min_point_dict[min(value_subtraction)[0]][0]] = second_list_tmp_dict[
        min_point_dict[min(value_subtraction)[0]][1]
    ]
    
    
    
    """cluster_mean_history[min_point_dict[min(value_subtraction)[0]][0]] = second_list_tmp_dict[
    min_point_dict[min(value_subtraction)[0]][1]
    ]
    
    t = expand(cluster_mean_history, index)
    print(t)"""
    
    
        
    #print("min-point",min(value_subtraction)) #index noch außerhalb
    #print(min_point_dict[min(value_subtraction)[0]][1])

#old_min_points
cluster_mean_history

TypeError: ufunc 'subtract' did not contain a loop with signature matching types dtype('<U32') dtype('<U32') dtype('<U32')

In [1]:
def expand(cluster_mean_history, index):
    """Append the placeholder string ``"123"`` to one history entry.

    The list stored at ``cluster_mean_history[index]`` is modified in place
    and that same list object is returned.
    """
    history = cluster_mean_history[index]
    history.append("123")
    return history

---

In [25]:
# Small hand-made lists for experimenting with list comparison. Note that
# `first_list` and `second_list` are rebound here.
first_list = [[1, 3], [13, 2]]
first_list_1 = [[1, 3], [13, 2]]
second_list = [[1, 2], [13, 2]]

# Number of array dimensions of the nested list.
print(np.array(first_list).ndim)

2


In [3]:
# For every list in `mylist`, count how many of its entries are missing from
# the lists it is compared with (every list in `mylist`, itself included).
# The result holds one `{position: count}` dict per list that has at least
# one missing entry.
differences = []

mylist = [first_list, first_list_1, second_list]

for index, value in enumerate(mylist):
    # The loop variable is deliberately not called `list`: rebinding the
    # builtin at notebook scope breaks every later `list(...)` call.
    missing_count = sum(
        1
        for elements in mylist
        for entry in value
        if entry not in elements
    )
    if missing_count:
        differences.append({index: missing_count})
differences


[{0: 1}, {1: 1}, {2: 2}]

[[[1, 3], [13, 2]], [[1, 3], [13, 2]], [[1, 2], [13, 2]]]


Alle Punkte in einem Cluster miteinander vergleichen.

https://stackoverflow.com/questions/16603282/how-to-compare-each-item-in-a-list-with-the-rest-only-once

In [16]:
# Convert the inner lists to tuples (hashable, so they can be put into sets).
first_tuple_list = [*map(tuple, first_list)]
second_tuple_list = [*map(tuple, second_list)]
#print(first_tuple_list)
#print(first_tuple_list)

In [17]:
# Distinct entries of each tuple list.
first_set = {*first_tuple_list}
second_set = {*second_tuple_list}
#print(first_set)
#print(first_set)

In [18]:
# Same sets as above, built directly from the nested lists in one step.
first_set = {tuple(entry) for entry in first_list}
second_set = {tuple(entry) for entry in second_list}

In [19]:
# Number of entries that occur in both sets.
len(first_set & second_set)

8

Source: 

https://stackoverflow.com/questions/6105777/how-to-compare-a-list-of-lists-sets-in-python
https://stackoverflow.com/questions/1388818/how-can-i-compare-two-lists-in-python-and-return-matches

---