# Using DBSCAN as clustering method

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn import preprocessing
from scipy.sparse import csr_matrix
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from matplotlib import pyplot as plt

## Test

In [2]:
# Fixed vocabulary of state names; transition_matrix uses it as the row and
# column labels of every per-session matrix.
# NOTE(review): "INITIAL" and "$" look like start/end markers - confirm against the session data.
states = ["INITIAL","login","View_Items","home","logout","View_Items_quantity","Add_to_Cart","shoppingcart",
          "remove","deferorder","purchasecart","inventory","sellinventory","clearcart","cancelorder","$"]

### Markov chain & sparse matrix

In [3]:
def transition_matrix(sessions, states):
    """Encode every session as a flattened transition-probability matrix.

    For each session the consecutive pairs of entries are counted as
    transitions, each row of counts is normalised to probabilities, the
    result is laid out on a fixed ``len(states) x len(states)`` grid (rows
    and columns ordered like ``states``), rounded to two decimals and
    flattened row by row.

    Parameters
    ----------
    sessions : dict
        Maps a session key to the ordered sequence of state names of that
        session. Only the values are used.
    states : sequence
        State names that define the rows/columns of the output grid.

    Returns
    -------
    scipy.sparse.csr_matrix
        One row per session, ``len(states) ** 2`` columns. An empty
        ``sessions`` mapping yields a matrix with zero rows.

    Notes
    -----
    * A state that does not occur in ``states`` still takes part in the row
      normalisation, but has no cell in the output grid, so every session
      vector keeps the same length.
    * A session with fewer than two entries produces an all-zero row.
    """
    position = {state: idx for idx, state in enumerate(states)}
    n_states = len(states)
    vectors = []

    for value in sessions.values():
        # Session-local integer codes, numbered in order of first appearance.
        local_code = {}
        codes = [local_code.setdefault(state, len(local_code)) for state in value]
        n_local = len(local_code)

        # Count transitions between consecutive entries of the session.
        counts = np.zeros((n_local, n_local), dtype=float)
        for src, dst in zip(codes, codes[1:]):
            counts[src, dst] += 1

        # Turn counts into probabilities; rows without outgoing transitions stay zero.
        row_totals = counts.sum(axis=1, keepdims=True)
        probabilities = np.divide(
            counts, row_totals, out=np.zeros_like(counts), where=row_totals > 0
        )

        # Scatter the session-local matrix into the global state grid.
        full = np.zeros((n_states, n_states), dtype=float)
        known = [
            (code, position[state])
            for state, code in local_code.items()
            if state in position
        ]
        if known:
            local_idx, global_idx = (list(axis) for axis in zip(*known))
            full[np.ix_(global_idx, global_idx)] = probabilities[np.ix_(local_idx, local_idx)]

        vectors.append(np.round(full, 2).ravel())

    # Build the sparse matrix once, after all sessions have been encoded.
    if not vectors:
        return csr_matrix((0, n_states * n_states), dtype=float)
    return csr_matrix(np.vstack(vectors))

#m = transition_matrix(sessions, states)

## Backpropagate after clustering

In [4]:
#Data imports
# Relative directory that holds the raw session log; sessions_file is the
# full relative path later handed to session_request_dict.
PATH = "../../data/raw/"
sessions_file = (PATH+'sessions.dat')

In [5]:
def session_request_dict(sessions_file):
    """Parse a session log file into a ``{session key: [requests]}`` dict.

    Every line of the file becomes one entry. The key is the first run of
    characters that contains no ``.``; the value is the list of all
    double-quoted substrings on that line, in order. If a key occurs on
    several lines, the last line wins.
    """
    key_pattern = re.compile('([^.]+)')
    request_pattern = re.compile('\"(.*?)\"')

    with open(sessions_file) as handle:
        lines = handle.readlines()

    return {
        key_pattern.search(line).group(): request_pattern.findall(line)
        for line in lines
    }

In [6]:
# Parse the session log, then take two overlapping windows of sessions in
# insertion order: positions 0-999 and positions 500-1499.
data = session_request_dict(sessions_file)

set_1 = dict(list(data.items())[0:1000])
set_2 = dict(list(data.items())[500:1500])

In [7]:
#Dict_Cluster
def cluster_dict(labels, X_):
    """Pair every row of ``X_`` with its cluster label.

    Returns a list of single-entry dicts ``{label: dense_row}``. Rows are
    grouped by label in the order given by ``np.unique(labels)``; within a
    label they keep their original row order.
    """
    return [
        {label: point}
        for label in np.unique(labels)
        for point in X_[labels == label].toarray()
    ]

In [8]:
# Encode both session windows as sparse transition vectors.
X_1, X_2 = (transition_matrix(batch, states) for batch in (set_1, set_2))

# Cluster each window independently with identical DBSCAN settings.
clustering_1, clustering_2 = (
    DBSCAN(eps=1.5, min_samples=10).fit(features) for features in (X_1, X_2)
)
labels_1, labels_2 = clustering_1.labels_, clustering_2.labels_

# Attach the assigned label to every encoded session.
cluster_dict_1, cluster_dict_2 = (
    cluster_dict(assigned, features)
    for assigned, features in ((labels_1, X_1), (labels_2, X_2))
)

# Label values and their sizes, one line per window.
print(
    np.unique(labels_1, return_counts=True),
    np.unique(labels_2, return_counts=True),
    sep="\n",
)

(array([0, 1, 2], dtype=int64), array([485, 261, 254], dtype=int64))
(array([0, 1, 2], dtype=int64), array([283, 481, 236], dtype=int64))


---

In [16]:
def list_cluster(cluster_dict_):
    """Return the mean vector of every cluster found in ``cluster_dict_``.

    Parameters
    ----------
    cluster_dict_ : list of dict
        Single-entry dicts ``{label: point}`` as produced by ``cluster_dict``.

    Returns
    -------
    list of numpy.ndarray
        One element-wise mean vector per cluster, in ascending label order.
        An empty input gives an empty list.

    Notes
    -----
    The clusters are taken from the argument itself instead of from the
    notebook-level ``labels_1`` / ``labels_2``, so the result no longer
    depends on which cells ran before. Points labelled ``-1`` (DBSCAN noise)
    are left out; the previous implementation never averaged them either.
    """
    points_by_label = {}
    for item in cluster_dict_:
        for label, point in item.items():
            if label == -1:
                # Noise is not a cluster, so it gets no mean vector.
                continue
            points_by_label.setdefault(label, []).append(np.asarray(point, dtype=float))

    return [np.mean(points_by_label[label], axis=0) for label in sorted(points_by_label)]

# Results of list_cluster for the two cluster dicts built from the two
# session windows.
first_list = list_cluster(cluster_dict_1)

second_list = list_cluster(cluster_dict_2)

In [11]:
def one_cluster(cluster_dict_):
    """Return the mean vector of every cluster, keyed by its position as a string.

    Parameters
    ----------
    cluster_dict_ : list of dict
        Single-entry dicts ``{label: point}`` as produced by ``cluster_dict``.

    Returns
    -------
    dict
        ``{'0': mean_vector, '1': mean_vector, ...}`` where the position
        follows ascending label order. An empty input gives an empty dict.

    Notes
    -----
    The clusters are taken from the argument itself instead of from the
    notebook-level ``labels_1`` / ``labels_2``. Points labelled ``-1``
    (DBSCAN noise) are left out; the previous implementation never averaged
    them either.
    """
    points_by_label = {}
    for item in cluster_dict_:
        for label, point in item.items():
            if label == -1:
                # Noise is not a cluster, so it gets no mean vector.
                continue
            points_by_label.setdefault(label, []).append(np.asarray(point, dtype=float))

    return {
        str(index): np.mean(points_by_label[label], axis=0)
        for index, label in enumerate(sorted(points_by_label))
    }

# NOTE(review): this rebinds first_list to the dict returned by one_cluster
# (keys are str indices). Code that enumerates first_list after this cell
# iterates over those keys, not over the mean vectors.
first_list = one_cluster(cluster_dict_1)
first_list

{'0': array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.93, 0.07, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0

https://stackoverflow.com/questions/21193682/convert-a-string-key-to-int-in-a-dictionary

https://stackoverflow.com/questions/16819222/how-to-return-dictionary-keys-as-a-list-in-python

In [20]:
from tqdm import tqdm

mylist = [first_list, second_list]
old_min_points = {}
cluster_mean_history = {}

# first_list / second_list may be lists of mean vectors or dicts of them
# (one_cluster returns a dict); iterate over the vectors in both cases.
first_means = list(first_list.values()) if isinstance(first_list, dict) else list(first_list)
second_means = list(second_list.values()) if isinstance(second_list, dict) else list(second_list)

for index, value in enumerate(first_means):
    # L1 distance between this mean vector and every mean vector of the
    # second clustering.
    distances = [
        np.abs(np.asarray(value) - np.asarray(second_value)).sum()
        for second_value in second_means
    ]

    # Position of the closest second-clustering vector (lowest index on ties).
    # The match is looked up by this position; the earlier code indexed with
    # an element of the difference vector instead.
    best_match = int(np.argmin(distances))

    # Matched vector of the second clustering, stored under the index of the
    # first-clustering cluster it was matched to.
    old_min_points[index] = second_means[best_match]

    # History per cluster index: a list holding the matched vector.
    cluster_mean_history[index] = [second_means[best_match]]


print(cluster_mean_history)

{0: [array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.93, 0.07, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0. 

       0.        ])]}


In [11]:
#print(cluster_mean_history)
# Print every stored history entry as rows of len(states) values.
for history in cluster_mean_history.values():
    print(np.asarray(history).reshape(-1, len(states)))

[[0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.93 0.07 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.

---

In [15]:
def list_cluster(cluster_dict_):
    """Return one scalar mean per cluster, each wrapped in a one-element list.

    This variant averages over all values of all points of a cluster
    (a single number per cluster), unlike an element-wise mean vector.

    Parameters
    ----------
    cluster_dict_ : list of dict
        Single-entry dicts ``{label: point}`` as produced by ``cluster_dict``.

    Returns
    -------
    list of list
        ``[[mean_of_cluster], ...]`` in ascending label order. An empty
        input gives an empty list.

    Notes
    -----
    The clusters are taken from the argument itself instead of from the
    notebook-level ``labels_1`` / ``labels_2``. Points labelled ``-1``
    (DBSCAN noise) are left out; the previous implementation never averaged
    them either.
    """
    points_by_label = {}
    for item in cluster_dict_:
        for label, point in item.items():
            if label == -1:
                # Noise is not a cluster, so it gets no mean value.
                continue
            points_by_label.setdefault(label, []).append(np.asarray(point, dtype=float))

    return [[np.mean(points_by_label[label])] for label in sorted(points_by_label)]

# Rebinds first_list and second_list with the list_cluster results for the
# two cluster dicts.
# NOTE(review): earlier cells assign the same two names; which value is live
# depends on the order in which the cells were run.
first_list = list_cluster(cluster_dict_1)

second_list = list_cluster(cluster_dict_2)
#print(first_list)


Source:
    
https://stackoverflow.com/questions/18237479/dbscan-in-scikit-learn-of-python-save-the-cluster-points-in-an-array


In [18]:
"""
TODO: on each run, check whether the list differs (!=) and in that case use the "old" list. These points then do not have to be taken again.
"""

# NOTE(review): tqdm is imported here but not used in this cell.
from tqdm import tqdm

# NOTE(review): mylist is not read again in this cell.
mylist = [first_list, second_list]
old_min_points = {}
cluster_mean_history={}


# For every entry of first_list, find the entry of second_list whose first
# element is closest (smallest absolute difference) and record the pairing.
# Assumes every entry is a one-element list such as [mean] - TODO confirm.
for index, value in enumerate(first_list):
    #print(index, value)
    # absolute difference -> [index in first_list, index in second_list];
    # equal differences share a key, so a later pair overwrites an earlier one
    min_point_dict = {}
    # absolute differences to every entry of second_list, in order
    value_subtraction = []
    # position -> entry of second_list (rebuilt on every outer iteration)
    second_list_tmp_dict = {}
    
    for second_index, second_value in enumerate(second_list):
        
        second_list_tmp_dict[second_index] = second_value
        
        min_point_dict[abs(np.array(value)-np.array(second_value))[0]]=[index, second_index]
        
        value_subtraction.append(abs(np.array(value)-np.array(second_value)))
    
    print(second_list_tmp_dict)
    #print(min_point_dict)
    
    # min(value_subtraction)[0] is the smallest difference; it is used as the
    # key into min_point_dict to recover [index, second_index].
    # Store the closest second_list entry under the first_list index.
    old_min_points[min_point_dict[min(value_subtraction)[0]][0]] = second_list_tmp_dict[
        min_point_dict[min(value_subtraction)[0]][1]
    ]
    print(old_min_points)
    
    # Same value as above; the key equals index, so this entry is replaced by
    # the assignment of tmp further down.
    cluster_mean_history[min_point_dict[min(value_subtraction)[0]][0]] = second_list_tmp_dict[
    min_point_dict[min(value_subtraction)[0]][1]
    ]
    
    # History for this index: [first element of the matched second_list entry,
    # first element of the first_list entry]
    tmp=[second_list_tmp_dict[min_point_dict[min(value_subtraction)[0]][1]][0]]
    tmp.append(value[0])
    #print(tmp)
    cluster_mean_history[index]=tmp
    

    print("min-point",min(value_subtraction)) # index still outside
    print(min_point_dict[min(value_subtraction)[0]][1])


cluster_mean_history

{0: [0.01577683303886926], 1: [0.01171875], 2: [0.022608746027542374]}
{0: [0.01171875]}
min-point [0.]
1
{0: [0.01577683303886926], 1: [0.01171875], 2: [0.022608746027542374]}
{0: [0.01171875], 1: [0.022608746027542374]}
min-point [0.00011533]
2
{0: [0.01577683303886926], 1: [0.01171875], 2: [0.022608746027542374]}
{0: [0.01171875], 1: [0.022608746027542374], 2: [0.01577683303886926]}
min-point [2.88015428e-05]
0


{0: [0.01171875, 0.01171875],
 1: [0.022608746027542374, 0.022493414750957855],
 2: [0.01577683303886926, 0.015748031496062992]}

---

**Plot history values**

In [None]:
import matplotlib.pyplot as plt

# Print and plot the history of every cluster that is actually present,
# instead of assuming exactly the cluster ids 0, 1 and 2.
cluster_ids = sorted(cluster_mean_history)
print(*(cluster_mean_history[cluster_id] for cluster_id in cluster_ids))

fig, ax = plt.subplots()
for cluster_id in cluster_ids:
    ax.plot(cluster_mean_history[cluster_id], label="cluster {}".format(cluster_id))
ax.set(title="Cluster mean history", xlabel="history step", ylabel="mean value")
if cluster_ids:
    # legend() warns when there is nothing to label
    ax.legend()
plt.show()

---

Compare all points within a cluster with one another.

https://stackoverflow.com/questions/16603282/how-to-compare-each-item-in-a-list-with-the-rest-only-once

Source: 

https://stackoverflow.com/questions/6105777/how-to-compare-a-list-of-lists-sets-in-python
https://stackoverflow.com/questions/1388818/how-can-i-compare-two-lists-in-python-and-return-matches

---