# Using DBSCAN as clustering method

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn import preprocessing
from scipy.sparse import csr_matrix
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets.samples_generator import make_blobs
from matplotlib import pyplot as plt

## Test

In [2]:
states = ["INITIAL","login","View_Items","home","logout","View_Items_quantity","Add_to_Cart","shoppingcart",
          "remove","deferorder","purchasecart","inventory","sellinventory","clearcart","cancelorder","$"]

### Makovchain & sparse matrix

In [3]:
def transition_matrix(sessions, states):
    sessionID = []
    markovchains = []
    for key, value in sessions.items():
        #print(key)
        # labelEncoding
        le = preprocessing.LabelEncoder()
        le.fit(value)
        transformed_s = le.transform(value)

        #factorize
        factorize = pd.factorize(value)[0]
        
        # matrix
        n = 1 + max(factorize)  # number of states
        M = [[0] * n for _ in range(n)]

        for (i, j) in zip(factorize, factorize[1:]):
            M[i][j] += 1
        
        # now convert to probabilities:
        for row in M:
            s = sum(row)
            if s > 0:
                row[:] = [f / s for f in row]
                
        # print Matrix style
        #for row in M: print(' '.join('{0:.2f}'.format(x) for x in row))
        
        # unique array in the right order
        value = np.array(value)
        _, idx = np.unique(value, return_index=True)
        
        df = pd.DataFrame(data = M, index=value[np.sort(idx)],
                          columns=value[np.sort(idx)])

        df_1 = pd.DataFrame(index=states, columns=states, dtype='float64')

        #merge = df_1.merge(df, how='right').fillna(0).round(2).set_index(value[np.sort(idx)])
        merge = df_1.update(df, join='left')
        #merge = pd.merge(df_1, df, how='right').fillna(0).round(2).set_index(value[np.sort(idx)])
        
        
        merge = pd.concat([pd.concat([df_1, df], axis=1, sort= False)], axis=0).fillna(0).round(2).iloc[:, :-n] 
        #merge = merge.iloc[:, :-n]        
        
        
        # convert into Vector
        merge = np.array(merge.values.flatten().tolist())
        #print(len(merge))
        # resize so the vectors got the same length
        #size = 16*16
        #merge.resize(size)
        
        # 2-D array 
        markovchains.append(merge)
        #print(len(markovchains))
        # csr sparse matrix
        csr = csr_matrix(markovchains)
        #print(csr)
        #markovchains.append(merge)
        sessionID.append(key)
    #print(sparse_sessions)
    #print(len(merge))
    return csr, sessionID

#m = transition_matrix(sessions, states)

## Backpropagate after clustering

In [4]:
#Data imports
PATH = "../../data/raw/"
sessions_file = (PATH+'sessions.dat')

In [5]:
def session_request_dict(sessions_file):
    s_r_dict = {}
    # Dict of sessions
    with open(sessions_file) as fn:
        sessions_raw = fn.readlines()

    for session in sessions_raw:
        key = re.search('([^.]+)', session).group()
        value = re.findall('\"(.*?)\"', session)
        s_r_dict[key] = value

    return s_r_dict

In [55]:
data = session_request_dict(sessions_file)

set_1 = {k: data[k] for k in list(data)[0:1000]}
set_2 = {k: data[k] for k in list(data)[500:1500]}

In [56]:
#Dict_Cluster
def cluster_dict(labels, X_):
    cluster_list =[]
    
    for label in np.unique(labels):
        points = X_[labels == label].toarray()
        
        for point in points:
            cluster_dict = {}
            cluster_dict[label] = point
            cluster_list.append(cluster_dict)
            
    return cluster_list

In [57]:
X_1, sparse_sessions = transition_matrix(set_1, states)
X_2, sparse_sessions2 = transition_matrix(set_2, states)

clustering_1 = DBSCAN(eps=1.5, min_samples=2).fit(X_1)
clustering_2 = DBSCAN(eps=1.5, min_samples=10).fit(X_2)

labels_1 = clustering_1.labels_
labels_2 = clustering_2.labels_

cluster_dict_1 = cluster_dict(labels_1, X_1)
cluster_dict_2 = cluster_dict(labels_2, X_2)

print(np.unique(labels_1, return_counts=True))
print(np.unique(labels_2, return_counts=True))

(array([0, 1, 2], dtype=int64), array([485, 261, 254], dtype=int64))
(array([0, 1, 2], dtype=int64), array([283, 481, 236], dtype=int64))


---

## With Cluster shifting

In [58]:
def list_cluster(cluster_dict_):
    cluster_list = []
    if np.unique(labels_1) in np.unique(labels_2):
        for cluster_index, value in enumerate(np.unique(labels_1)):
            tmp = []
            for item in cluster_dict_:
                for k,v in item.items():
                    if k == value:
                        tmp.append(v.tolist())
            cluster_list.append(np.mean(tmp, axis=0))
    return cluster_list

first_list = list_cluster(cluster_dict_1)

second_list = list_cluster(cluster_dict_2)

In [59]:
from tqdm import tqdm

mylist = [first_list, second_list]
old_min_points = {}
cluster_mean_history={}
min_point_label_list = []

for index, value in enumerate(first_list):

    min_point_dict = {}
    value_subtraction_sum_dict = {}
    second_list_dict = {}
    sum_mean_vector_list =[]
    
    for second_index, second_value in enumerate(second_list):

        # Indexing dict from second_list
        second_list_dict[second_index] = second_value
        # Min vector subtraction with first and second index
        min_point_dict[sum(abs(np.array(value)-np.array(second_value)))]=[index, second_index]
        # Sum of the subtraction and the abs vector subtraction (for matching) 
        value_subtraction_sum_dict[sum(abs(np.array(value)-np.array(second_value)))] = abs(np.array(value)-np.array(second_value))
        # sum mean vector as list for matching 
        sum_mean_vector_list.append(sum(abs(np.array(value)-np.array(second_value))))
    
    min_point_label_list.append(min_point_dict[min(sum_mean_vector_list)])
    #print(min_point_label_list)
    # Normal labled clusterpoints bevore compute the shifting
    old_min_points[min_point_dict[min(sum_mean_vector_list)][0]] = second_list_dict[
        value_subtraction_sum_dict[min(sum_mean_vector_list)][1]
    ]

    # Labeling the new cluster points to the shifting cluster
    cluster_mean_history[min_point_dict[min(sum_mean_vector_list)][0]] = second_list_dict[
        value_subtraction_sum_dict[min(sum_mean_vector_list)][1]
    ]


    # No. of cluster with corresponding clusterpoints
    tmp=[second_list_dict[min_point_dict[min(sum_mean_vector_list)][1]]]
    #tmp.append(value)
    cluster_mean_history[index]=tmp

cluster_mean_history

{0: [array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.93, 0.07, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0

In [60]:
min_point_label_list

[[0, 1], [1, 2], [2, 0]]

---

## Endresult

In [22]:
l = list(zip(labels_2, sparse_sessions2))
d = dict()
[d [t[0]].append(t[1]) if t[0] in list(d.keys()) 
 else d.update({t[0]: [t[1]]}) for t in l]
new_dict = dict([(str(k), v) for k, v in d.items()])

https://stackoverflow.com/questions/6586310/how-to-convert-list-of-key-value-tuples-into-dictionary

**Update the shifting key**

In [21]:
print (min_point_label_list)
for key_pair in min_point_label_list:
    new_dict[str(key_pair[0])] = new_dict.pop(str(key_pair[1]))
new_dict

[[0, 1], [1, 2], [2, 0]]


{'1': ['zVl+gTUcS5cD7GT7niUzXZDq',
  'XmrG5QadqBisU1+99nBT2bZS',
  '2-IfBZ-ee2vuDJBqA6WXoryv',
  'tXHfJ2-oBOSUdv4sJoMoEXdp',
  'KJVkcBwUAzVxaD0iDd3EkR6T',
  'fNnfACIcmu70nnG1JsKUjhC1',
  '64YZxIrQtaEtOxab57Pq6q3E',
  'lLG0LEZjH3TzuVoHZLJtUq5j',
  'yJ349DkjdoJfuzChj-vUB5rp',
  'UVLyEeXAzgrPG6YnxC26DNgj',
  '8TL6TC0p-AAc6eWyVyAip+Rv',
  'HbJu9nOJScmLd7EWvjRbjCDY',
  'KCrms2i0tWsQdlsM+478ruP4',
  'KsW-EV6G6G+i8+yumrmhPxRG',
  'SjwPe3Tnwlojy64S-be0KwdV',
  'z0cmiUToTU1ZVhso8BB5CTJa',
  '5Y0eOCCkGZRXinNG58vz0LF0',
  'D+5+57vOLPPTkf9SGZ3-vQqx',
  'EbftN9Pu4BZq2ekagC6LQULq',
  'qTneaKXIvwHG8ZKgtrYyYSEE',
  'jT0MH61pa0cS-YCOvtpPJg-o',
  'f5R2X+hWpwiJYdqeERT+3b7V',
  'vJF7WxZZn313KKgoIdtGt+vV',
  'X5OQ22YoxTr0PxCapblBch8z',
  'saZxqMGl3wrqQjd3MqPi6o1l',
  'G9nVy4r7+tAZKJCC9Fvbrr77',
  'EpVS4ngxxTyHkGpxLRdoUCab',
  'Dq64BoTPyInELGxXTsLVuZ80',
  'O39MA558e+zn84B0XWU1Tm61',
  'UlnzQmO8U4SU8K8QYH-vgqAw',
  'O8YEOChW73Frh3zmACJ7hERo',
  '4fH6O+kvKCqgr-AZOnBiZBHR',
  'bJj6I6ONVRWtq4VDOx8q0veQ',
  '3R

https://stackoverflow.com/questions/4406501/change-the-name-of-a-key-in-dictionary