# DBSCAN
This notebook is split into three sections with one for every algorithm.
Essentially, the idea is to cluster movies and users so that we can
recommend the entire cluster they belong to, for both users and movies.

Compared to the other algorithms, this means that we will not be able
to give exactly how much they like a given movie, but just give them
the cluster they reside in.

## Initialise PySpark and data

In [23]:
from random import random
import time

from pyspark.ml.linalg import DenseVector
from pyspark.mllib.random import RandomRDDs
import pyspark.sql
import pyspark
from pyspark import SparkContext, SparkConf, SQLContext
from sklearn.cluster import DBSCAN
import os
import numpy as np

from density.slides_dbscan import my_DBSCAN

if os.path.basename(os.getcwd()) == 'density':
    print("Current dir is", os.getcwd())
    print("Changing dir to be in root")
    os.chdir('..')
    print('now in', os.getcwd())

from proposal.useful_tools import pickle_manager

SPARK_CONF = SparkConf()
SPARK_CONF.set("spark.driver.memory", "10g")
SPARK_CONF.set("spark.cores.max", "4")
SPARK_CONF.set("spark.executor.heartbeatInterval", "3600")
SPARK_CONF.setAppName("word2vec")

SPARK_CONTEXT = SparkContext.getOrCreate(SPARK_CONF)
SPARK = SQLContext(SPARK_CONTEXT)

np.random.seed(42)

In [24]:
# Assume that I can do this
MOVIES: pyspark.sql.DataFrame = 1  # data frame

# Just so that we have something lets just go ahead and do this
MOVIES_SIMILARITY_MATRIX = pickle_manager.load_pickle('pickles/similarity_matrix.pickle.lz4')

# To keep things fair, initialise the necessary parameters right at the start
MOVIES_RADIUS = 0.8
MOVIES_MINIMUM_POINTS = 5

In [25]:
def crush(x):
    if x < 0:
        x = 0
    elif x > 1:
        x = 1
    return x


print(MOVIES_SIMILARITY_MATRIX)
crush_v = np.vectorize(crush)
MOVIES_SIMILARITY_MATRIX = crush_v(np.subtract(1.0, MOVIES_SIMILARITY_MATRIX))
print("Number of different distances", len(set(MOVIES_SIMILARITY_MATRIX.flatten())))

print("############")
print(MOVIES_SIMILARITY_MATRIX)

[[1.         0.05882353 0.06063391 ... 0.06063391 0.         0.        ]
 [0.05882353 1.         0.06063391 ... 0.         0.         0.070014  ]
 [0.06063391 0.06063391 1.         ... 0.         0.         0.        ]
 ...
 [0.06063391 0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.070014   0.         ... 0.         0.         1.        ]]
Number of different distances 267
############
[[0.         0.94117647 0.93936609 ... 0.93936609 1.         1.        ]
 [0.94117647 0.         0.93936609 ... 1.         1.         0.929986  ]
 [0.93936609 0.93936609 0.         ... 1.         1.         1.        ]
 ...
 [0.93936609 1.         1.         ... 0.         1.         1.        ]
 [1.         1.         1.         ... 1.         0.         1.        ]
 [1.         0.929986   1.         ... 1.         1.         0.        ]]


## Scikit DBSCAN
[Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html#sklearn.cluster.DBSCAN)
### Implementation

In [26]:
start = time.perf_counter()
scikit_movies_clustering = DBSCAN(
    eps=MOVIES_RADIUS, min_samples=MOVIES_MINIMUM_POINTS, metric='precomputed', n_jobs=-1
).fit(MOVIES_SIMILARITY_MATRIX)
print("Took", time.perf_counter() - start, "seconds")

Took 0.03553749999991851 seconds


In [27]:
def count_noise(labels):
    count = 0
    for i in labels:
        if i == -1:
            count += 1
    return count


print(scikit_movies_clustering.labels_)
print("Unique labels", set(scikit_movies_clustering.labels_))
print("Number of noise points", count_noise(scikit_movies_clustering.labels_))

print("Frequency table", np.unique(np.unique(scikit_movies_clustering.labels_, return_counts=True)))

[-1 -1 -1 ... -1 -1 -1]
Unique labels {0, 1, -1}
Number of noise points 1021
Frequency table [  -1    0    1    3  976 1021]


## PyClustering DBSCAN
### [Documentation](https://pyclustering.github.io/docs/0.10.1/html/d2/d42/classpyclustering_1_1cluster_1_1dbscan_1_1dbscan.html#details)

In [28]:
from pyclustering.cluster.dbscan import dbscan

start = time.perf_counter()
pyclustering_movies = dbscan(
    MOVIES_SIMILARITY_MATRIX, MOVIES_RADIUS, MOVIES_MINIMUM_POINTS, True, **{'data_type': 'distance_matrix'}
)
pyclustering_movies.process()
print("Took", time.perf_counter() - start, "seconds")

print("Clusters:", pyclustering_movies.get_clusters())
print("Noise:", pyclustering_movies.get_noise())
print("Number of noise points", len(pyclustering_movies.get_noise()))
print("Frequency table", np.unique(np.unique(pyclustering_movies.get_clusters(), return_counts=True)))

Took 1.0414111000009143 seconds
Clusters: [[4, 155, 269, 296, 431, 903, 954, 1036, 1159, 1406, 1835, 1865, 113, 352, 571, 1136, 53, 116, 338, 346, 428, 461, 565, 626, 714, 867, 979, 996, 1061, 1115, 1255, 1311, 1312, 1603, 1769, 1840, 1853, 1906, 1924, 6, 8, 17, 18, 32, 67, 96, 191, 256, 285, 306, 311, 312, 314, 496, 524, 538, 548, 573, 635, 636, 661, 671, 672, 680, 710, 713, 802, 876, 895, 907, 936, 986, 1008, 1104, 1106, 1118, 1125, 1132, 1187, 1199, 1213, 1237, 1243, 1245, 1254, 1266, 1270, 1274, 1380, 1401, 1454, 1459, 1468, 1543, 1558, 1566, 1653, 1654, 1702, 1771, 1778, 1790, 1797, 1817, 1845, 1855, 1864, 1879, 1882, 1884, 1900, 1908, 1921, 1945, 1962, 1979, 1992, 30, 82, 103, 241, 289, 458, 464, 528, 542, 638, 649, 678, 754, 1056, 1082, 1149, 1182, 1186, 1226, 1263, 1276, 1358, 1371, 1389, 1535, 1540, 1712, 1734, 1735, 1762, 1811, 1919, 73, 110, 196, 226, 351, 662, 685, 728, 830, 911, 917, 939, 1016, 1071, 1151, 1164, 1167, 1354, 1368, 1486, 1488, 1505, 1538, 1572, 1613, 1913, 1

## Slides
### Implementation
Since we made a class that just inherits the scikit dbscan and replaces the fit function, we should
just be able to do the same process here as the scikit section

In [29]:
start = time.perf_counter()
my_dbscan_movies_clustering = my_DBSCAN(
    eps=MOVIES_RADIUS, min_samples=MOVIES_MINIMUM_POINTS, metric='precomputed', n_jobs=-1
).fit(MOVIES_SIMILARITY_MATRIX.copy())
print("Took", time.perf_counter() - start, "seconds")

Took 0.8961123999997653 seconds


In [30]:
print(set(my_dbscan_movies_clustering.labels_))
print("Number of noise points", count_noise(my_dbscan_movies_clustering.labels_))
print("Frequency table", np.unique(np.unique(my_dbscan_movies_clustering.labels_, return_counts=True)))

{0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0}
Number of noise points 1807
Frequency table [-1.000e+00  0.000e+00  1.000e+00  2.000e+00  3.000e+00  4.000e+00
  5.000e+00  6.000e+00  7.000e+00  8.000e+00  9.000e+00  1.000e+01
  1.100e+01  1.200e+01  1.300e+01  1.400e+01  1.500e+01  1.600e+01
  2.100e+01  2.500e+01  1.807e+03]


## Evaluation

In [31]:
from sklearn import metrics


def evaluate_clustering_numerically(labels, similarity_matrix):
    """
    All of the prints that are needed to represent how well the data is clustered
    :param labels:
    :return:
    """
    print("Number of clusters", len(set(labels)) - (1 if -1 in labels else 0))
    print("Noise fraction", list(labels).count(-1) / len(labels))
    print("Silhouette Coefficient", metrics.silhouette_score(similarity_matrix, labels))

In [32]:
evaluate_clustering_numerically(
    scikit_movies_clustering.labels_,
    MOVIES_SIMILARITY_MATRIX
)

Number of clusters 2
Noise fraction 0.5105
Silhouette Coefficient -0.01368109605920541


In [33]:
evaluate_clustering_numerically(
    pyclustering_movies.get_clusters(),
    MOVIES_SIMILARITY_MATRIX
)

TypeError: unhashable type: 'list'

In [34]:
evaluate_clustering_numerically(
    my_dbscan_movies_clustering.labels_,
    MOVIES_SIMILARITY_MATRIX
)

Number of clusters 16
Noise fraction 0.9035
Silhouette Coefficient -0.2345095542966171
