In [8]:
from sklearn.cluster import DBSCAN
from itertools import product
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from kneed import KneeLocator

def remove_outliers(df_):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numerical_frame = df_.select_dtypes(include=numerics)
    for colname in numerical_frame.columns:
        Q1 = df_[colname].quantile(0.25)
        Q3 = df_[colname].quantile(0.75)
        IQR = Q3 - Q1
        df_ = df_[~((df_[colname]<(Q1-1.5*IQR))|(df_[colname]>(Q3+1.5*IQR)))]
    return df_

In [9]:
raw_data = pd.read_csv('Model_data.csv', index_col='TOTOID')
raw_data.rename(columns={'Lag':'DepositLag', 'AVGLag':'BetsLag'}, inplace=True)
raw_data.drop(columns=['UserID', 'RegDate'], inplace=True)

In [10]:
raw_data = remove_outliers(raw_data)
raw_data['One_game'] = raw_data['One_game'].astype(int)
raw_data = pd.get_dummies(raw_data, prefix=['main', 'second'])
columns_to_scale = ['Age', 'DepositCount', 'DepositAmount', 'DepositLag', 'BetsLag']
minmax = raw_data.copy()
scaler = MinMaxScaler()
scaler.fit(minmax[columns_to_scale])
minmax[columns_to_scale] = scaler.transform(minmax[columns_to_scale])
minmax.drop_duplicates(keep='first', inplace=True)
minmax.describe().apply(lambda s: s.apply('{0:.5f}'.format))

Unnamed: 0,Age,DepositCount,DepositAmount,DepositLag,BetsLag,One_game,main_Betongames,main_Live Casino,main_Other,main_P2P,...,main_TVGames,main_Virtual Games,second_Betongames,second_Live Casino,second_Other,second_P2P,second_Slots,second_Sports,second_TVGames,second_Virtual Games
count,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,...,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0
mean,0.34964,0.17674,0.19998,0.21346,0.32276,0.94345,0.03755,0.00212,0.00022,0.00351,...,0.00027,0.0001,0.81628,0.02071,0.03154,0.04112,0.04391,0.02964,0.00112,0.00385
std,0.24861,0.20186,0.23535,0.22211,0.17417,0.23098,0.1901,0.04596,0.01498,0.05915,...,0.01651,0.00981,0.38726,0.14243,0.17476,0.19858,0.20491,0.16961,0.03348,0.06191
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.14894,0.03311,0.02784,0.05809,0.21298,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.29787,0.09934,0.0985,0.12164,0.25606,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.48936,0.25166,0.29146,0.29043,0.36481,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
standard = raw_data.copy()
scaler = StandardScaler()
scaler.fit(standard[columns_to_scale])
standard[columns_to_scale] = scaler.transform(standard[columns_to_scale])
standard.drop_duplicates(keep='first', inplace=True)
standard.describe().apply(lambda s: s.apply('{0:.5f}'.format))

Unnamed: 0,Age,DepositCount,DepositAmount,DepositLag,BetsLag,One_game,main_Betongames,main_Live Casino,main_Other,main_P2P,...,main_TVGames,main_Virtual Games,second_Betongames,second_Live Casino,second_Other,second_P2P,second_Slots,second_Sports,second_TVGames,second_Virtual Games
count,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,...,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0
mean,0.0,1e-05,1e-05,-5e-05,0.0,0.94345,0.03755,0.00212,0.00022,0.00351,...,0.00027,0.0001,0.81628,0.02071,0.03154,0.04112,0.04391,0.02964,0.00112,0.00385
std,1.00002,1.00001,1.00001,0.99994,1.00002,0.23098,0.1901,0.04596,0.01498,0.05915,...,0.01651,0.00981,0.38726,0.14243,0.17476,0.19858,0.20491,0.16961,0.03348,0.06191
min,-1.4064,-0.87552,-0.84971,-0.96102,-1.85315,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.80732,-0.71148,-0.73141,-0.69952,-0.63033,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,-0.20824,-0.38341,-0.43117,-0.4134,-0.38296,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.56201,0.37116,0.3887,0.34648,0.24139,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.61599,4.07838,3.39924,3.54096,3.88837,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
standard_full = raw_data.copy()
scaler = StandardScaler()
scaler.fit(standard_full[standard_full.columns])
standard_full[standard_full.columns] = scaler.transform(standard_full[standard_full.columns])
standard_full.drop_duplicates(keep='first', inplace=True)
standard_full.describe().apply(lambda s: s.apply('{0:.5f}'.format))

Unnamed: 0,Age,DepositCount,DepositAmount,DepositLag,BetsLag,One_game,main_Betongames,main_Live Casino,main_Other,main_P2P,...,main_TVGames,main_Virtual Games,second_Betongames,second_Live Casino,second_Other,second_P2P,second_Slots,second_Sports,second_TVGames,second_Virtual Games
count,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,...,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0,62372.0
mean,0.0,1e-05,1e-05,-5e-05,0.0,-0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,1.00002,1.00001,1.00001,0.99994,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,...,1.00002,1.00002,1.00001,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002
min,-1.4064,-0.87552,-0.84971,-0.96102,-1.85315,-4.08466,-0.19752,-0.04605,-0.01498,-0.05936,...,-0.01651,-0.00981,-2.10788,-0.14544,-0.18045,-0.20709,-0.21431,-0.17479,-0.03352,-0.06215
25%,-0.80732,-0.71148,-0.73141,-0.69952,-0.63033,0.24482,-0.19752,-0.04605,-0.01498,-0.05936,...,-0.01651,-0.00981,0.47441,-0.14544,-0.18045,-0.20709,-0.21431,-0.17479,-0.03352,-0.06215
50%,-0.20824,-0.38341,-0.43117,-0.4134,-0.38296,0.24482,-0.19752,-0.04605,-0.01498,-0.05936,...,-0.01651,-0.00981,0.47441,-0.14544,-0.18045,-0.20709,-0.21431,-0.17479,-0.03352,-0.06215
75%,0.56201,0.37116,0.3887,0.34648,0.24139,0.24482,-0.19752,-0.04605,-0.01498,-0.05936,...,-0.01651,-0.00981,0.47441,-0.14544,-0.18045,-0.20709,-0.21431,-0.17479,-0.03352,-0.06215
max,2.61599,4.07838,3.39924,3.54096,3.88837,0.24482,5.06284,21.71457,66.7399,16.84661,...,60.56402,101.95342,0.47441,6.87578,5.54163,4.82876,4.66607,5.72131,29.83359,16.08998


In [6]:
eps_values = np.arange(8, 12.75, 0.25)
min_samples = np.arange(3, 10)

DBSCAN_params = list(product(eps_values, min_samples))

In [None]:
from sklearn.metrics import silhouette_score

no_of_clusters = []
sil_score = []

for p in DBSCAN_params:
    DBS_clustering = DBSCAN(eps=p[0], min_samples=p[1]).fit(minmax)
    no_of_clusters.append(len(np.unique(DBS_clustering.labels_)))
    sil_score.append(silhouette_score(minmax, DBS_clustering.labels_))

Exception ignored in: Exception ignored in: 'sklearn.metrics._pairwise_distances_reduction.FastEuclideanPairwiseDistancesRadiusNeighborhood._compute_and_reduce_distances_on_chunks'
Exception ignored in: 'sklearn.metrics._pairwise_distances_reduction.FastEuclideanPairwiseDistancesRadiusNeighborhood._compute_and_reduce_distances_on_chunks'
Exception ignored in: 'sklearn.metrics._pairwise_distances_reduction.FastEuclideanPairwiseDistancesRadiusNeighborhood._compute_and_reduce_distances_on_chunks'
Exception ignored in: 'sklearn.metrics._pairwise_distances_reduction.FastEuclideanPairwiseDistancesRadiusNeighborhood._compute_and_reduce_distances_on_chunks'
Exception ignored in: 'sklearn.metrics._pairwise_distances_reduction.FastEuclideanPairwiseDistancesRadiusNeighborhood._compute_and_reduce_distances_on_chunks'
Exception ignored in: 'sklearn.metrics._pairwise_distances_reduction.FastEuclideanPairwiseDistancesRadiusNeighborhood._compute_and_reduce_distances_on_chunks'
Exception ignored in: 's

MemoryError: bad allocation

Exception ignored in: 'sklearn.metrics._pairwise_distances_reduction.FastEuclideanPairwiseDistancesRadiusNeighborhood._compute_and_reduce_distances_on_chunks'
Traceback (most recent call last):
  File "C:\Users\narek.meloyan\PycharmProjects\Segmentation_final\venv\lib\site-packages\sklearn\neighbors\_base.py", line 1097, in radius_neighbors
    results = PairwiseDistancesRadiusNeighborhood.compute(
MemoryError: bad allocation
