In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.cluster import KMeans, DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import math

In [2]:
data = pd.read_csv("drug_consumption.csv")
data.head()

Unnamed: 0,ID,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,...,Ecstacy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
0,1,0.49788,0.48246,-0.05921,0.96082,0.126,0.31287,-0.57545,-0.58331,-0.91699,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL2,CL0,CL0
1,2,-0.07854,-0.48246,1.98437,0.96082,-0.31685,-0.67825,1.93886,1.43533,0.76096,...,CL4,CL0,CL2,CL0,CL2,CL3,CL0,CL4,CL0,CL0
2,3,0.49788,-0.48246,-0.05921,0.96082,-0.31685,-0.46725,0.80523,-0.84732,-1.6209,...,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0,CL0
3,4,-0.95197,0.48246,1.16365,0.96082,-0.31685,-0.14882,-0.80615,-0.01928,0.59042,...,CL0,CL0,CL2,CL0,CL0,CL0,CL0,CL2,CL0,CL0
4,5,0.49788,0.48246,1.98437,0.96082,-0.31685,0.73545,-1.6334,-0.45174,-0.30172,...,CL1,CL0,CL0,CL1,CL0,CL0,CL2,CL2,CL0,CL0


In [3]:
#Source: https://drugs.laws.com/list-of-illegal-drugs
#Grouping Illegal and Non-Illegal drugs
illegal_drugs = ['Amphet','Coke','Crack','Ecstacy','Heroin','LSD','Mushrooms']
Non_illegal =[i for i in data.columns[13:] if i not in illegal_drugs]

In [4]:
#Stripping all 'CL' from all the drug columns
for drug in data.columns[13:]:
        data[drug] = data[drug].map(lambda x: str(x).lstrip('CL'))

In [5]:
#Start with Clusters
#Cluster1: Using all the columns
#Cluster1 with Hierarchial clustering single linkage

In [6]:
variables = data.columns
var_indices = [data.columns.get_loc(variable) for variable in variables]

In [7]:
print(var_indices)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]


In [8]:
#Standardizing the data
x = data.iloc[:,var_indices]
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [9]:
clustering = linkage(x_scaled,method="single",metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[1 1 1 ... 1 1 1]


In [10]:
data['clusters'] = clusters

In [11]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.5079507583610615


In [12]:
#Cluster1 with Hierarchial clustering Complete Linkage

In [13]:
clustering = linkage(x_scaled,method="complete",metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[1 1 1 ... 1 1 1]


In [14]:
data['clusters'] = clusters

In [15]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.6629990546490399


In [16]:
#Cluster1 with K-Means

In [17]:
clustering = KMeans(n_clusters = 2, init = 'random', n_init = 1, random_state = 0).fit(x_scaled)
clusters = clustering.labels_
print(clusters)
data['clusters'] = clusters

[0 1 0 ... 1 1 1]


In [18]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.18301116784616245


In [19]:
#Cluster1 with DBSCAN

In [20]:
clustering = DBSCAN(eps = 2, min_samples = 4, metric = "euclidean").fit(x_scaled) 
clusters = clustering.labels_
data['clusters'] = clusters

In [21]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

-0.1898724108691829


In [22]:
# print(metrics.adjusted_rand_score(data['clusters']))
# We cannot use rand index cause we do not have anything to compare the clusters against

In [23]:
#Cluster2: Using only Non-illegal drugs

In [24]:
variables = Non_illegal
var_indices = [data.columns.get_loc(variable) for variable in variables]

In [25]:
#Standardizing the data
x = data.iloc[:,var_indices]
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [26]:
clustering = linkage(x_scaled,method="single",metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[0 0 0 ... 0 0 0]


In [27]:
data['clusters'] = clusters

In [28]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.8233760416358061


In [29]:
#Cluster2 with complete Linkage

In [30]:
clustering = linkage(x_scaled,method="complete",metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[1 1 1 ... 1 1 1]


In [31]:
data['clusters'] = clusters

In [32]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.7931811284335685


In [33]:
#Cluster2 with K-Means

In [34]:
clustering = KMeans(n_clusters = 2, init = 'random', n_init = 1, random_state = 0).fit(x_scaled)
clusters = clustering.labels_
print(clusters)
data['clusters'] = clusters

[0 1 0 ... 1 1 1]


In [35]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.22503062682481748


In [36]:
#Cluster2 with DBSCAN

In [37]:
clustering = DBSCAN(eps = 2, min_samples = 3, metric = "euclidean").fit(x_scaled)
clusters = clustering.labels_
data['clusters'] = clusters

In [38]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.09401786652049447


In [39]:
#Cluster3: Using only Illegal drugs

In [40]:
variables = illegal_drugs
var_indices = [data.columns.get_loc(variable) for variable in variables]
print(var_indices)

[14, 20, 21, 22, 23, 26, 28]


In [41]:
#Standardizing the data
x = data.iloc[:,var_indices]
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [42]:
clustering = linkage(x_scaled,method="single",metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[0 0 0 ... 0 0 0]


In [43]:
data['clusters'] = clusters

In [44]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.6082079307311252


In [45]:
#Cluster3 using Hierarchial clustering Complete Linkage

In [46]:
clustering = linkage(x_scaled,method="complete",metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[1 1 1 ... 1 1 1]


In [47]:
data['clusters'] = clusters

In [48]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.5415889060633275


In [49]:
#Cluster3 with K-Means

In [50]:
clustering = KMeans(n_clusters = 2, init = 'random', n_init = 1, random_state = 0).fit(x_scaled)
clusters = clustering.labels_
print(clusters)
data['clusters'] = clusters

[0 1 0 ... 1 1 1]


In [51]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.4487703444541595


In [52]:
#Cluster3 with DBSCAN

In [53]:
clustering = DBSCAN(eps = 2, min_samples = 3, metric = "euclidean").fit(x_scaled)
clusters = clustering.labels_
data['clusters'] = clusters

In [54]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.5160039017438653


In [55]:
#cluster4: Using only the drugs

In [56]:
variables = illegal_drugs + Non_illegal
var_indices = [data.columns.get_loc(variable) for variable in variables]
print(var_indices)

[14, 20, 21, 22, 23, 26, 28, 13, 15, 16, 17, 18, 19, 24, 25, 27, 29, 30, 31]


In [57]:
#Standardizing the data
x = data.iloc[:,var_indices]
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [58]:
clustering = linkage(x_scaled,method="single",metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[0 0 0 ... 0 0 0]


In [59]:
data['clusters'] = clusters

In [60]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.7799966615627284


In [61]:
#Cluster4 using Hierarchial clustering Complete Linkage

In [62]:
clustering = linkage(x_scaled,method="complete",metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[1 1 1 ... 1 1 1]


In [63]:
data['clusters'] = clusters

In [64]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.7432570016576694


In [65]:
#Cluster4 with K-Means

In [66]:
clustering = KMeans(n_clusters = 2, init = 'random', n_init = 1, random_state = 0).fit(x_scaled)
clusters = clustering.labels_
print(clusters)
data['clusters'] = clusters

[0 1 0 ... 1 1 1]


In [67]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.2684663279061382


In [68]:
#Cluster4 with DBSCAN

In [69]:
clustering = DBSCAN(eps = 2, min_samples = 3, metric = "euclidean").fit(x_scaled)
clusters = clustering.labels_
data['clusters'] = clusters

In [70]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

-0.06633855889724637


In [71]:
#Cluster5: Not using any drugs

In [72]:
variables = data.columns[0:13]
var_indices = [data.columns.get_loc(variable) for variable in variables]
print(var_indices)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [73]:
#Standardizing the data
x = data.iloc[:,var_indices]
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [74]:
#Cluster5 with Hierarchial clustering with Single Linkage

In [75]:
clustering = linkage(x_scaled,method='single',metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[0 0 0 ... 0 0 0]


In [76]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

-0.21114670320992815


In [77]:
#Cluster5 with Hierarchial Clustering with Complete Linkage

In [78]:
clustering = linkage(x_scaled,method='complete',metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[1 1 1 ... 1 1 1]


In [79]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

-0.21114670320992815


In [80]:
#Cluster 5 with K-Means

In [81]:
clustering = KMeans(n_clusters = 2, init = 'random', n_init = 1, random_state = 0).fit(x_scaled)
clusters = clustering.labels_
print(clusters)
data['clusters'] = clusters

[0 0 0 ... 1 1 1]


In [82]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.15804112591503822


In [83]:
#Cluster5 with DBSCAN

In [84]:
clustering = DBSCAN(eps = 2, min_samples = 3, metric = "euclidean").fit(x_scaled)
clusters = clustering.labels_
data['clusters'] = clusters

In [85]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

-0.11510965619210076


In [86]:
#Cluster6: Using Personality traits

In [87]:
variables = data.columns[6:13]
var_indices = [data.columns.get_loc(variable) for variable in variables]
print(var_indices)

[6, 7, 8, 9, 10, 11, 12]


In [88]:
#Standardizing the data
x = data.iloc[:,var_indices]
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [89]:
#Cluster6 with Hierarchial clustering with Single Linkage

In [90]:
clustering = linkage(x_scaled,method="single",metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[0 0 0 ... 0 0 0]


In [91]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

-0.32290072471567954


In [92]:
#Cluster6 with Hierarchial Clustering with Complete Linkage

In [93]:
clustering = linkage(x_scaled,method='complete',metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[1 1 0 ... 0 0 1]


In [94]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

-0.32290072471567954


In [95]:
#Cluster6 with K-Means

In [96]:
clustering = KMeans(n_clusters = 2, init = 'random', n_init = 1, random_state = 0).fit(x_scaled)
clusters = clustering.labels_
print(clusters)
data['clusters'] = clusters

[1 1 1 ... 0 0 0]


In [97]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.1898276389135005


In [98]:
#Cluster6 with DBSCAN

In [99]:
clustering = DBSCAN(eps = 2, min_samples = 3, metric = "euclidean").fit(x_scaled)
clusters = clustering.labels_
data['clusters'] = clusters

In [100]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.3122595933194069


In [101]:
#Cluster7: Using Personality traits and Non_illegal drugs

In [102]:
variables = Non_illegal
var_indices = [data.columns.get_loc(variable) for variable in variables] + [6,7,8,9,10,11,12]
print(var_indices)

[13, 15, 16, 17, 18, 19, 24, 25, 27, 29, 30, 31, 6, 7, 8, 9, 10, 11, 12]


In [103]:
#Standardizing the data
x = data.iloc[:,var_indices]
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [104]:
#Cluster7 with Hierarchial Clustering with Single Linkage

In [105]:
clustering = linkage(x_scaled,method="single",metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[0 0 0 ... 0 0 0]


In [106]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.2200681697633267


In [107]:
#Cluster7 with Hierarchial Clustering with Complete Linkage

In [108]:
clustering = linkage(x_scaled,method='complete',metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[1 1 1 ... 1 1 1]


In [109]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.2200681697633267


In [110]:
#Cluster7 with K-Means

In [111]:
clustering = KMeans(n_clusters = 2, init = 'random', n_init = 1, random_state = 0).fit(x_scaled)
clusters = clustering.labels_
print(clusters)
data['clusters'] = clusters

[0 1 0 ... 1 1 1]


In [112]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.1663782954936895


In [113]:
#Cluster7 with DBSCAN

In [114]:
clustering = DBSCAN(eps = 2, min_samples = 3, metric = "euclidean").fit(x_scaled)
clusters = clustering.labels_
data['clusters'] = clusters

In [115]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

-0.211881563006982


In [116]:
#Cluster8: Using Personality traits and Illegal drugs

In [117]:
variables = illegal_drugs
var_indices = [data.columns.get_loc(variable) for variable in variables] + [6,7,8,9,10,11,12]
print(var_indices)

[14, 20, 21, 22, 23, 26, 28, 6, 7, 8, 9, 10, 11, 12]


In [118]:
#Standardizing the data
x = data.iloc[:,var_indices]
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [119]:
#Cluster8 with Hierarchial Clustering with Single Linkage

In [120]:
clustering = linkage(x_scaled,method="single",metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[0 0 0 ... 0 0 0]


In [121]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

-0.2673306540216515


In [122]:
#Cluster8 with Hierarchial Clustering with Complete Linkage

In [123]:
clustering = linkage(x_scaled,method='complete',metric="euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
clusters = clusters - 1
print(clusters)

[1 1 1 ... 1 1 1]


In [124]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

-0.2673306540216515


In [125]:
#Cluster8 with K-Means

In [126]:
clustering = KMeans(n_clusters = 2, init = 'random', n_init = 1, random_state = 0).fit(x_scaled)
clusters = clustering.labels_
print(clusters)
data['clusters'] = clusters

[0 1 0 ... 1 1 1]


In [127]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

0.2401118477537548


In [128]:
#Cluster8 with DBSCAN

In [129]:
clustering = DBSCAN(eps = 2, min_samples = 3, metric = "euclidean").fit(x_scaled)
clusters = clustering.labels_
data['clusters'] = clusters

In [130]:
#Silhouette coefficient
print(metrics.silhouette_score(x_scaled, data['clusters'], metric = "euclidean"))

-0.11209916794288363
