In [2]:
import os
import sys

os.chdir("C:/dataanalytics/python")
os.curdir

#Configure the environment . Set this up to the directory where spark is installed
if 'SPARK_HOME' not in os.environ:
    os.environ['SPARK_HOME'] = 'C:\\spark'
    
#create a variable for our root path
SPARK_HOME = os.environ['SPARK_HOME']

#Add the following paths to the system path. Please check your installation
#to make sure that these zip files actually exists. The names might change as
#versions change
sys.path.insert(0,os.path.join(SPARK_HOME,"python"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","py4j-0.10.6-src.zip"))
sys.path.insert(0,os.path.join(SPARK_HOME,"python","lib","pyspark.zip"))
 
#Initialize a spark context
from pyspark import SparkContext
from pyspark import SparkConf

#optionally configure spark
conf = (SparkConf().setAppName("V2Maestros").setMaster("local[2]").set("spark.executor.memory", "1g"))

#Initalize spark context onl runs once
sc = SparkContext(conf=conf)

In [3]:
from pyspark.sql import DataFrame,SparkSession,Row

In [4]:
spark = SparkSession.builder.appName("kmin").master("local").config(conf=conf).getOrCreate()

In [5]:
from pyspark.ml.clustering import KMeans

In [6]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [7]:
dataset = spark.read.format('libsvm').load(r'C:\spark\data\mllib\sample_kmeans_data.txt')

In [8]:
dataset.collect()

[Row(label=0.0, features=SparseVector(3, {})),
 Row(label=1.0, features=SparseVector(3, {0: 0.1, 1: 0.1, 2: 0.1})),
 Row(label=2.0, features=SparseVector(3, {0: 0.2, 1: 0.2, 2: 0.2})),
 Row(label=3.0, features=SparseVector(3, {0: 9.0, 1: 9.0, 2: 9.0})),
 Row(label=4.0, features=SparseVector(3, {0: 9.1, 1: 9.1, 2: 9.1})),
 Row(label=5.0, features=SparseVector(3, {0: 9.2, 1: 9.2, 2: 9.2}))]

In [9]:
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(dataset)

In [12]:
#make the predictions
predicions = model.transform(dataset)

In [13]:
#evaluaing clusering by computing silhoutte score
evaluator = ClusteringEvaluator()

In [14]:
silhoutte = evaluator.evaluate(predicions)

In [16]:
print("silhoutte with squared euclidean distance" + str(silhoutte))

silhoutte with squared euclidean distance0.9997530305375207


In [17]:
#show the result
centers = model.clusterCenters()
print("cluster centers")
for center in centers:
    print(center)

cluster centers
[0.1 0.1 0.1]
[9.1 9.1 9.1]


In [18]:
#latent dirichlet aloaction (LDA)
from pyspark.ml.clustering import LDA

In [21]:
dataset1 = spark.read.format('libsvm').load(r'C:\spark\data\mllib\sample_lda_libsvm_data.txt')

In [22]:
#train a LDA model
lda = LDA(k=10,maxIter=10)
model = lda.fit(dataset1)

In [23]:
ll = model.logLikelihood(dataset1)
lp = model.logPerplexity(dataset)
print("The lower bound on loglikkelihood of entire model" + str(ll))
print("The upper bound onperpelexity" + str(lp))

The lower bound on loglikkelihood of entire model-799.9808070058632
The upper bound onperpelexity3.0768495179280353


In [24]:
#describing the topics
topics = model.describeTopics(3)
print("The topics describedby their weighted terms:")
topics.show(truncate = False)

The topics describedby their weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[9, 6, 3]  |[0.10230548588903822, 0.09793263362531258, 0.09638812273790025]|
|1    |[1, 2, 8]  |[0.10532477491044324, 0.09935871961455559, 0.09751335516923063]|
|2    |[2, 6, 1]  |[0.1141646090412642, 0.10062367357233215, 0.09736322073742838] |
|3    |[7, 10, 6] |[0.10519470001763491, 0.09771704205895608, 0.09622747973468669]|
|4    |[3, 10, 4] |[0.2803230414835252, 0.11595716772210499, 0.09795837982532582] |
|5    |[6, 2, 0]  |[0.10202968032702753, 0.09900473295912898, 0.09653851857600083]|
|6    |[2, 3, 5]  |[0.09752632253653487, 0.09695412216736887, 0.09639748122128558]|
|7    |[8, 0, 4]  |[0.1010520731252187, 0.09685896522925336, 0.09379522771597208] |
|8    |[9, 4, 0]  |[0.187912438

In [25]:
#show the result
transformed =model.transform(dataset)
transformed.show(truncate=False)

+-----+---------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                       |topicDistribution                                                                                                                                                                                                     |
+-----+---------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(11,[0,1,2,4,5,6,7,10],[1.0,2.0,6.0,2.0,3.0,1.0,1.0,3.0])      |[0.004783113277647845,0.004783145866161664,0.004783163765810139,0.00

In [26]:
#Bisecting means
from pyspark.ml.clustering import BisectingKMeans

In [27]:
#Loads data.
dataset2 = spark.read.format("libsvm").load(r'C:\spark\data\mllib\sample_kmeans_data.txt')

In [28]:
# Trains a bisecting k-means model.
bkm = BisectingKMeans().setK(2).setSeed(1)
model = bkm.fit(dataset)

In [29]:
# Evaluate clustering.
cost = model.computeCost(dataset)
print("Within Set Sum of Squared Errors = " + str(cost))

Within Set Sum of Squared Errors = 432.1250000000001


In [30]:
# Shows the result.
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

Cluster Centers: 
[2.375 1.875 1.125 4.25  2.75  2.25  1.25  1.25  0.125 1.25  1.625]
[1.75 3.5  0.75 1.5  0.75 1.   5.25 0.   1.75 3.5  5.  ]


In [31]:
#Gaussian mixture model
from pyspark.ml.clustering import GaussianMixture

In [32]:
dataset3 = spark.read.format("libsvm").load(r'C:\spark\data\mllib\sample_kmeans_data.txt')

gmm = GaussianMixture().setK(2).setSeed(538009335)
model = gmm.fit(dataset)

print("Gaussians shown as a DataFrame: ")
model.gaussiansDF.show(truncate=False)

Gaussians shown as a DataFrame: 
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------