<a href="https://colab.research.google.com/github/Melvinmcrn/DataScience/blob/master/PySpark/5_Pyspark_Clustering_Pipeline_Cdr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
# Point the kernel at the JDK and at the Spark distribution unpacked above.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"
# findspark is initialised only after the environment variables are set;
# presumably init() relies on SPARK_HOME to make pyspark importable, so keep this order.
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Start (or reuse) a session that runs Spark locally with master "local[*]".
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Pyspark_Clustering_Pipeline_Cdr

In [0]:
#1 - import module
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler,MaxAbsScaler

In [0]:
#2 - Create SparkContext
# SparkContext is also imported in the "#1 - import module" cell; the import is
# repeated here so this cell can run on its own.
from pyspark import SparkContext

# getOrCreate() is used rather than the constructor; a session was already
# started in the setup cell, so this presumably returns that session's context.
sc = SparkContext.getOrCreate()

In [0]:
# Display the SparkContext obtained above as the cell's output.
sc

In [0]:
# List the effective Spark configuration as (key, value) pairs.
# Uses the public getConf() accessor (as the following cell does) instead of
# reaching into the private _conf attribute.
sc.getConf().getAll()

[(u'spark.app.id', u'local-1563876521128'),
 (u'spark.driver.host', u'4db6918c56a8'),
 (u'spark.rdd.compress', u'True'),
 (u'spark.serializer.objectStreamReset', u'100'),
 (u'spark.driver.port', u'41123'),
 (u'spark.master', u'local[*]'),
 (u'spark.executor.id', u'driver'),
 (u'spark.submit.deployMode', u'client'),
 (u'spark.ui.showConsoleProgress', u'true'),
 (u'spark.app.name', u'pyspark-shell')]

In [0]:
# Print the Spark configuration as "key=value" lines.
conf_dump = sc.getConf().toDebugString()
print(conf_dump)

spark.app.id=local-1563876521128
spark.app.name=pyspark-shell
spark.driver.host=4db6918c56a8
spark.driver.port=41123
spark.executor.id=driver
spark.master=local[*]
spark.rdd.compress=True
spark.serializer.objectStreamReset=100
spark.submit.deployMode=client
spark.ui.showConsoleProgress=true


In [0]:
#3 - Setup SparkSession(SparkSQL)
# Build the session in two steps: configure the builder, then get or create.
session_builder = SparkSession.builder.appName("Pyspark_Clustering_Pipeline_Cdr")
spark = session_builder.getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0x7f7a68334450>


In [0]:
#4 - Read file to spark DataFrame

# The first CSV row supplies the column names and column types are inferred.
data = spark.read.csv(
    "cdr_extractFeatures.csv",
    header=True,
    inferSchema=True,
)
# NOTE(review): Spark documents cache() as lazy — confirm the message below is
# not meant to imply the data is already materialised in memory.
data.cache()
print("finish caching data")

finish caching data


In [0]:
# Summary statistics (count / mean / stddev / min / max) rendered as a pandas table.
summary_stats = data.describe()
summary_stats.toPandas()

Unnamed: 0,summary,uniquePN,no_CallIn_Unique,no_CallOut_Unique,no_CallIn,no_CallOut,avg_CallIn_Length,avg_CallOut_Length,avg_Call_Length
0,count,501,501.0,501.0,501.0,501.0,501.0,501.0,501.0
1,mean,,19.56087824351297,19.56087824351297,19.960079840319363,19.960079840319363,305.5274154043913,306.4270003514968,305.8882365878245
2,stddev,,4.306133582129764,4.322821581692413,4.407993102718377,4.397999908323414,38.40930235550977,38.123520925514285,27.72859975460591
3,min,089-1000000,9.0,7.0,9.0,7.0,177.6315789,176.1666667,216.9090909
4,max,089-1000500,33.0,33.0,33.0,34.0,421.0714286,437.2857143,384.975


In [0]:
# Show the column names and the types inferred while reading the CSV.
data.printSchema()

root
 |-- uniquePN: string (nullable = true)
 |-- no_CallIn_Unique: integer (nullable = true)
 |-- no_CallOut_Unique: integer (nullable = true)
 |-- no_CallIn: integer (nullable = true)
 |-- no_CallOut: integer (nullable = true)
 |-- avg_CallIn_Length: double (nullable = true)
 |-- avg_CallOut_Length: double (nullable = true)
 |-- avg_Call_Length: double (nullable = true)



In [0]:
# Preview the first rows only: limit() bounds what is collected to the driver,
# instead of converting the entire DataFrame to pandas just to look at it.
data.limit(10).toPandas()

Unnamed: 0,uniquePN,no_CallIn_Unique,no_CallOut_Unique,no_CallIn,no_CallOut,avg_CallIn_Length,avg_CallOut_Length,avg_Call_Length
0,089-1000000,13,25,15,26,304.466667,241.692308,264.658537
1,089-1000001,12,19,12,20,271.083333,314.500000,298.218750
2,089-1000002,17,31,18,31,306.055556,263.032258,278.836735
3,089-1000003,16,14,16,14,300.250000,311.142857,305.333333
4,089-1000004,22,16,24,16,308.750000,306.187500,307.725000
5,089-1000005,26,19,26,20,320.115385,280.650000,302.956522
6,089-1000006,22,17,22,17,284.818182,246.941177,268.307692
7,089-1000007,22,18,22,20,280.681818,289.200000,284.738095
8,089-1000008,17,21,17,21,279.529412,363.666667,334.054054
9,089-1000009,15,23,15,23,352.800000,274.434783,305.368421


In [0]:
#5 - Choose the numeric columns used as clustering features
# (the uniquePN identifier column is deliberately left out)
column_name = [
    "no_CallIn_Unique",
    "no_CallOut_Unique",
    "no_CallIn",
    "no_CallOut",
    "avg_CallIn_Length",
    "avg_CallOut_Length",
    "avg_Call_Length",
]

In [0]:
#6 - Create Vector
# Pack the selected feature columns into one vector column, "temp_features".
assem = (VectorAssembler()
         .setInputCols(column_name)
         .setOutputCol("temp_features"))

print(assem)

VectorAssembler_1434726a9d65


In [0]:
#7 - Normalize
# Scale the assembled vector ("temp_features") into the "features" column.
scaler = (MaxAbsScaler()
          .setInputCol("temp_features")
          .setOutputCol("features"))

print(scaler)

MaxAbsScaler_795be4a57868


In [0]:
#8 - Create model
# K-means with 3 clusters; the seed is fixed at 50 so runs are repeatable.
kmeans = KMeans(k=3, seed=50)

In [0]:
#9 - Set ML pipeline
# Stages run in order: assemble the feature vector, scale it, then cluster.
all_process_list = [assem, scaler, kmeans]
# print() is called as a function so the cell also runs on Python 3
# (the bare `print process` statement form is Python 2 only).
for process in all_process_list:
    print(process)

pipeline = Pipeline(stages=all_process_list)
print(pipeline)

VectorAssembler_1434726a9d65
MaxAbsScaler_795be4a57868
KMeans_0f33196fbedd
Pipeline_8b5f84a4a3ab


In [0]:
#10 - Train model
# Fit every pipeline stage (assembler, scaler, k-means) on the loaded data;
# the fitted stages are read back later through model.stages.
model = pipeline.fit(data)

In [0]:
#11 - Make predictions
# Run the fitted pipeline over the same data, then keep only the scaled
# feature vector and the assigned cluster id.
scored = model.transform(data)
predictions = scored.select("features", "prediction")
predictions.cache()

DataFrame[features: vector, prediction: int]

In [0]:
# Print sample result
# Sample without replacement at fraction 0.3; the fixed seed keeps the sample repeatable.
predictions.sample(withReplacement=False, fraction=0.3, seed=1234).toPandas()

Unnamed: 0,features,prediction
0,"[0.6666666666666666, 0.5151515151515151, 0.666...",2
1,"[0.7878787878787878, 0.5757575757575758, 0.787...",2
2,"[0.48484848484848486, 0.6060606060606061, 0.48...",1
3,"[0.6060606060606061, 0.3939393939393939, 0.636...",2
4,"[0.45454545454545453, 0.6060606060606061, 0.45...",1
5,"[0.7272727272727273, 0.5757575757575758, 0.727...",2
6,"[0.5757575757575758, 0.7272727272727273, 0.575...",0
7,"[0.45454545454545453, 0.696969696969697, 0.484...",1
8,"[0.5757575757575758, 0.3939393939393939, 0.575...",2
9,"[0.7272727272727273, 0.7878787878787878, 0.727...",0


In [0]:
#12 Evaluate clustering by computing Within Set Sum of Squared Errors.
# The fitted k-means model is the last stage of the fitted pipeline.
# NOTE(review): computeCost is marked deprecated in the Spark 2.4 API docs —
# confirm a replacement before moving this notebook to a newer Spark.
kmean_model = model.stages[-1]
wssse = kmean_model.computeCost(predictions)
message = "Within Set Sum of Squared Errors = " + str(wssse)
print(message)

Within Set Sum of Squared Errors = 26.2711766949


In [0]:
#13 Shows Cluster's Center
# The centers are expressed in the scaled "features" space produced by the
# MaxAbsScaler stage, so each one is multiplied by the scaler's per-feature
# maximum absolute value to report it in the original units.
centers = kmean_model.clusterCenters()
scaler_model = model.stages[-2]
# Named max_abs rather than `max` so the builtin max() is not shadowed
# for the rest of the notebook.
max_abs = scaler_model.maxAbs
print("Cluster Centers: ")
for center in centers:
    print(center * max_abs)

Cluster Centers: 
[ 22.18238994  23.38993711  22.66666667  23.88050314 302.40377844
 302.09128663 301.85087841]
[ 14.83225806  20.07096774  15.05806452  20.42580645 305.27021807
 308.2657195  307.03127068]
[ 21.2513369   15.88235294  21.72192513  16.24064171 308.39652698
 308.58944428 308.37363587]
