### 1. Py. SP: Import Library 

In [1]:
# Import Library
# Python
import random
import os
import numpy as np
import pandas as pd # for data manipulation
import matplotlib.pyplot as plt # for graph

# SPARK
import pyspark
import findspark # to find location where spark installed
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

### 2. Py.SP: Make Session

In [2]:
# SPARK: Make Session
sc = SparkContext()
spark = SparkSession(sc)

In [3]:
# Set Target Animal
target_animal = 'alligator'

### 3. Py.SP: Make LBP Features From Images

In [4]:
# SPARK: read image files in directory and make it to dataframe
img_dir = "D:\\Data\\AnimalsOnTheWeb\\" + target_animal
imgs = spark.read.format("image").load(img_dir)
imgs.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



In [5]:
# Python: Make Features from Images and save it to CSV file
#!python lbp.py

In [6]:
# SPARK: Read Feature CSV file and make DataFrame
import pyspark.sql.types as typ
res_lbp = spark.read.csv('Res_LBP.csv',header=True)
labels =[
    ('ind',typ.IntegerType()), # index
    ('Animal',typ.StringType()), # Class of animals
    ('File',typ.StringType()), # filename
    ('ID',typ.StringType()), # picture ID
    ('LBP0',typ.FloatType()), # LBP features
    ('LBP1',typ.FloatType()),
    ('LBP2',typ.FloatType()),
    ('LBP3',typ.FloatType()),
    ('LBP4',typ.FloatType()),
    ('LBP5',typ.FloatType()),
    ('LBP6',typ.FloatType()),
    ('LBP7',typ.FloatType()),
    ('LBP8',typ.FloatType()),
    ('LBP9',typ.FloatType()),
]
# Define Schema
schema = typ.StructType([
    typ.StructField(e[0],e[1],False) for e in labels
])

# CSV read
res_lbp = spark.read.csv('Res_LBP.csv',header=True,schema=schema)
# Select Target Animal
target_lbp = res_lbp.where(res_lbp.Animal.isin(target_animal))
target_lbp.printSchema()

root
 |-- ind: integer (nullable = true)
 |-- Animal: string (nullable = true)
 |-- File: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- LBP0: float (nullable = true)
 |-- LBP1: float (nullable = true)
 |-- LBP2: float (nullable = true)
 |-- LBP3: float (nullable = true)
 |-- LBP4: float (nullable = true)
 |-- LBP5: float (nullable = true)
 |-- LBP6: float (nullable = true)
 |-- LBP7: float (nullable = true)
 |-- LBP8: float (nullable = true)
 |-- LBP9: float (nullable = true)



In [7]:
target_lbp.head() #show 1st row

Row(ind=0, Animal='alligator', File='pic1000_Mayart5a.gif', ID='pic1000', LBP0=0.05400621145963669, LBP1=0.09041149169206619, LBP2=0.02454192563891411, LBP3=0.03751552850008011, LBP4=0.05066770315170288, LBP5=0.0609627328813076, LBP6=0.03319099545478821, LBP7=0.0975077673792839, LBP8=0.19192546606063843, LBP9=0.3592701852321625)

In [8]:
target_lbp.show() # show 20 row

+---+---------+--------------------+-------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
|ind|   Animal|                File|     ID|       LBP0|       LBP1|       LBP2|       LBP3|       LBP4|       LBP5|       LBP6|       LBP7|       LBP8|       LBP9|
+---+---------+--------------------+-------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
|  0|alligator|pic1000_Mayart5a.gif|pic1000| 0.05400621| 0.09041149|0.024541926| 0.03751553|0.050667703|0.060962733|0.033190995| 0.09750777| 0.19192547|  0.3592702|
|  1|alligator|pic1001_Mayart5f.gif|pic1001|0.110507585|0.095817834|0.031439807|0.032135498| 0.03360715|0.071495466|0.038075615| 0.08607818| 0.22409226|  0.2767506|
|  2|alligator|pic1002_cajun.14.jpg|pic1002| 0.04896794|0.049928635| 0.02091568|0.035518225| 0.04894049|0.103672594|0.030769654|  0.0590964| 0.45443565| 0.14775471|
|  3|allig

### 4. Py.SP: Make Ground Truth

In [9]:
# Python: Read .mat file
import scipy.io as sio # Library for .mat files
import re # Library for Regular Expression
file_path = 'D:\\Data\\AnimalsOnTheWeb\\' + target_animal + '\\'
file = 'animaldata_'+ target_animal + '.mat'
# Read from .mat files
data_read = sio.loadmat(os.path.join(file_path,file))

# truth table (1 or 0)
truth_tbl = list(data_read['gt'][0]) 

# get picture ID and save it to 'name' column
truth_nameread = list(data_read['imgnames'][0])
truth_name = [t[0] for t in truth_nameread]
truth_lists = pd.DataFrame({'name': truth_name,'truth': truth_tbl})
truth_lists['name'] = truth_lists['name'].astype('str')
re_picid = re.compile('pic\d+')
truth_lists['ID'] = [re_picid.findall(r)[0] for r in truth_lists['name']]
truth_lists.head()

Unnamed: 0,name,truth,ID
0,alligator/pic1_low_left.jpg,0,pic1
1,alligator/pic2_low_right.jpg,0,pic2
2,alligator/pic3_csm-amis.gif,0,pic3
3,alligator/pic4_csm-asin.gif,0,pic4
4,alligator/pic5_greenal.gif,0,pic5


In [10]:
# SPARK: convert pandas DF to Spark DF
df_truth = spark.createDataFrame(truth_lists)
df_truth.printSchema()

root
 |-- name: string (nullable = true)
 |-- truth: long (nullable = true)
 |-- ID: string (nullable = true)



In [11]:
# Cast Truth column to integer
df_truth = df_truth.withColumn('truth_int',df_truth['truth'].cast(typ.IntegerType()))
df_truth.printSchema()

root
 |-- name: string (nullable = true)
 |-- truth: long (nullable = true)
 |-- ID: string (nullable = true)
 |-- truth_int: integer (nullable = true)



In [12]:
#show 5 row
df_truth.show(5)

+--------------------+-----+----+---------+
|                name|truth|  ID|truth_int|
+--------------------+-----+----+---------+
|alligator/pic1_lo...|    0|pic1|        0|
|alligator/pic2_lo...|    0|pic2|        0|
|alligator/pic3_cs...|    0|pic3|        0|
|alligator/pic4_cs...|    0|pic4|        0|
|alligator/pic5_gr...|    0|pic5|        0|
+--------------------+-----+----+---------+
only showing top 5 rows



### 5. SP: join  features and Grd Truth dataframe

In [13]:
df_ml = df_truth.join(target_lbp,on='ID')
df_ml.head(5)

[Row(ID='pic1', name='alligator/pic1_low_left.jpg', truth=0, truth_int=0, ind=556, Animal='alligator', File='pic1_low_left.jpg', LBP0=0.08529064059257507, LBP1=0.0923176109790802, LBP2=0.034361232072114944, LBP3=0.07235414534807205, LBP4=0.1361340880393982, LBP5=0.08859997987747192, LBP6=0.032899968326091766, LBP7=0.09792628884315491, LBP8=0.12302567809820175, LBP9=0.2370903640985489),
 Row(ID='pic2', name='alligator/pic2_low_right.jpg', truth=0, truth_int=0, ind=667, Animal='alligator', File='pic2_low_right.jpg', LBP0=0.09944067895412445, LBP1=0.09836162626743317, LBP2=0.04460723698139191, LBP3=0.06056526303291321, LBP4=0.09713409096002579, LBP5=0.0673365369439125, LBP6=0.044231053441762924, LBP7=0.10398455709218979, LBP8=0.11735881119966507, LBP9=0.2669801414012909),
 Row(ID='pic3', name='alligator/pic3_csm-amis.gif', truth=0, truth_int=0, ind=778, Animal='alligator', File='pic3_csm-amis.gif', LBP0=0.01336000021547079, LBP1=0.02879999950528145, LBP2=0.012240000069141388, LBP3=0.03235

In [14]:
# Select columns from dataframe
df_ml1 = df_ml.select([c for c in df_ml.columns if c in ['truth_int','LBP0','LBP1','LBP2','LBP3','LBP4','LBP5','LBP6','LBP7','LBP8','LBP9']])
df_ml1.show(5)

+---------+-----------+-----------+-----------+-----------+----------+----------+-----------+-----------+----------+----------+
|truth_int|       LBP0|       LBP1|       LBP2|       LBP3|      LBP4|      LBP5|       LBP6|       LBP7|      LBP8|      LBP9|
+---------+-----------+-----------+-----------+-----------+----------+----------+-----------+-----------+----------+----------+
|        0| 0.08529064| 0.09231761|0.034361232|0.072354145|0.13613409|0.08859998| 0.03289997| 0.09792629|0.12302568|0.23709036|
|        0| 0.09944068|0.098361626|0.044607237|0.060565263|0.09713409|0.06733654|0.044231053| 0.10398456|0.11735881|0.26698014|
|        0|    0.01336|     0.0288|    0.01224|    0.03236|   0.03372|   0.15308|    0.03644|    0.09984|     0.395|   0.19516|
|        0|    0.01216|    0.02852|    0.01032|    0.02904|    0.0306|   0.16036|    0.02836|    0.08796|   0.43788|    0.1748|
|        0|0.005221018|0.011943286|0.006880734|0.028348625|0.08780651|0.12346956|0.019332778|0.035696413

### 6. SP: Machine Learning

##### 6.1. Feature Creator

In [15]:
# make Feature column
import pyspark.ml.feature as ft
labels_feat =[
    ('LBP0',typ.FloatType()),
    ('LBP1',typ.FloatType()),
    ('LBP2',typ.FloatType()),
    ('LBP3',typ.FloatType()),
    ('LBP4',typ.FloatType()),
    ('LBP5',typ.FloatType()),
    ('LBP6',typ.FloatType()),
    ('LBP7',typ.FloatType()),
    ('LBP8',typ.FloatType()),
    ('LBP9',typ.FloatType()),
]
featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels_feat[0:]],outputCol='features'
)

##### 6.2. Make Classification Model

In [16]:
# make model
import pyspark.ml.classification as cl
logistic = cl.LogisticRegression(maxIter=10,regParam=0.01,labelCol='truth_int')

##### 6.3. Pipeline

In [17]:
# make pipeline
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[featuresCreator,logistic])

In [18]:
# Separate training and test data
lbp_train, lbp_test = df_ml1.randomSplit([0.7,0.3],seed=100)
# Train model
model = pipeline.fit(lbp_train)
# Test
test_model = model.transform(lbp_test) # get results on test dataset
test_model.take(1)

[Row(truth_int=0, LBP0=1.4100000043981709e-05, LBP1=0.0004844740033149719, LBP2=0.0002408639993518591, LBP3=0.004776970017701387, LBP4=0.007476892787963152, LBP5=0.032398831099271774, LBP6=0.007372186053544283, LBP7=0.014115380123257637, LBP8=0.9286929965019226, LBP9=0.004427279811352491, features=DenseVector([0.0, 0.0005, 0.0002, 0.0048, 0.0075, 0.0324, 0.0074, 0.0141, 0.9287, 0.0044]), rawPrediction=DenseVector([2.4751, -2.4751]), probability=DenseVector([0.9224, 0.0776]), prediction=0.0)]

### 7. SP: Evaluation

In [19]:
# Evaluation
import pyspark.ml.evaluation as ev
evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability',labelCol='truth_int')
print('Area Under ROC: ' + str(evaluator.evaluate(test_model, {evaluator.metricName:'areaUnderROC'})))

Area Under ROC: 0.8395751991254099


### 8. SP: K-means Clustering

In [20]:
# K-Means clustering

In [21]:
import pyspark.ml.clustering as clus
kmeans = clus.KMeans(k=10, featuresCol='features')

In [22]:
# make pipeline
pipeline = Pipeline(stages=[featuresCreator,kmeans])

In [23]:
# Get All class data
df_km = res_lbp.select([c for c in df_ml.columns if c in ['LBP0','LBP1','LBP2','LBP3','LBP4','LBP5','LBP6','LBP7','LBP8','LBP9']])
df_km.show(5)

+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+----------+-----------+
|       LBP0|       LBP1|       LBP2|       LBP3|       LBP4|       LBP5|       LBP6|       LBP7|      LBP8|       LBP9|
+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+----------+-----------+
| 0.05400621| 0.09041149|0.024541926| 0.03751553|0.050667703|0.060962733|0.033190995| 0.09750777|0.19192547|  0.3592702|
|0.110507585|0.095817834|0.031439807|0.032135498| 0.03360715|0.071495466|0.038075615| 0.08607818|0.22409226|  0.2767506|
| 0.04896794|0.049928635| 0.02091568|0.035518225| 0.04894049|0.103672594|0.030769654|  0.0590964|0.45443565| 0.14775471|
|   0.012288|    0.02336|   0.011776|   0.029568|    0.04608|   0.121856|   0.031424|   0.062464|  0.469376|   0.191808|
|0.004452055|0.018949771|0.007191781|0.040810503| 0.07351598| 0.18710046|0.022488585|0.028253425|0.54423517|0.073002286|
+-----------+-----------+-------

In [24]:
# Separate training and test data
km_train, km_test = df_km.randomSplit([0.7,0.3],seed=100)

In [25]:
model_km = pipeline.fit(km_train)
test_km = model_km.transform(km_test)

In [26]:
# Show Results
test_km.groupBy('prediction').agg({'*':'count','LBP0':'avg','LBP1':'avg'}).collect()

[Row(prediction=1, avg(LBP1)=0.005757329775375201, avg(LBP0)=0.003384214574995974, count(1)=206),
 Row(prediction=6, avg(LBP1)=0.06796377788520577, avg(LBP0)=0.051969669750504016, count(1)=531),
 Row(prediction=3, avg(LBP1)=0.0, avg(LBP0)=0.0, count(1)=92),
 Row(prediction=5, avg(LBP1)=0.028686339081703895, avg(LBP0)=0.01982155360940119, count(1)=328),
 Row(prediction=9, avg(LBP1)=0.04732494459435274, avg(LBP0)=0.035339908024555104, count(1)=428),
 Row(prediction=4, avg(LBP1)=0.016077543828599813, avg(LBP0)=0.011307594850847152, count(1)=303),
 Row(prediction=8, avg(LBP1)=0.07297128103308784, avg(LBP0)=0.060812290055194805, count(1)=332),
 Row(prediction=7, avg(LBP1)=0.10414538718692722, avg(LBP0)=0.10344831760692803, count(1)=692),
 Row(prediction=2, avg(LBP1)=0.05497169015126307, avg(LBP0)=0.04073618994007429, count(1)=471),
 Row(prediction=0, avg(LBP1)=0.08898658450206985, avg(LBP0)=0.0775589849119601, count(1)=1150)]