In [1]:
# Python Part

In [2]:
# python library
import random
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
# Target Animal Class
target_animal = 'monkey'

In [4]:
# read ground truth file
import scipy.io as sio # Library for .mat files
import re # Library for Regular Expression
file_path = 'D:\\Data\\AnimalsOnTheWeb\\' + target_animal + '\\'
file = 'animaldata_'+ target_animal + '.mat'
# Read from .mat files
data_read = sio.loadmat(os.path.join(file_path,file))

# truth table (1 or 0)
truth_tbl = list(data_read['gt'][0]) 

# get picture ID and save it to 'name' column
truth_nameread = list(data_read['imgnames'][0])
truth_name = [t[0] for t in truth_nameread]
truth_lists = pd.DataFrame({'name': truth_name,'truth': truth_tbl})
truth_lists['name'] = truth_lists['name'].astype('str')
re_picid = re.compile('pic\d+')
truth_lists['ID'] = [re_picid.findall(r)[0] for r in truth_lists['name']]
truth_lists.head()

Unnamed: 0,name,truth,ID
0,monkey/pic1_monkey_white2.gif,0,pic1
1,monkey/pic2_logotrans200.gif,0,pic2
2,monkey/pic3_DesignSurvey.gif,0,pic3
3,monkey/pic4_Popup.gif,0,pic4
4,monkey/pic5_Results.gif,0,pic5


In [5]:
# make LBP Features

In [6]:
#...

In [7]:
# Spark part

In [8]:
# Import library
import pyspark
import findspark
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

In [9]:
#Make Session
sc = SparkContext()
spark = SparkSession(sc)

In [10]:
# read image files in directory and make it to dataframe
img_dir = "D:\\Data\\AnimalsOnTheWeb\\" + target_animal
imgs = spark.read.format("image").load(img_dir)
imgs.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



In [11]:
# read feature files

In [12]:
import pyspark.sql.types as typ
res_lbp = spark.read.csv('Res_LBP.csv',header=True)
labels =[
    ('ind',typ.IntegerType()), # index
    ('Animal',typ.StringType()), # Class of animals
    ('File',typ.StringType()), # filename
    ('ID',typ.StringType()), # picture ID
    ('LBP0',typ.FloatType()), # LBP features
    ('LBP1',typ.FloatType()),
    ('LBP2',typ.FloatType()),
    ('LBP3',typ.FloatType()),
    ('LBP4',typ.FloatType()),
    ('LBP5',typ.FloatType()),
    ('LBP6',typ.FloatType()),
    ('LBP7',typ.FloatType()),
    ('LBP8',typ.FloatType()),
    ('LBP9',typ.FloatType()),
]
# Define Schema
schema = typ.StructType([
    typ.StructField(e[0],e[1],False) for e in labels
])

# CSV read
res_lbp = spark.read.csv('Res_LBP.csv',header=True,schema=schema)
res_lbp.printSchema()

root
 |-- ind: integer (nullable = true)
 |-- Animal: string (nullable = true)
 |-- File: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- LBP0: float (nullable = true)
 |-- LBP1: float (nullable = true)
 |-- LBP2: float (nullable = true)
 |-- LBP3: float (nullable = true)
 |-- LBP4: float (nullable = true)
 |-- LBP5: float (nullable = true)
 |-- LBP6: float (nullable = true)
 |-- LBP7: float (nullable = true)
 |-- LBP8: float (nullable = true)
 |-- LBP9: float (nullable = true)



In [13]:
# Select animal to classify
target_lbp = res_lbp.where(res_lbp.Animal.isin(target_animal)) # only alligator

In [14]:
# Pandas Dataframe to Spark Dataframe
df_truth = spark.createDataFrame(truth_lists)
df_truth.printSchema()

root
 |-- name: string (nullable = true)
 |-- truth: long (nullable = true)
 |-- ID: string (nullable = true)



In [15]:
#show 5 row
df_truth.head(5)

[Row(name='monkey/pic1_monkey_white2.gif', truth=0, ID='pic1'),
 Row(name='monkey/pic2_logotrans200.gif', truth=0, ID='pic2'),
 Row(name='monkey/pic3_DesignSurvey.gif', truth=0, ID='pic3'),
 Row(name='monkey/pic4_Popup.gif', truth=0, ID='pic4'),
 Row(name='monkey/pic5_Results.gif', truth=0, ID='pic5')]

In [16]:
# Cast Truth column to integer
df_truth = df_truth.withColumn('truth_int',df_truth['truth'].cast(typ.IntegerType()))
df_truth.printSchema()

root
 |-- name: string (nullable = true)
 |-- truth: long (nullable = true)
 |-- ID: string (nullable = true)
 |-- truth_int: integer (nullable = true)



In [17]:
# join tables

In [18]:
df_ml = df_truth.join(target_lbp,on='ID')
df_ml.printSchema()

root
 |-- ID: string (nullable = true)
 |-- name: string (nullable = true)
 |-- truth: long (nullable = true)
 |-- truth_int: integer (nullable = true)
 |-- ind: integer (nullable = true)
 |-- Animal: string (nullable = true)
 |-- File: string (nullable = true)
 |-- LBP0: float (nullable = true)
 |-- LBP1: float (nullable = true)
 |-- LBP2: float (nullable = true)
 |-- LBP3: float (nullable = true)
 |-- LBP4: float (nullable = true)
 |-- LBP5: float (nullable = true)
 |-- LBP6: float (nullable = true)
 |-- LBP7: float (nullable = true)
 |-- LBP8: float (nullable = true)
 |-- LBP9: float (nullable = true)



In [19]:
# Select Feature and result column

In [20]:
df_ml1 = df_ml.select([c for c in df_ml.columns if c in ['truth_int','LBP0','LBP1','LBP2','LBP3','LBP4','LBP5','LBP6','LBP7','LBP8','LBP9']])

In [21]:
df_ml1.printSchema()

root
 |-- truth_int: integer (nullable = true)
 |-- LBP0: float (nullable = true)
 |-- LBP1: float (nullable = true)
 |-- LBP2: float (nullable = true)
 |-- LBP3: float (nullable = true)
 |-- LBP4: float (nullable = true)
 |-- LBP5: float (nullable = true)
 |-- LBP6: float (nullable = true)
 |-- LBP7: float (nullable = true)
 |-- LBP8: float (nullable = true)
 |-- LBP9: float (nullable = true)



In [22]:
# make Feature column

In [23]:
import pyspark.ml.feature as ft

In [24]:
labels1 =[
    ('LBP0',typ.FloatType()),
    ('LBP1',typ.FloatType()),
    ('LBP2',typ.FloatType()),
    ('LBP3',typ.FloatType()),
    ('LBP4',typ.FloatType()),
    ('LBP5',typ.FloatType()),
    ('LBP6',typ.FloatType()),
    ('LBP7',typ.FloatType()),
    ('LBP8',typ.FloatType()),
    ('LBP9',typ.FloatType()),
]

In [25]:
featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels1[0:]],outputCol='features'
)

In [26]:
# make model

In [27]:
import pyspark.ml.classification as cl

In [28]:
logistic = cl.LogisticRegression(maxIter=10,regParam=0.01,labelCol='truth_int')

In [29]:
# make pipeline

In [30]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[featuresCreator,logistic])

In [31]:
# Separate training and test data
lbp_train, lbp_test = df_ml1.randomSplit([0.7,0.3],seed=100)

In [32]:
model = pipeline.fit(lbp_train) #train model

In [33]:
test_model = model.transform(lbp_test) # get results on test dataset
test_model.take(1)

[Row(truth_int=0, LBP0=0.0, LBP1=0.0, LBP2=0.0, LBP3=0.0, LBP4=0.0, LBP5=0.0, LBP6=0.0, LBP7=0.0, LBP8=0.0, LBP9=0.0, features=SparseVector(10, {}), rawPrediction=DenseVector([2.4983, -2.4983]), probability=DenseVector([0.924, 0.076]), prediction=0.0)]

In [34]:
# Evaluation

In [35]:
import pyspark.ml.evaluation as ev

In [36]:
evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability',labelCol='truth_int')
print('Area under ROC curve: ' + str(evaluator.evaluate(test_model, {evaluator.metricName:'areaUnderROC'})))

Area under ROC curve: 0.8494523973784067


In [37]:
# K-Means clustering

In [38]:
import pyspark.ml.clustering as clus
kmeans = clus.KMeans(k=10, featuresCol='features')

In [39]:
pipeline = Pipeline(stages=[featuresCreator,kmeans])

In [42]:
df_km = res_lbp.select([c for c in df_ml.columns if c in ['LBP0','LBP1','LBP2','LBP3','LBP4','LBP5','LBP6','LBP7','LBP8','LBP9']])

In [45]:
# Separate training and test data
km_train, km_test = df_km.randomSplit([0.7,0.3],seed=100)

In [47]:
model_km = pipeline.fit(km_train)

In [48]:
test_km = model_km.transform(km_test)

In [54]:
test_km.groupBy('prediction').agg({'*':'count','LBP0':'avg','LBP1':'avg'}).collect()

[Row(prediction=1, avg(LBP1)=0.006628613927448388, avg(LBP0)=0.004168090764472063, count(1)=258),
 Row(prediction=6, avg(LBP1)=0.05245462622073535, avg(LBP0)=0.03945028164768144, count(1)=479),
 Row(prediction=3, avg(LBP1)=0.06755132657004359, avg(LBP0)=0.05124026305691833, count(1)=162),
 Row(prediction=5, avg(LBP1)=0.019706812097173957, avg(LBP0)=0.0134566055527327, count(1)=405),
 Row(prediction=9, avg(LBP1)=0.06786785193954606, avg(LBP0)=0.05170888021909987, count(1)=498),
 Row(prediction=4, avg(LBP1)=0.10401775009862664, avg(LBP0)=0.10296872409029272, count(1)=773),
 Row(prediction=8, avg(LBP1)=0.0, avg(LBP0)=0.0, count(1)=92),
 Row(prediction=7, avg(LBP1)=0.08957870951917984, avg(LBP0)=0.07801360607926075, count(1)=1014),
 Row(prediction=2, avg(LBP1)=0.03819214777923752, avg(LBP0)=0.028137821773230865, count(1)=364),
 Row(prediction=0, avg(LBP1)=0.06482088577467948, avg(LBP0)=0.051403644548626004, count(1)=488)]