In [1]:
import findspark
findspark.init()
import pyspark
import random
import os

In [2]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Reuse a context/session that is already running in this kernel instead of
# constructing a second one: a bare SparkContext() raises when this cell is
# executed again without restarting the kernel.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [3]:
# Folder with the alligator images (absolute, machine-specific Windows path).
img_dir = r"D:\Data\AnimalsOnTheWeb\alligator"

In [4]:
# Read the contents of img_dir through Spark's "image" data source.
imgs = spark.read.format("image").load(img_dir)

In [5]:
# Print a truncated preview of the loaded image rows.
imgs.show()

+--------------------+
|               image|
+--------------------+
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
|[file:///D:/Data/...|
+--------------------+
only showing top 20 rows



In [6]:
# Print the column layout of the image DataFrame.
imgs.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



In [7]:
import pyspark.sql.types as typ

In [8]:
# Column layout of Res_LBP.csv: four identifying columns followed by the ten
# feature columns LBP0..LBP9 (generated instead of being written out by hand).
# No schema-less read of the CSV is done here: the file is read with `schema`
# in the following cell, so an untyped read would only be thrown away.
labels = [
    ('ind', typ.IntegerType()),
    ('Animal', typ.StringType()),
    ('File', typ.StringType()),
    ('ID', typ.StringType()),
] + [('LBP{}'.format(i), typ.FloatType()) for i in range(10)]

# nullable=False is requested for every field, but printSchema() on the frame
# read with this schema still reports each column as nullable = true.
schema = typ.StructType([
    typ.StructField(name, dtype, False) for name, dtype in labels
])

In [9]:
# Read the feature table with the explicit schema; the first CSV line is the header.
res_lbp = spark.read.csv(
    'Res_LBP.csv',
    schema=schema,
    header=True,
)
res_lbp.printSchema()

root
 |-- ind: integer (nullable = true)
 |-- Animal: string (nullable = true)
 |-- File: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- LBP0: float (nullable = true)
 |-- LBP1: float (nullable = true)
 |-- LBP2: float (nullable = true)
 |-- LBP3: float (nullable = true)
 |-- LBP4: float (nullable = true)
 |-- LBP5: float (nullable = true)
 |-- LBP6: float (nullable = true)
 |-- LBP7: float (nullable = true)
 |-- LBP8: float (nullable = true)
 |-- LBP9: float (nullable = true)



In [10]:
# Keep only the feature rows whose Animal column equals 'alligator'.
res_lbp = res_lbp.where(res_lbp.Animal.isin('alligator'))

In [11]:
# Count the rows currently in res_lbp.
res_lbp.count()

1445

In [94]:
import scipy.io as sio
# Directory and file name of the MATLAB .mat file read below.
file_path = 'D:\\Data\\AnimalsOnTheWeb\\alligator\\'
file = 'animaldata_alligator.mat'

# Build the full path first, then load the file's contents.
mat_path = os.path.join(file_path, file)
data_read = sio.loadmat(mat_path)

In [95]:
import numpy as np
import pandas as pd
import re

In [96]:
#aa = truth_name

In [97]:
# Row 0 of the 'gt' entry, turned into a plain Python list.
truth_tbl = [value for value in data_read['gt'][0]]
# Row 0 of the 'imgnames' entry; each item is itself indexable, and its first
# element is taken as the image name.
truth_nameread = [cell for cell in data_read['imgnames'][0]]
truth_name = [cell[0] for cell in truth_nameread]

In [98]:
# One row per image: its name (forced to str) and its ground-truth value.
truth_lists = (
    pd.DataFrame({'name': truth_name, 'truth': truth_tbl})
    .astype({'name': 'str'})
)

In [99]:
# Raw string so that \d reaches the regex engine unchanged; in a normal string
# literal '\d' is an invalid escape sequence and triggers a warning.
re_picid = re.compile(r'pic\d+')

# The first "pic<number>" token of every file name becomes the ID column,
# which is the key used to join against the LBP feature table.
truth_lists['ID'] = [re_picid.findall(name)[0] for name in truth_lists['name']]

In [100]:
# Show the first rows to check the extracted ID column.
truth_lists.head()

Unnamed: 0,name,truth,ID
0,alligator/pic1_low_left.jpg,0,pic1
1,alligator/pic2_low_right.jpg,0,pic2
2,alligator/pic3_csm-amis.gif,0,pic3
3,alligator/pic4_csm-asin.gif,0,pic4
4,alligator/pic5_greenal.gif,0,pic5


In [101]:
# Convert the pandas ground-truth table into a Spark DataFrame (no explicit
# schema is passed), then print its schema and first five rows.
df_truth = spark.createDataFrame(truth_lists)
df_truth.printSchema()
df_truth.head(5)

root
 |-- name: string (nullable = true)
 |-- truth: long (nullable = true)
 |-- ID: string (nullable = true)



[Row(name='alligator/pic1_low_left.jpg', truth=0, ID='pic1'),
 Row(name='alligator/pic2_low_right.jpg', truth=0, ID='pic2'),
 Row(name='alligator/pic3_csm-amis.gif', truth=0, ID='pic3'),
 Row(name='alligator/pic4_csm-asin.gif', truth=0, ID='pic4'),
 Row(name='alligator/pic5_greenal.gif', truth=0, ID='pic5')]

In [102]:
#aa = df_truth.filter(df_truth.name.rlike('pic\d+_'))
#aa.head(15)

In [103]:
#split_col = pyspark.sql.functions.split(df_truth['name'], '\d')
#df_truth = df_truth.withColumn('NAME1', split_col.getItem(0))

In [106]:
# Attach the LBP features to the ground truth: inner join on the shared ID column.
df_ml = df_truth.join(res_lbp, on='ID', how='inner')
df_ml

DataFrame[ID: string, name: string, truth: bigint, ind: int, Animal: string, File: string, LBP0: float, LBP1: float, LBP2: float, LBP3: float, LBP4: float, LBP5: float, LBP6: float, LBP7: float, LBP8: float, LBP9: float]

In [120]:
# Inspect a single joined row.
df_ml.head(1)

[Row(ID='pic1', name='alligator/pic1_low_left.jpg', truth=0, ind=556, Animal='alligator', File='pic1_low_left.jpg', LBP0=0.08529064059257507, LBP1=0.0923176109790802, LBP2=0.034361232072114944, LBP3=0.07235414534807205, LBP4=0.1361340880393982, LBP5=0.08859997987747192, LBP6=0.032899968326091766, LBP7=0.09792628884315491, LBP8=0.12302567809820175, LBP9=0.2370903640985489)]

In [131]:
# Keep the label column and the ten LBP columns, in the order they appear in df_ml.
model_columns = ['truth'] + ['LBP{}'.format(i) for i in range(10)]
df_ml1 = df_ml.select([name for name in df_ml.columns if name in model_columns])

In [140]:
import pyspark.ml.feature as ft

In [158]:
# (column name, Spark type) pairs: the 'truth' column first, then LBP0..LBP9.
labels1 = [('truth', typ.IntegerType())] + [
    ('LBP{}'.format(i), typ.FloatType()) for i in range(10)
]

In [159]:
# Assemble only the LBP columns into the 'features' vector.
# labels1[0] is 'truth', which the classifier uses as its label column; leaving
# it in inputCols would feed the target to the model as a feature (label
# leakage), so the slice starts at index 1.
featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels1[1:]],outputCol='features'
)

In [160]:
import pyspark.ml.classification as cl

In [161]:
# Logistic regression on the assembled 'features' vector with 'truth' as the
# label; at most 10 iterations, regularisation parameter 0.01.
logistic = cl.LogisticRegression(
    featuresCol='features',
    labelCol='truth',
    maxIter=10,
    regParam=0.01,
)

In [162]:
from pyspark.ml import Pipeline

In [163]:
# Two-stage pipeline: assemble the feature vector, then run the classifier.
pipeline = Pipeline(stages=[featuresCreator,logistic])

In [166]:
# Randomly split the modelling table with weights 0.7 / 0.3 and seed 1.
lbp_train, lbp_test = df_ml1.randomSplit([0.7,0.3],seed=1)

In [167]:
# Fit the pipeline stages on the training split.
model = pipeline.fit(lbp_train)