In [5]:
# All the imports you will need in the whole lab
from skimage.feature import greycomatrix, greycoprops
from skimage import io
from skimage.color import rgb2gray
import matplotlib.pyplot as plt
import os
import numpy as np
import cv2
import imutils
from skimage.transform import (hough_line, hough_line_peaks)
from skimage.feature import greycomatrix, greycoprops, canny, corner_harris
from skimage.morphology import binary_erosion, binary_dilation, binary_closing,skeletonize, thin
from sklearn.model_selection import train_test_split
from skimage.filters import sobel_h, sobel, sobel_v,roberts, prewitt,threshold_otsu
import glob
import math
from matplotlib import cm
from skimage.measure import find_contours

from sklearn.tree import DecisionTreeClassifier


In [6]:
def show_images(images, titles=None):
    # This function is used to show image(s) with titles by sending an array of images and an array of associated titles.
    # images[0] will be drawn with the title titles[0] if exists
    # You aren't required to understand this function, use it as-is.
    n_ims = len(images)
    if titles is None:
        titles = ['(%d)' % i for i in range(1, n_ims + 1)]
    fig = plt.figure()
    n = 1
    for image, title in zip(images, titles):
        a = fig.add_subplot(1, n_ims, n)
        if image.ndim == 2:
            plt.gray()
        plt.imshow(image)
        a.set_title(title)
        n += 1
    fig.set_size_inches(np.array(fig.get_size_inches()) * n_ims)
    plt.show()

In [7]:
def preprocess(img):
    img = np.asarray(img)
    img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    return img

In [8]:
def preprocessing(gray):


    img = cv2.GaussianBlur(gray,(3,3),0)

    # convolute with proper kernels
    dest = cv2.Laplacian(gray, cv2.CV_16S, ksize=3)
    abs_dest = cv2.convertScaleAbs(dest)
   
    Otsu_Threshold = threshold_otsu(gray)   
    binary = gray < Otsu_Threshold  
    skeleton_img =  skeletonize(binary)

    #separating the diac.
    H = np.sum(binary,axis = 1)
    I = np.argmax(H)
    point = [(x,I) for x in range(binary.shape[0])]
    # print(I)
    h, w = binary.shape[:2]
    # print(h,w)
    mask = np.zeros((h+2, w+2), np.uint8)
    # print(mask)
    binary = binary < 1 
    binary = binary.astype(np.uint8)
    # show_images([binary])
    for j in range(binary.shape[1] - 1):
        if binary[I][j] == 0 and binary[I][j+1] == 1:
            cv2.floodFill(binary,mask,seedPoint = (j,I),newVal = 1) 
    # print(b)
    text = mask
    diacritics = binary
    return img,abs_dest,skeleton_img,text,diacritics

In [9]:
def HVSL(edge):
    horizontal = edge.copy()
    vertical = edge.copy()
    H = 0
    V = 0 
    
    cols = horizontal.shape[1]
    horizontal_size = cols / 30
    horizontalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (int(horizontal_size + 4), 1))
    horizontal = cv2.erode(horizontal, horizontalStructure)
    horizontal = cv2.dilate(horizontal, horizontalStructure)
    Otsu_Threshold = threshold_otsu(horizontal)   
    horizontal = horizontal > Otsu_Threshold 
    H, output, stats, centroids = cv2.connectedComponentsWithStats(horizontal.astype(np.uint8),connectivity=8)
    


    rows = vertical.shape[0]
    verticalsize = rows / 30
    verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1,int(verticalsize + 4)))
    vertical = cv2.erode(vertical, verticalStructure)
    vertical = cv2.dilate(vertical.astype(np.uint8), verticalStructure)
    Otsu_Threshold = threshold_otsu(vertical)   
    vertical = vertical > Otsu_Threshold 

    V, output, stats, centroids = cv2.connectedComponentsWithStats(vertical.astype(np.uint8),connectivity=8)

    
    # for i in range(1,horizontal.shape[0] ):    
    #     for j in range(horizontal.shape[1]):
    #         if horizontal[i][j] == 1 and horizontal[i - 1 ][j] == 0:
    #             H += 1

    

    # vertical_z = binary_erosion(vertical,[[1,1,1,1],[0,0,0,0],[0,0,0,0],[0,0,0,0]])
    # print(horizontal,vertical)

    # for i in range(vertical.shape[0] - 1):    
    #     for j in range(1,vertical.shape[1] - 1):
            
    #         if vertical[i][j] == 1 and vertical[i][j-1] == 0:
    #             V += 1
    # sizes = stats[1:, -1]
    # V = V - 1

    # # your answer image
    # img2 = vertical
    # # for every component in the image, you keep it only if it's above min_size
    # for i in range(0, V):
    #     if sizes[i] < 20:
    #         img2[output == i + 1] = 0

    # houghSpace,angles, dists = hough_line(horizontal)
    # houghSpace,angles, dists = hough_line_peaks(houghSpace,angles,dists,threshold=0.5*np.amax(houghSpace))
    # H = 0
    # V = 0
    # for angle, dist in zip(angles,dists):
    #     if(angle == 0 or angle == math.pi):
    #         H += 1
    #     elif(angle == math.pi/2 or angle == 1.5 * math.pi):
    #         V += 1
    return H - 1,V - 1


In [10]:
def TOE(edge):
    houghSpace,angles, dists = hough_line(edge)
    houghSpace,angles, dists = hough_line_peaks(houghSpace,angles,dists,threshold=0.5*np.amax(houghSpace))
    start = -2
    bin = []
    while True :
        if(start >= 2):
            break
        bin.append(start)
        start += 0.01
    bin = [round(bins,2) for bins in bin ]
    angles = [round(angle,2) for angle in angles ]

    hist , bins = np.histogram(angles,bins = bin)
    return hist,bins

In [11]:
def TOS(skeleton):
    houghSpace,angles, dists = hough_line(skeleton)
    houghSpace,angles, dists = hough_line_peaks(houghSpace,angles,dists,threshold=0.5*np.amax(houghSpace))

    start = -2
    bin = []
    while True :
        if(start >= 2):
            break
        bin.append(start)
        start += 0.01
    bin = [round(bins,2) for bins in bin ]
    angles = [round(angle,2) for angle in angles ]
    hist , bins = np.histogram(angles,bins = bin)
    return hist,bins

In [12]:
def LVL(ske):
    vertical = ske.copy()
    V = 0 
    rows = vertical.shape[0]
    verticalsize = rows // 30
    verticalStructure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, int(verticalsize + 4)))
    vertical = cv2.erode(vertical.astype(np.uint8), verticalStructure)
    vertical = cv2.dilate(vertical.astype(np.uint8), verticalStructure)
    Otsu_Threshold = threshold_otsu(vertical)   
    vertical = vertical > Otsu_Threshold 
    V, output, stats, centroids = cv2.connectedComponentsWithStats(vertical.astype(np.uint8),connectivity=8)
    sizes = stats[1:, -1]

    Otsu_Threshold = threshold_otsu(ske)   
    ske = ske > Otsu_Threshold 
    min = math.inf
    max = -1
    for i in range(ske.shape[0]):
        for j in range(ske.shape[1]):
            if (ske[i][j] == 1):
                if i > max:
                    max = i
                elif i < min:
                    min = i
    text_hight = max - min
    num_VL = V - 1
    if(len(sizes) == 0):
        sizes = [2,1,2]
    higtest_VL = np.max(sizes)
    drvt = text_hight - higtest_VL
    variance = np.var(sizes)
    # print(sizes)
    # print(text_hight,"text hight")
    # print(num_VL,"num_VL")
    # print(higtest_VL,"higtest_VL")
    # print(drvt,"drvt")
    # print(variance,"variance")
    return text_hight,num_VL,higtest_VL,drvt,variance


In [13]:
def Tth(edge,ske):
    Otsu_Threshold = threshold_otsu(ske)   
    ske = ske > Otsu_Threshold 

    Otsu_Threshold = threshold_otsu(edge)   
    edge = edge > Otsu_Threshold 

    dest_up = []
    dest_down = []
    for i in range(ske.shape[0]):
        for j in range(ske.shape[1]):
            if (ske[i][j] == 1):
                c = True
                for k in range(i + 1,edge.shape[0]):
                    if(edge[k][j] == 1):
                        c = False
                        d = abs(k - i)
                        dest_down.append(d)
                        break
                if(c):
                    dest_down.append(0)        
                
                u = 0
                for k in range(0, i):
                    if(edge[k][j] == 1):
                        u = k
                dest_up.append(abs(u-i))
    return dest_down + dest_up

In [14]:
def SDs(diacritics):

    arra  = np.zeros(diacritics.shape)
    arra = diacritics == 0

    contours = find_contours(arra ,level = 0.2,fully_connected='high')

    
    m1 = rgb2gray(io.imread("mark1.jpg"))
    Otsu_Threshold = threshold_otsu(m1)   
    m1 = m1 < Otsu_Threshold 


    m2 = rgb2gray(io.imread("mark2.jpg"))
    Otsu_Threshold = threshold_otsu(m2)   
    m2 = m2 < Otsu_Threshold 

    dist_1 = []
    dist_2 = []
    bounding_boxes = []
    for contour in contours:
        Y_Values = np.asarray(contour[:,0])
        X_Values = np.asarray(contour[:,1])
        bounding_boxes.append([
        int(np.amin(X_Values)),
        int(np.amax(X_Values)),
        int(np.amin(Y_Values)),
        int(np.amax(Y_Values))])

    for box in bounding_boxes:
        [Xmin, Xmax, Ymin, Ymax] = box
        dist_1.append(cv2.matchShapes(arra[Ymin:Ymax,Xmin:Xmax].astype(np.uint8),m1.astype(np.uint8),cv2.CONTOURS_MATCH_I2,0))
        dist_2.append(cv2.matchShapes(arra[Ymin:Ymax,Xmin:Xmax].astype(np.uint8),m2.astype(np.uint8),cv2.CONTOURS_MATCH_I2,0))
    if (len(dist_1) == 0 ):
        return 0 , 0 
    return np.min(dist_1),np.min(dist_2)

In [15]:
def WOR(text):
    contours, _ = cv2.findContours(text.astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)
    angles = []
    for i, c in enumerate(contours):
        
        # Calculate the area of each contour
        area = cv2.contourArea(c)
        
        # Ignore contours that are too small or too large
        rect = cv2.minAreaRect(c)
        box = cv2.boxPoints(rect)
        box = np.int0(box)

        center = (int(rect[0][0]),int(rect[0][1])) 
        width = int(rect[1][0])
        height = int(rect[1][1])
        angle = int(rect[2])

        if width < height:
            angle = 90 - angle
        else:
            angle = -angle
        angles.append(angle)
    angles = np.sort(angles)[:-1]
    ori = np.average(angles)
    if(len(angles) == 0):
        return 0
    return(ori)

In [16]:
def HPP(img):
    H = np.sum(img[1:img.shape[0]-1,:],axis = 1)
    I = np.argmax(H)
    H = np.sort(H)
    if(I == 0):
        I = 1
    hpp = np.sum(img[:,1:img.shape[1] - 1]) / I
    return H[-3:],hpp

In [17]:
# img,edges,ske,text,diacritics = preprocessing("ACdata_base/9/1510.JPG")
# Otsu_Threshold = threshold_otsu(img)   
# img = img < Otsu_Threshold
# show_images([img,edges,ske,text,diacritics])
# H,V = HVSL(edges)
# hist_e,b_e = TOE(edges)
# hist_s,b_s = TOS(ske)
# LVL(ske)
# Tth(edges,ske)
# SDs(diacritics)
# HPP(text)
# WOR(text)

In [18]:
def readData():
    xData = []
    yData = []
    for filename in sorted(glob.glob('ACdata_base/1/*.jpg')):
        img = cv2.imread(filename)
        img = preprocess(img)
        xData.append(img)
        yData.append(1)
    for filename in sorted(glob.glob('ACdata_base/2/*.jpg')):
        img = cv2.imread(filename)
        img = preprocess(img)
        xData.append(img)
        yData.append(2)
    for filename in sorted(glob.glob('ACdata_base/3/*.jpg')):
        img = cv2.imread(filename)
        img = preprocess(img)
        xData.append(img)
        yData.append(3)
    for filename in sorted(glob.glob('ACdata_base/4/*.jpg')):
        img = cv2.imread(filename)
        img = preprocess(img)
        xData.append(img)
        yData.append(4)
    for filename in sorted(glob.glob('ACdata_base/5/*.jpg')):
        img = cv2.imread(filename)
        img = preprocess(img)
        xData.append(img)
        yData.append(5)
    for filename in sorted(glob.glob('ACdata_base/6/*.jpg')):
        img = cv2.imread(filename)
        img = preprocess(img)
        xData.append(img)
        yData.append(6)
    for filename in sorted(glob.glob('ACdata_base/7/*.jpg')):
        img = cv2.imread(filename)
        img = preprocess(img)
        xData.append(img)
        yData.append(7)
    for filename in sorted(glob.glob('ACdata_base/8/*.jpg')):
        img = cv2.imread(filename)
        img = preprocess(img)
        xData.append(img)
        yData.append(8)
    for filename in sorted(glob.glob('ACdata_base/9/*.jpg')):
        img = cv2.imread(filename)
        img = preprocess(img)
        xData.append(img)
        yData.append(9)
    xTrain, xTest, yTrain, yTest = train_test_split(np.asarray(xData), np.asarray(yData), test_size = 0.2, shuffle = True)
    return xTrain, xTest, yTrain, yTest

In [47]:
def Featur_Extraction(xTrain):
    Features = []
    for gray in xTrain:
        img_feature = []
        img,edges,ske,text,diacritics = preprocessing(gray)
        Otsu_Threshold = threshold_otsu(img)   
        img = img < Otsu_Threshold

        H,V = HVSL(edges)
        img_feature.append(H)
        img_feature.append(V)

        hist_e , b_e = TOE(edges)
        hist_s , b_s = TOS(ske)
        for i in range(len(hist_e)):
            img_feature.append(hist_e[i])
            img_feature.append(hist_s[i])
        
        text_hight,num_VL,higtest_VL,drvt,variance = LVL(ske)
        img_feature.append(text_hight)
        img_feature.append(num_VL)
        img_feature.append(higtest_VL)
        img_feature.append(drvt)
        img_feature.append(variance)

        Thickness = Tth(edges,ske)
        for i in range(200):
            img_feature.append(Thickness[i])
        
        d1 , d2 = SDs(diacritics)
        img_feature.append(d1)
        img_feature.append(d2)
        
        H , hpp = HPP(text)
        img_feature.append(H[0])
        img_feature.append(H[1])
        img_feature.append(H[2])
        img_feature.append(hpp)
         
        ori = WOR(text)
        img_feature.append(ori)
        Features.append(img_feature)
        
    return Features

In [48]:
xTrain, xTest, yTrain, yTest = readData()
xTrain = Featur_Extraction(xTrain)


  xTrain, xTest, yTrain, yTest = train_test_split(np.asarray(xData), np.asarray(yData), test_size = 0.2, shuffle = True)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [None]:
def read_data(file_name):
    Data = np.genfromtxt(file_name,delimiter = ",")
    ## HINT 1: How is the data ordered in the file?
    ## HINT 2: Do you need to cast the data you read from the file?
    return Data

In [54]:
def Featur_Extraction_test(gray):
    img_feature = []
    img,edges,ske,text,diacritics = preprocessing(gray)
    Otsu_Threshold = threshold_otsu(img)   
    img = img < Otsu_Threshold

    H,V = HVSL(edges)
    img_feature.append(H)
    img_feature.append(V)

    hist_e , b_e = TOE(edges)
    hist_s , b_s = TOS(ske)
    for i in range(len(hist_e)):
        img_feature.append(hist_e[i])
        img_feature.append(hist_s[i])
    
    text_hight,num_VL,higtest_VL,drvt,variance = LVL(ske)
    img_feature.append(text_hight)
    img_feature.append(num_VL)
    img_feature.append(higtest_VL)
    img_feature.append(drvt)
    img_feature.append(variance)

    Thickness = Tth(edges,ske)
    for i in range(200):
        img_feature.append(Thickness[i])
    
    d1 , d2 = SDs(diacritics)
    img_feature.append(d1)
    img_feature.append(d2)
    
    H , hpp = HPP(text)
    # diff = abs((H[0] - H[1]) + (H[1] - H[2]))
    img_feature.append(H[0])
    img_feature.append(H[1])
    img_feature.append(H[2])
    img_feature.append(hpp)
        
    ori = WOR(text)
    img_feature.append(ori)
    return img_feature

In [50]:
def calculateDistance(x1, x2):
    distance = np.linalg.norm(x1 - x2)
    return distance

In [51]:
def MinimumDistanceClassifier(testPoint, trainingFeatures, yTrain):
    distances = list()
    centers = [np.mean(trainingFeatures[yTrain == 1], axis=0), np.mean(trainingFeatures[yTrain == 2], axis=0), np.mean(trainingFeatures[yTrain == 3], axis=0), np.mean(trainingFeatures[yTrain == 4], axis=0), np.mean(trainingFeatures[yTrain == 5], axis=0), np.mean(trainingFeatures[yTrain == 6], axis=0), np.mean(trainingFeatures[yTrain == 7], axis=0), np.mean(trainingFeatures[yTrain == 8], axis=0), np.mean(trainingFeatures[yTrain == 9], axis=0)]
    for i in range(len(centers)):
        distances.append(calculateDistance(testPoint, centers[i]))
    classification = np.argmin(distances)
    return classification

In [52]:
minDistPredictions = np.zeros(yTest.shape)
classes = ['1', '2', '3', '4', '5', '6', '7', '8', '9']
print(np.asarray(xTrain).astype(np.uint8).shape)
print(yTrain.shape)
for i in range(len(xTest)):
    testPoint = Featur_Extraction_test(xTest[i])
    # print(np.asarray(testPoint).astype(np.uint8))
    # fig = plt.figure()
    # plt.imshow(xTest[i])
    # plt.axis("off")
    # plt.show()
    # print("Actual class :", yTest[i])
    # print("---------------------------------------")
    minDistPrediction = MinimumDistanceClassifier(np.asarray(testPoint).astype(np.uint8), np.asarray(xTrain).astype(np.uint8), yTrain)
    minDistPredictions[i] = classes[minDistPrediction]
    # print("Minimum Distance Classifier Prediction   :", classes[minDistPrediction])
    # print("===========================================================================")
print(np.sum(minDistPredictions == yTest))
mdcAccuracy = (np.sum(minDistPredictions == yTest) / len(yTest)) * 100
print("Minimum Distance Classifier Accuracy: ", mdcAccuracy, "%")

(1348, 1012)
(1348,)


  diff = abs((H[0] - H[1]) + (H[1] - H[2]))


203
Minimum Distance Classifier Accuracy:  60.23738872403561 %


In [59]:
# numClasses = 9
# M = len(xTrain)
# K = len(xTest)
# N = len(xTrain[0])

# X = np.asarray(xTrain)
# X_Test = xTest
# Y = np.uint8(np.asarray(yTrain))

xTrain = np.asarray(xTrain)
xTest = np.asarray(xTest)
numClasses = 9
M = np.asarray(xTrain).shape[0]
N = np.asarray(xTrain).shape[1] 
K = np.asarray(xTest).shape[0]

X = np.asarray(xTrain)[:, 1:]
X_Test = xTest[:]
Y = np.reshape(yTrain, (M, 1))

In [128]:
# pClasses = [] # A list of size (numClasses, 1) containing the a priori probabilities of each class in the training set.

# estimate_means = [] # A numpy array of size (numClasses, N) containing the mean points of each class in the training set. 
#                     # HINT: USE NP.MEAN

# estimate_covariances = [] # A numpy array of size (numClasses, N, N) containing the covariance matrices of each class in the training set.
#                           # HINT: USE NP.COV (Pay attenention for what it takes as an argument)
# # print(Y)  
# for classIndex in range(numClasses):
#     # TODO [5]: Estimate the parameters of the Gaussian distributions of the given classes.
#     # Fill pClasses, estimate_means, and estimate_covariances in this part 
#     # Your code should be vectorized WITHOUT USING A SINGLE FOR LOOP.
#     # print(len(Y == classIndex + 1),len(X))
#     pClasses.append(len(X[Y == classIndex + 1])/len(X))
#     estimate_means.append(np.mean(X[Y == classIndex + 1],axis= 0))
#     # find out covariance with respect  columns
#     # print(X[Y == classIndex + 1].shape)
#     cov_mat = np.stack(X[Y == classIndex + 1], axis = 0)
#     # print(cov_mat.shape)
#     arr = []
#     for i in range(len(X)):
#         if(i == 1010):
#             break
#         arr.append(X[Y == classIndex + 1][:,i])
        
#     # estimate_covariances.append(np.cov(np.asarray(cov_mat).T))
#     estimate_covariances.append(np.cov(X[Y == classIndex + 1][:, :].T))
#     pass
# estimate_means = np.array(estimate_means)
# estimate_covariances = np.array(estimate_covariances)

pClasses = [] # A list of size (numClasses, 1) containing the a priori probabilities of each class in the training set.

estimate_means = [] # A numpy array of size (numClasses, N) containing the mean points of each class in the training set. 
                    # HINT: USE NP.MEAN

estimate_covariances = [] # A numpy array of size (numClasses, N, N) containing the covariance matrices of each class in the training set.
                          # HINT: USE NP.COV (Pay attenention for what it takes as an argument)
                             
for classIndex in range(numClasses):
    # TODO [5]: Estimate the parameters of the Gaussian distributions of the given classes.
    # Fill pClasses, estimate_means, and estimate_covariances in this part 
    # Your code should be vectorized WITHOUT USING A SINGLE FOR LOOP.
    _, pClasses = np.unique(xTrain[:, 0], return_counts=True)
    pClasses = pClasses / M
    estimate_means.append(np.mean(xTrain[xTrain[:, 0] == (classIndex + 1)][:, 1:], axis=0))
    estimate_covariances.append(np.cov(xTrain[xTrain[:, 0] == (classIndex + 1)][:, 1:].T))

estimate_means = np.array(estimate_means)
estimate_covariances = np.array(estimate_covariances)


In [129]:
print(np.linalg.inv(estimate_covariances[4]))

LinAlgError: Singular matrix

In [63]:
# TODO 6: Implement the multivariate normal gaussian distribution with parameters mu and sigma, and return the
#  value in prob.
# HINT: Calculate each part of the equation first, then combine them. 
#      That is: calulate co-efficient first, the parameter of the exponentiation, then combine them
def multivariate_normal_gaussian(X, mu, sigma):
    coefficient = 1 / (((2 * np.pi) ** (X.shape[0] / 2)) * (np.linalg.det(sigma) ** 0.5))
    expParameter = -0.5 * np.matmul(np.matmul((X - mu).T, np.linalg.inv(sigma)), (X - mu))
    prob = coefficient * np.exp(expParameter)
    return prob

In [64]:
# TODO [7]: Apply the Bayesian Classifier to predict the classes of the test points.
predicted_classes = [] # predicted_classes: A numpy array of size (K, 1) where K is the number of points in the test set. Every element in this array
                       # contains the predicted class of Bayes classifier for this test point.

for i in range(X_Test.shape[0]):
    testPoint = np.asarray(Featur_Extraction_test(X_Test[i]))
    classProbabilities = np.zeros(numClasses)
    # TODO [7.A]: Compute the probability that the test point X_Test[i] belongs to each class in numClasses.
    #  Fill the array classProbabilities accordingly.
    for j in range(numClasses):
        classProbabilities[j] = multivariate_normal_gaussian(testPoint,estimate_means[j],estimate_covariances[j])

    # TODO [7.B]: Find the prediction of the test point X_Test[i] and append it to the predicted_classes array.
    predicted_classes.append(np.argmax(classProbabilities) + 1) 
# accuracy = np.count_nonzero(test_data_true[test_data_true == predicted_classes]) / K
BayesianClassifierAccuracy = (np.sum(predicted_classes == yTest) / len(yTest)) * 100
print("Bayesian Classifier Accuracy: ", BayesianClassifierAccuracy, "%")
# print('Accuracy = ' + str(round(BayesianClassifierAccuracy,4) * 100) + '%')



OverflowError: (34, 'Result too large')

In [111]:
import utils

ModuleNotFoundError: No module named 'utils'

In [96]:
xTest = np.asarray(xTest)
xTrain = np.asarray(np.float32(xTrain))
yTest = np.asarray(yTest)
yTrain = np.asarray(np.float32(yTrain))

# for i in range(xTest.shape[0]):
#     testPoint = np.asarray(Featur_Extraction_test(xTest[i]))
#     x_test.append(testPoint)
# x_test = np.asarray(x_test)

for i in range(xTrain.shape[0]):
    for j in range(xTrain.shape[1]):
        if(xTrain[i][j] > 10000 ):
            xTrain[i][j] = 0
print(np.all(np.isfinite(xTrain)))

True


In [103]:
_, xTest, _, yTest = readData()

  xTrain, xTest, yTrain, yTest = train_test_split(np.asarray(xData), np.asarray(yData), test_size = 0.2, shuffle = True)


In [105]:
x_test = []
for i in range(xTest.shape[0]):
    testPoint = np.asarray(Featur_Extraction_test(xTest[i]))
    x_test.append(testPoint)
x_test = np.asarray(x_test)


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [106]:

for i in range(x_test.shape[0]):
    for j in range(x_test.shape[1]):
        if(x_test[i][j] > 10000 ):
            x_test[i][j] = 0
print(np.all(np.isfinite(x_test)))
print(x_test.shape)

True
(337, 1012)


In [108]:
xTest = np.asarray(x_test) 
print(np.all(np.isfinite(xTest)))
print(xTest.shape)

True
(337, 1012)


In [124]:
# def adaboost_classifier(Y_train, X_train, Y_test, X_test, T, clf):
    
#     #TODO: FILL THE FUNCTION with the implementation as the steps above

#     # TODO [1]: Initialize weights
#     w =np.full(X_train.shape[0] , 1 / X_train.shape[0]) 
    
#     ## TODO [2]:  Initialize the training and test data with empty array placeholders
#     #### Hint: what should be their shape?
    
#     pred_train = np.zeros(Y_train.shape) ## predicted classes of the training examples
#     pred_test = np.zeros(Y_test.shape)  ## predicted classes of the test examples

#     ## TODO [3]: loop over the boosting iterations 
#     for i in range(T): 

#         # TODO [4]: Fit a classifier with the specific weights 
#         ## TODO [4.A]: fit the classifier on the training data
#         #### Hint: search how sklearn.tree.DecisionTreeClassifier fits classifier on data
#         ### Hint: search for parameter weights in the fit matrix
#         clf.fit(X_train , Y_train, w)
#         # TODO [4.B]: predict classes for the training data and test data
#         pred_train_i = clf.predict(X_train) 
#         pred_test_i = clf.predict(X_test) 
#         # TODO [5]: calculate the miss Indicator function
#         m_I = (Y_train!= pred_train_i).astype(int)
#         # TODO [6]: calculate the error for the current classifier (err_t)
#         err_t = ( np.sum(w * m_I) ) / (np.sum(w)) 
        
#         # TODO [7]: calculate current classifier weight (Alpha_t)
#         alpha_t = np.log((1 - err_t) / err_t)
        
#         # TODO [8]: update the weights 
#         w = w * np.exp(alpha_t * m_I)
#         w = w / np.sum(w)
#         # TODO [9] Add to the overall predictions
#         pred_train += alpha_t * pred_train_i
#         pred_test += alpha_t * pred_test_i
        
#     pred_train = np.sign(pred_train)
#     pred_test = np.sign(pred_test)
#     print(pred_train)
#     # TODO [10]: Return error rate in train and test set
#     #### Hint: use function get_accuracy from utils.py
#     BayesianClassifierAccuracy = (np.sum(pred_test == Y_test) / len(Y_test)) * 100
#     print("Bayesian Classifier Accuracy: ", BayesianClassifierAccuracy, "%")

#     s = (np.sum(pred_train == Y_train) / len(Y_train)) * 100
#     print("Bayesian Classifier Accuracy: ", s, "%")
#     # train_error = utils.get_accuracy(pred_train,Y_train)
#     # test_error =  utils.get_accuracy(pred_test,Y_test)
#     return BayesianClassifierAccuracy, s

def adaboost_classifier(Y_train, X_train, Y_test, X_test, T, clf):
    
    #TODO: FILL THE FUNCTION with the implementation as the steps above

    # TODO [1]: Initialize weights
    w =np.full(X_train.shape[0] , 1 / X_train.shape[0]) 
    
    ## TODO [2]:  Initialize the training and test data with empty array placeholders
    #### Hint: what should be their shape?
    
    pred_train = np.zeros(Y_train.shape) ## predicted classes of the training examples
    pred_test = np.zeros(Y_test.shape)  ## predicted classes of the test examples

    ## TODO [3]: loop over the boosting iterations 
    for i in range(T): 

        # TODO [4]: Fit a classifier with the specific weights 
        ## TODO [4.A]: fit the classifier on the training data
        #### Hint: search how sklearn.tree.DecisionTreeClassifier fits classifier on data
        ### Hint: search for parameter weights in the fit matrix
        clf.fit(X_train , Y_train, w)
        # TODO [4.B]: predict classes for the training data and test data
        pred_train_i = clf.predict(X_train) 
        pred_test_i = clf.predict(X_test) 
        # TODO [5]: calculate the miss Indicator function
        m_I = (Y_train!= pred_train_i).astype(int)
        # TODO [6]: calculate the error for the current classifier (err_t)
        err_t = ( np.sum(w * m_I) ) / (np.sum(w)) 
        
        # TODO [7]: calculate current classifier weight (Alpha_t)
        alpha_t = np.log((1 - err_t) / err_t)
        
        # TODO [8]: update the weights 
        w = w * np.exp(alpha_t * m_I)
        w = w / np.sum(w)
        # TODO [9] Add to the overall predictions
        pred_train += alpha_t * pred_train_i
        pred_test += alpha_t * pred_test_i
        
    pred_train = np.sign(pred_train)
    pred_test = np.sign(pred_test)
    # TODO [10]: Return error rate in train and test set
    #### Hint: use function get_accuracy from utils.py
    # train_error = utils.get_accuracy(pred_train,Y_train)
    # test_error =  utils.get_accuracy(pred_test,Y_test)
    BayesianClassifierAccuracy = (np.sum(pred_test == Y_test) / len(Y_test)) * 100
    print("Bayesian Classifier Accuracy: ", BayesianClassifierAccuracy, "%")

    s = (np.sum(pred_train == Y_train) / len(Y_train)) * 100
    print("Bayesian Classifier Accuracy: ", pred_train,Y_test, "%")
    return BayesianClassifierAccuracy, s

In [125]:
# Fit a simple decision tree first
clf_tree = DecisionTreeClassifier(max_depth=1, random_state=1)

# Fit Adaboost classifier using a decision tree as base estimator
# Test with different number of iterations
acc_train, acc_test = [],[]
x_range = range(10, 410, 50)
print(xTest.shape)
for i in x_range:
    print('Number of Iterations : ' , i)
    acc_i = adaboost_classifier(yTrain, xTrain, yTest, xTest, i, clf_tree)
    acc_train.append(acc_i[0])
    acc_test.append(acc_i[1])

# Compare error rate vs number of iterations
utils.plot_accuracy(acc_train, acc_test)

(337, 1012)
Number of Iterations :  10
Bayesian Classifier Accuracy:  0.0 %
Bayesian Classifier Accuracy:  [-1. -1. -1. ... -1. -1. -1.] [4 2 1 2 6 8 7 9 4 9 4 2 7 4 2 4 9 3 8 8 5 8 6 6 5 6 2 4 7 8 5 3 6 9 5 4 4
 4 5 1 8 1 7 5 1 6 5 2 1 2 2 4 8 3 1 1 6 2 4 1 9 1 5 7 6 6 4 3 5 6 3 5 8 4
 5 1 7 2 4 6 9 6 4 2 9 9 7 9 1 8 7 2 5 7 3 3 4 7 5 9 1 5 4 5 9 3 1 3 2 8 9
 5 6 6 3 9 5 6 2 1 1 5 6 8 7 5 3 4 4 7 9 3 3 3 5 5 6 2 1 5 2 4 8 8 3 1 2 3
 6 7 7 2 5 5 8 6 5 4 6 4 1 4 3 6 3 2 8 7 5 9 2 7 7 8 4 8 5 2 5 5 5 5 4 3 4
 7 9 8 5 2 5 6 5 4 9 6 9 8 9 3 6 8 3 8 8 9 1 4 2 6 2 9 7 6 4 3 9 3 9 4 5 7
 3 9 6 6 1 4 1 6 5 5 2 4 3 1 7 8 7 2 4 2 5 9 2 3 1 4 6 3 8 7 9 9 4 7 9 2 5
 2 6 8 8 7 5 2 4 1 2 7 2 5 9 6 2 1 9 4 5 1 4 6 4 1 5 8 4 7 5 5 5 7 5 5 2 2
 4 8 3 3 2 9 4 7 2 8 2 7 2 1 1 3 9 4 7 7 2 5 6 7 3 5 9 8 6 9 1 2 1 8 9 8 3
 1 4 8 9] %
Number of Iterations :  60
Bayesian Classifier Accuracy:  0.0 %
Bayesian Classifier Accuracy:  [-1. -1. -1. ... -1. -1. -1.] [4 2 1 2 6 8 7 9 4 9 4 2 7 4 2 4 9 3 8 8 5 8 6 6 5 

NameError: name 'utils' is not defined

In [None]:
print(len(minDistPredictions),yTest)

[2, 6, 1, 2, 3, 1, 6, 8, 8, 1, 1, 3, 0, 0, 6, 7, 0, 4, 7, 7, 3, 7, 1, 4, 0, 5, 3, 0, 4, 0, 3, 1, 1, 7, 2, 5, 1, 0, 2, 1, 1, 0, 1, 0, 4, 4, 5, 2, 0, 0, 3, 7, 6, 1, 2, 2, 7, 1, 1, 2, 7, 7, 4, 6, 1, 1, 1, 8, 5, 1, 1, 1, 6, 5, 2, 3, 3, 8, 4, 4, 6, 0, 6, 0, 0, 8, 2, 7, 8, 2, 3, 2, 1, 1, 4, 1, 1, 1, 8, 0, 1, 0, 1, 2, 1, 4, 2, 4, 1, 1, 7, 1, 5, 8, 4, 0, 8, 7, 1, 7, 2, 5, 2, 7, 5, 3, 8, 1, 4, 8, 2, 0, 1, 3, 1, 4, 2, 8, 5, 1, 5, 8, 1, 5, 3, 1, 4, 6, 1, 2, 2, 7, 6, 0, 7, 0, 7, 1, 5, 3, 2, 5, 0, 1, 7, 0, 0, 1, 7, 1, 2, 1, 2, 5, 7, 1, 1, 0, 6, 2, 8, 5, 1, 8, 7, 3, 1, 1, 8, 2, 6, 6, 1, 7, 4, 5, 3, 3, 2, 6, 6, 2, 8, 5, 5, 3, 0, 1, 6, 3, 5, 2, 0, 7, 4, 1, 1, 6, 0, 4, 0, 5, 0, 6, 1, 0, 1, 6, 0, 6, 8, 2, 1, 1, 8, 7, 8, 0, 1, 0, 1, 5, 4, 7, 2, 2, 5, 6, 8, 0, 1, 1, 3, 7, 3, 3, 6, 5, 7, 0, 8, 2, 5, 6, 6, 1, 0, 8, 3, 8, 8, 2, 5, 3, 3, 7, 8, 1, 3, 4, 1, 6, 7, 6, 4, 1, 2, 2, 3, 1, 1, 0, 7, 2, 2, 7, 3, 2, 8, 3, 1, 4, 7, 1, 2, 7, 7, 2, 1, 0, 6, 2, 1, 1, 8, 3, 0, 8, 7, 1, 5, 4, 0, 1, 3, 1, 4, 0, 3, 2, 2, 1, 5, 

In [None]:
m1 = rgb2gray(io.imread("mark1.jpg"))
Otsu_Threshold = threshold_otsu(m1)   
m1 = m1 < Otsu_Threshold 

m2 = rgb2gray(io.imread("mark2.jpg"))
Otsu_Threshold = threshold_otsu(m2)   
m2 = m2 < Otsu_Threshold 

i1 = rgb2gray(io.imread("ACdata_base/9/1510.JPG"))
Otsu_Threshold = threshold_otsu(i1)   
i1 = i1 < Otsu_Threshold 

img,edges,ske,text,diacritics = preprocess("ACdata_base/9/1510.JPG")
arra  = np.zeros(diacritics.shape)
arra = diacritics == 0

d1 = cv2.matchShapes(diacritics.astype(np.uint8),m2.astype(np.uint8),cv2.CONTOURS_MATCH_I2,0)

img,edges,ske,text,diacritics = preprocess("ACdata_base/8/1358.JPG")

d2 = cv2.matchShapes(diacritics.astype(np.uint8),m2.astype(np.uint8),cv2.CONTOURS_MATCH_I2,0)


print(d1,d2)

In [130]:
from sklearn import svm


In [132]:
#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(xTrain, yTrain)

#Predict the response for test dataset
predict_Test = clf.predict(xTest)
kk = (np.sum(predict_Test == yTest) / len(yTest)) * 100
print("Bayesian Classifier Accuracy: ", kk, "%")

Bayesian Classifier Accuracy:  95.84569732937686 %
