In [371]:
from matplotlib import pyplot as plt
import numpy as np
# Root folder of the face images. The loops below read <path>/s<i>/<j>.pgm
# for subject folders i = 1..40 and image files j = 1..10.
# NOTE(review): this is an absolute, machine-specific path -- point it at your
# own copy of the dataset before running the notebook elsewhere.
path="/home/merna/Downloads/dataSet/orl_faces/"
D = []        # one flattened pixel vector per image
labels = []   # subject number (1..40) of the image at the same position in D
for i in range(1,41):
    for j in range(1,11):
        img = plt.imread(path+"/s"+str(i)+"/"+str(j)+".pgm")
        # Flatten the 2-D image into a single feature vector.
        D.append(np.asarray(img).flatten())
        labels.append(i)

# Build a plain 2-D ndarray instead of np.matrix: NumPy no longer recommends
# np.matrix, indexing a matrix row yields another 2-D matrix (so row[k] does not
# give a single pixel), and recent scikit-learn releases reject matrix input.
D = np.array(D)
labels = np.array(labels)
print(D.shape)

(400, 10304)


In [373]:
# Split the dataset into training and test sets by alternating rows:
# even-indexed rows (0, 2, 4, ...) are used for training and odd-indexed rows
# (1, 3, 5, ...) for testing. The label arrays are sliced the same way so each
# row keeps its label.
train_vector, train_label = D[::2], labels[::2]
test_vector, test_label = D[1::2], labels[1::2]

In [350]:
def class_counts(data):
    """Count how many rows of ``data`` carry each class label.

    Each row is converted to a flat numpy array and its LAST element is taken
    as the row's class label.

    Args:
        data: a sequence of rows (anything ``np.array`` can flatten), each with
            the label stored in its final position.

    Returns:
        dict mapping label -> number of rows with that label. Empty for empty
        input.
    """
    dic_class = {}  # label -> count
    for sample in data:
        row = np.array(sample).flatten()
        # In this dataset format the label is always the last column.
        l = row[-1]
        # Start at 0 and add one for EVERY occurrence, including the first one
        # (the first occurrence used to be recorded as 0, under-counting every
        # class by one).
        dic_class[l] = dic_class.get(l, 0) + 1
    return dic_class

In [351]:
class Question_class:
    """A threshold test on one column of a row.

    Stores the column index (``feature``) and the threshold (``value``) that
    ``check`` compares against.
    """

    def __init__(self, feature, value):
        self.feature, self.value = feature, value

    def check(self, row):
        """Return the result of ``row[feature] >= value`` for the given row."""
        return row[self.feature] >= self.value


In [352]:
def partition(data, question):
    """Split ``data`` into the rows that pass ``question`` and those that do not.

    Args:
        data: iterable of rows.
        question: object whose ``check(row)`` result decides the bucket.

    Returns:
        ``(true_rows, false_rows)`` -- two lists, each keeping the original
        row order.
    """
    passed, failed = [], []
    for sample in data:
        bucket = passed if question.check(sample) else failed
        bucket.append(sample)
    return passed, failed


In [353]:
import math
def Entropy(data):
    """Entropy of the class labels in ``data``, using the natural logarithm.

    The per-class counts come from ``class_counts(data)``; each count is divided
    by ``len(data)`` to obtain a probability, and classes whose probability is
    not positive are skipped. Returns 0 when nothing contributes.
    """
    counts = class_counts(data)
    total = float(len(data))
    result = 0
    for count in counts.values():
        share = count / total
        if share > 0:
            result -= share * math.log(share)
    return result

In [354]:
def information_gain(left, right, cur_entropy):
    """Information gain obtained by splitting a node into ``left`` and ``right``.

    gain = cur_entropy - (p_left * Entropy(left) + p_right * Entropy(right))

    where ``p_left`` / ``p_right`` are the fractions of rows that fall into each
    child.

    Args:
        left: rows of one child partition.
        right: rows of the other child partition.
        cur_entropy: entropy of the parent node before the split.

    Returns:
        The parent entropy minus the size-weighted entropy of the children.

    Raises:
        ZeroDivisionError: if both partitions are empty.
    """
    total = len(left) + len(right)
    left_probability = float(len(left)) / total
    right_probability = float(len(right)) / total
    # The weighted child entropies must be summed BEFORE being subtracted;
    # without the grouping the right child's entropy was added to the gain.
    children_entropy = left_probability * Entropy(left) + right_probability * Entropy(right)
    return cur_entropy - children_entropy

In [355]:
def find_best_split(data):
    """Search every feature/threshold pair for the split with the highest gain.

    For each feature column (every column except the last, which holds the
    label) the distinct values are sorted and the midpoint between each pair of
    neighbouring values is tried as a threshold.

    Args:
        data: sequence of rows; ``row[k]`` must be a scalar feature value and
            the last element of each row is the class label.

    Returns:
        ``(best_gain, best_question)``. ``best_question`` is None (and
        ``best_gain`` is 0) when ``data`` is empty or no threshold separates
        the rows.
    """
    # Highest information gain seen so far and the question that produced it.
    best_gain = 0
    best_question = None

    # Nothing to split: avoid indexing data[0] on empty input.
    if len(data) == 0:
        return best_gain, best_question

    # Entropy of the node before splitting (needed for the information gain).
    cur_entropy = Entropy(data)
    # Number of feature columns: everything except the trailing label column.
    features = len(data[0]) - 1

    for feature in range(features):
        # Distinct values of THIS column, sorted so that neighbouring entries
        # can be averaged (a set has no order and cannot be indexed).
        values = sorted(set(row[feature] for row in data))

        for i in range(1, len(values)):
            # Midpoint between two consecutive distinct values. Written as
            # low + diff / 2.0 so that small unsigned integer types cannot
            # overflow and integer division cannot truncate the result.
            value = values[i - 1] + (values[i] - values[i - 1]) / 2.0
            question = Question_class(feature, value)

            # Try splitting the dataset on this question.
            true_dataset, false_dataset = partition(data, question)

            # Skip this split if it doesn't divide the dataset.
            if len(true_dataset) == 0 or len(false_dataset) == 0:
                continue

            information_gain_of_this_split = information_gain(true_dataset, false_dataset, cur_entropy)

            # '>=' means that, among equally good splits, the last one tried wins.
            if information_gain_of_this_split >= best_gain:
                best_gain, best_question = information_gain_of_this_split, question

    return best_gain, best_question

In [356]:
class internal_node:
    """Non-leaf tree node: stores the question it asks and the two subtrees.

    ``true_branch`` and ``false_branch`` are the subtrees stored for the two
    possible outcomes of the question.
    """

    def __init__(self,question,true_branch,false_branch):
        self.question, self.true_branch, self.false_branch = (
            question,
            true_branch,
            false_branch,
        )
    

In [364]:
class leaf_node:
    """A leaf node classifies data.

    Attributes (documented here, set in ``__init__``):
        predictions: dictionary of class label -> count, as returned by
            ``class_counts`` for the training rows that reached this leaf.
        label: the label with the highest count in ``predictions`` (the first
            such label on ties), or None when no rows reached the leaf.
    """

    def __init__(self, data):
        dic = class_counts(data)
        self.predictions = dic
        # Keep the majority label instead of computing it and throwing it away.
        self.label = max(dic, key=dic.get) if dic else None

In [365]:
def build_tree(data):
    """Recursively grow a decision tree from ``data`` and return its root node.

    The best split is requested from ``find_best_split``. When its gain is 0
    the rows are wrapped in a ``leaf_node``; otherwise the rows are partitioned
    by the chosen question and a subtree is built for each side.
    """
    gain, question = find_best_split(data)

    # Base case: no split improves on the current node, so stop asking.
    if gain == 0:
        return leaf_node(data)

    passed_rows, failed_rows = partition(data, question)

    # The true-side subtree is built first, then the false-side subtree.
    return internal_node(question, build_tree(passed_rows), build_tree(failed_rows))

In [366]:
# The tree code reads each row's class label from the row's LAST column and
# treats every other column as a feature, so the labels have to be appended to
# the pixel features before training. Plain nested lists are used so that
# row[k] is a single scalar value.
train_rows = np.column_stack((np.asarray(train_vector), train_label)).tolist()

# root is a reference to the root of the decision tree
root = build_tree(train_rows)

In [367]:
def guess_label(row_data, node):
    """Walk the tree from ``node`` down to a leaf and return its predictions.

    Args:
        row_data: the row to classify; it is passed to each question's
            ``check`` method.
        node: the ``internal_node`` or ``leaf_node`` to start from.

    Returns:
        The ``predictions`` attribute of the leaf that ``row_data`` reaches.
    """
    # Base case: we reached a leaf.
    if isinstance(node, leaf_node):
        return node.predictions

    # Follow the branch selected by this node's question. The recursion must
    # call guess_label itself (the name 'classify' is not defined anywhere).
    if node.question.check(row_data):
        return guess_label(row_data, node.true_branch)
    return guess_label(row_data, node.false_branch)



In [375]:
from sklearn.metrics import accuracy_score
# Classify EVERY test row. Previously the loop was commented out, so a single
# prediction was made using whatever value `i` still held from an earlier cell.
predicted_label = []
# np.asarray makes each row a 1-D array, so a question can index one feature.
for row in np.asarray(test_vector):
    predicted_label.append(guess_label(row, root))
print(test_label)
print(predicted_label)


In [377]:
#random_forest:
from sklearn.ensemble import RandomForestClassifier
lists = [10,30,100,300]        # forest sizes (n_estimators) to try
list2= ['entropy','gini']      # split criteria to try
# scikit-learn expects plain arrays; recent releases reject np.matrix input.
X_train = np.asarray(train_vector)
X_test = np.asarray(test_vector)
for n in lists:
    for method in list2:
        # A fixed random_state makes the sweep reproducible between runs.
        clf = RandomForestClassifier(n_estimators=n, criterion=method, random_state=0)
        clf.fit(X_train, train_label)
        output_labels = clf.predict(X_test)
        # accuracy_score takes (y_true, y_pred).
        print(accuracy_score(test_label, output_labels))
# In the run recorded with this cell (made before random_state was fixed),
# 'entropy' scored higher than 'gini' at every forest size; the best accuracy
# (0.965) was reached with 100 and with 300 trees using 'entropy'.

0.765
0.725
0.92
0.845
0.965
0.915
0.965
0.925
