In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pprint 

In [12]:
dataset = pd.read_csv('iris.data', header = None, names = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class'])
dataset

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


### 1. Write the Python code to compute entropy and information gain

In [13]:
  
def entropy(target_col):  
    
    elements,counts = np.unique(target_col,return_counts = True)  
    entropy = np.sum([(-counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts)) for i in range(len(elements))])  
    return entropy  
  
def InfoGain(data,split_attribute_name,target_name="class"):  
         
    #Calculate the entropy of the total dataset  
    total_entropy = entropy(data[target_name])  
      
    ##Calculate the entropy of the dataset  
      
    #Calculate the values and the corresponding counts for the split attribute   
    vals,counts= np.unique(data[split_attribute_name],return_counts=True)  
      
    #Calculate the weighted entropy  
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name]) for i in range(len(vals))])  
      
    #Calculate the information gain  
    Information_Gain = total_entropy - Weighted_Entropy  
    return Information_Gain  
  
def ID3(data,originaldata,features,target_attribute_name="class",parent_node_class = None):  
  
    #Define the stopping criteria --> If one of this is satisfied, we want to return a leaf node#  
      
    #If all target_values have the same value, return this value  
    if len(np.unique(data[target_attribute_name])) <= 1:  
        return np.unique(data[target_attribute_name])[0]  
      
    #If the dataset is empty, return the mode target feature value in the original dataset  
    elif len(data)==0:  
        return np.unique(originaldata[target_attribute_name])[np.argmax(np.unique(originaldata[target_attribute_name],return_counts=True)[1])]  
      
    #If the feature space is empty, return the mode target feature value of the direct parent node --> Note that  
    #the direct parent node is that node which has called the current run of the ID3 algorithm and hence  
    #the mode target feature value is stored in the parent_node_class variable.  
      
    elif len(features) ==0:  
        return parent_node_class  
      
    #If none of the above holds true, grow the tree!  
      
    else:  
        #Set the default value for this node --> The mode target feature value of the current node  
        parent_node_class = np.unique(data[target_attribute_name])[np.argmax(np.unique(data[target_attribute_name],return_counts=True)[1])]  
          
        #Select the feature which best splits the dataset  
        item_values = [InfoGain(data,feature,target_attribute_name) for feature in features] #Return the information gain values for the features in the dataset  
        best_feature_index = np.argmax(item_values)  
        best_feature = features[best_feature_index]  
          
        #Create the tree structure. The root gets the name of the feature (best_feature) with the maximum information  
        #gain in the first run  
        tree = {best_feature:{}}  
          
          
        #Remove the feature with the best inforamtion gain from the feature space  
        features = [i for i in features if i != best_feature]  
          
        #Grow a branch under the root node for each possible value of the root node feature  
          
        for value in np.unique(data[best_feature]):  
            value = value  
            #Split the dataset along the value of the feature with the largest information gain and therwith create sub_datasets  
            sub_data = data.where(data[best_feature] == value).dropna()  
              
            #Call the ID3 algorithm for each of those sub_datasets with the new parameters --> Here the recursion comes in!  
            subtree = ID3(sub_data,dataset,features,target_attribute_name,parent_node_class)  
              
            #Add the sub tree, grown from the sub_dataset to the tree under the root node  
            tree[best_feature][value] = subtree  
              
        return(tree)      
                  
def predict(query,tree,default = 1):  
    
        for key in list(query.keys()):  
            if key in list(tree.keys()):  

                try:  
                    result = tree[key][query[key]]   
                except:  
                    return default  

                result = tree[key][query[key]]  

                if isinstance(result,dict):  
                    return predict(query,result)  
                else:  
                    return result  

  
def train_test_split(dataset):  
        training_data = dataset.iloc[:80].reset_index(drop=True)#We drop the index respectively relabel the index  
        #starting form 0, because we do not want to run into errors regarding the row labels / indexes  
        testing_data = dataset.iloc[80:].reset_index(drop=True)  
        return training_data,testing_data  
  
training_data = train_test_split(dataset)[0]  
testing_data = train_test_split(dataset)[1]   
  
def test(data,tree):  
        #Create new query instances by simply removing the target feature column from the original dataset and   
        #convert it to a dictionary  
        queries = data.iloc[:,:-1].to_dict(orient = "records")  

        #Create a empty DataFrame in whose columns the prediction of the tree are stored  
        predicted = pd.DataFrame(columns=["predicted"])   

        #Calculate the prediction accuracy  
        for i in range(len(data)):  
            predicted.loc[i,"predicted"] = predict(queries[i],tree,1.0)   
        print('The prediction accuracy is: ',(np.sum(predicted["predicted"] == data["class"])/len(data))*100,'%')  
      
tree = ID3(training_data,training_data,training_data.columns[:-1])  
pprint(tree)  
test(testing_data,tree) 

{'petal length': {1.0: 'Iris-setosa',
                  1.1: 'Iris-setosa',
                  1.2: 'Iris-setosa',
                  1.3: 'Iris-setosa',
                  1.4: 'Iris-setosa',
                  1.5: 'Iris-setosa',
                  1.6: 'Iris-setosa',
                  1.7: 'Iris-setosa',
                  1.9: 'Iris-setosa',
                  3.3: 'Iris-versicolor',
                  3.5: 'Iris-versicolor',
                  3.6: 'Iris-versicolor',
                  3.9: 'Iris-versicolor',
                  4.0: 'Iris-versicolor',
                  4.1: 'Iris-versicolor',
                  4.2: 'Iris-versicolor',
                  4.3: 'Iris-versicolor',
                  4.4: 'Iris-versicolor',
                  4.5: 'Iris-versicolor',
                  4.6: 'Iris-versicolor',
                  4.7: 'Iris-versicolor',
                  4.8: 'Iris-versicolor',
                  4.9: 'Iris-versicolor',
                  5.0: 'Iris-versicolor'}}
The prediction accuracy is:

### 2. Write the  Python code to demonstrate conditional probability

In [14]:
from numpy import random as rand

#random seed generetor
rand.seed(0)

#list of age
age_list=[20,30,40,50]

#dict of age for any customer and purchasing customer
cust= {age_list[0]:0, age_list[1]:0, age_list[2]:0, age_list[3]:0}
purch= {age_list[0]:0, age_list[1]:0, age_list[2]:0, age_list[3]:0}

total_purch=0

#number of potential customers
n=100000

#counter for for loop
counter = 0

#sample data generetor
for _ in range(n):
    
    #picks random age from list
    age=rand.choice(age_list)
    #adds one customer to the picked age group
    cust[age] += 1
    
    #custom sample of the relation between customer age and purchase probability 
    prob_purch= float(age) / 100.0
    if (rand.random() <+ prob_purch):
        purch[age] += 1
        total_purch += 1
        
#print result
print("Customers               : " + str(cust))
print("Customers that pucheased: " + str(purch))


#probability of someone purchasing
PE= float(total_purch) / n
print("\nProbability of purchase (PE) is " + str(PE), "\n")

#conditional probability calculator
for _ in range(4):
    
    #picks age from ahe list in order
    age=age_list[counter]
    counter += 1
    
    #probability of any an age from potential customers 
    PF=float(cust[age]) / n
    print("\nPF for age " + str(age) + " is " + str(PF))
    
    #probability of being an specific age group and puchasing
    PFE= float(purch[age]) / float(cust[age])
    print("P(F|E) Probability of purchasing at age " + str(age) + " is " + str(PFE))
    
    #probability of a specific age group perchasing out of all potential customers
    PEF= PF * PFE
    print("P(E,F) at age: " + str(age), " is " + str(PEF))

Customers               : {20: 24972, 30: 24912, 40: 25082, 50: 25034}
Customers that pucheased: {20: 4928, 30: 7516, 40: 10000, 50: 12471}

Probability of purchase (PE) is 0.34915 


PF for age 20 is 0.24972
P(F|E) Probability of purchasing at age 20 is 0.19734102194457792
P(E,F) at age: 20  is 0.04928

PF for age 30 is 0.24912
P(F|E) Probability of purchasing at age 30 is 0.3017019910083494
P(E,F) at age: 30  is 0.07516

PF for age 40 is 0.25082
P(F|E) Probability of purchasing at age 40 is 0.3986922892911251
P(E,F) at age: 40  is 0.09999999999999999

PF for age 50 is 0.25034
P(F|E) Probability of purchasing at age 50 is 0.49816249900135817
P(E,F) at age: 50  is 0.12471


### 3. Write the  Python code to compute Euclidean Distance between data points

In [15]:
import math
# Example points in 3-dimensional space...
x = (5, 6, 7)
y = (8, 9, 9)
distance = math.sqrt(sum([(a - b) ** 2 for a, b in zip(x, y)]))
print("Euclidean distance from x to y: ",distance)

Euclidean distance from x to y:  4.69041575982343


### 4. Write the  Python code to calculate covariance matrix, Eigen values and Eigen vectors

In [19]:
# importing numpy library 
import numpy as np 
  
# create numpy 2d-array 
m = np.array([[1, 2, 3], 
              [2, 3, 4], 
              [4, 5, 6]]) 
  
print("Printing the Original square array:\n", 
      m) 
print()
# finding the covariance matrix
c = np.cov(m)

# printing covariance matrix 
print("Printing the covariance matrix of the given square array:\n", 
      c) 
print()  
# finding eigenvalues and eigenvectors 
w, v = np.linalg.eig(m) 
  
# printing eigen values 
print("Printing the Eigen values of the given square array:\n", 
      w) 
print()  
# printing eigen vectors 
print("Printing eigenvectors of the given square array:\n", 
      v) 

Printing the Original square array:
 [[1 2 3]
 [2 3 4]
 [4 5 6]]

Printing the covariance matrix of the given square array:
 [[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]

Printing the Eigen values of the given square array:
 [ 1.08309519e+01 -8.30951895e-01  1.01486082e-16]

Printing eigenvectors of the given square array:
 [[ 0.34416959  0.72770285  0.40824829]
 [ 0.49532111  0.27580256 -0.81649658]
 [ 0.79762415 -0.62799801  0.40824829]]


### 5. Write the  Python code to calculate the following
####     Accuracy
####     Misclassification 
####     Type-1 and Type-2 error rates
####     Sensitivity
####     Specificity

In [20]:
from sklearn.metrics import confusion_matrix
 
expected = [1, 1, 0, 1, 0, 0, 1, 0, 0, 0]
predicted = [1, 0, 0, 1, 0, 0, 1, 1, 1, 0]
results = confusion_matrix(expected, predicted)
print(results)

[[4 2]
 [1 3]]


In [21]:
TP = 4
TN = 3
FP = 2
FN = 1

In [22]:
print('Accuracy: ', ((TP + TN) / (TP + TN + FP + FN)))

Accuracy:  0.7


In [23]:
print('Misclassification: ', ((FP + FN) /(TP + TN + FP + FN)))

Misclassification:  0.3


In [24]:
print('Sensitivity: ', (TP / (TP + FN)))

Sensitivity:  0.8


In [25]:
print('Specificity: ', (TN / (TN + FP)))

Specificity:  0.6


In [26]:
print('Type-1 error rate: ', (FP / (FP + TN)))
print('Type-2 error rate: ', (FN / (FN + TP)))

Type-1 error rate:  0.4
Type-2 error rate:  0.2
