In [1]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score, confusion_matrix

# Import NLTK libraries for natural language processing
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

import string
import operator
from copy import deepcopy
from math import log2
from statistics import mean
from collections import Counter

# Download NLTK resources
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to C:\Users\Rohit
[nltk_data]     Ranjan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Rohit
[nltk_data]     Ranjan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Rohit Ranjan\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Define a function to extract n-grams from text data
def get_ngrams(data, n):
    tokens = [token for token in data.split(" ") if token != ""]
    return list(ngrams(tokens, n))

# Define a function to extract part-of-speech tags from text data
def get_postag(txt):
    # Define a set of English stopwords
    stop_words = set(stopwords.words('english'))

    # Tokenize the input text into sentences
    tokenized = sent_tokenize(txt)

    # Tokenize the first sentence into words and filter out stopwords
    words_list = nltk.word_tokenize(tokenized[0]) 
    words_list = [w for w in words_list if not w in stop_words]  

    # Get part-of-speech tags for the filtered words
    return nltk.pos_tag(words_list)

# Define a function to build data from a file
def build_data(path_to_data):
    data,uni,bi,tri,pos = [],[],[],[],[]
    file = open(path_to_data)

    for line in file:
        line = line.split(':')
        row = []
        
        # Extract the class and question from the input line
        _class, _question = line[0], line[1]
        row.append(_class)
        row.append(' '.join(_question.split(' ')[1:]).translate(str.maketrans('', '', string.punctuation)).rstrip())

        # Calculate the length of the question
        length = len(row[1].split(' '))
        row.append(length)

        # Extract unigrams, bigrams, and trigrams from the question
        unigram = get_ngrams(row[1], 1)
        row.append(unigram)
        uni.extend(unigram)

        bigram = get_ngrams(row[1], 2)
        row.append(bigram)
        bi.extend(bigram)
        
        trigram = get_ngrams(row[1], 3)
        row.append(trigram)
        tri.extend(trigram)

        # Extract part-of-speech tags from the question
        postag = get_postag(row[1])
        row.append(postag)
        pos.extend(postag)
 
        data.append(row)

    return data, uni, bi, tri, pos

# Call the build_data function to load and process training data
data, uni, bi, tri, pos = build_data('./train_data.txt')

# Print some information about the loaded training data
print('Loading Training data...')
print('Training Data:')
print(data[0:5])

Loading Training data...
Training Data:
[['DESC', 'How did serfdom develop in and then leave Russia', 9, [('How',), ('did',), ('serfdom',), ('develop',), ('in',), ('and',), ('then',), ('leave',), ('Russia',)], [('How', 'did'), ('did', 'serfdom'), ('serfdom', 'develop'), ('develop', 'in'), ('in', 'and'), ('and', 'then'), ('then', 'leave'), ('leave', 'Russia')], [('How', 'did', 'serfdom'), ('did', 'serfdom', 'develop'), ('serfdom', 'develop', 'in'), ('develop', 'in', 'and'), ('in', 'and', 'then'), ('and', 'then', 'leave'), ('then', 'leave', 'Russia')], [('How', 'WRB'), ('serfdom', 'JJ'), ('develop', 'VB'), ('leave', 'JJ'), ('Russia', 'NNP')]], ['ENTY', 'What films featured the character Popeye Doyle', 7, [('What',), ('films',), ('featured',), ('the',), ('character',), ('Popeye',), ('Doyle',)], [('What', 'films'), ('films', 'featured'), ('featured', 'the'), ('the', 'character'), ('character', 'Popeye'), ('Popeye', 'Doyle')], [('What', 'films', 'featured'), ('films', 'featured', 'the'), ('

In [3]:
# Define a function to find the top n-grams from a list of n-grams
def top_grams(grams, top_n):
    return Counter(grams).most_common(top_n)

# Calculate the top 500 unigrams, 300 bigrams, 200 trigrams, and 500 POS tags
unigram_counts = top_grams(uni, 500)
bigram_counts = top_grams(bi, 300)
trigram_counts = top_grams(tri, 200)
pos_counts = top_grams(pos, 500)

# Calculate the average length of questions in the dataset
avg_length = mean([row[2] for row in data])
print('average length:',avg_length)

# Display the top features
print('Top features:\n')
print('Unigrams:\n')
print(unigram_counts[0:5])

print('Bigrams:\n')
print(bigram_counts[0:5])

print('Trigrams:\n')
print(trigram_counts[0:5])

print('Pos Counts:\n')
print(pos_counts[0:5])

average length: 9.031548055759353
Top features:

Unigrams:

[(('the',), 3589), (('What',), 3245), (('is',), 1669), (('of',), 1540), (('in',), 1131)]
Bigrams:

[(('What', 'is'), 968), (('is', 'the'), 757), (('of', 'the'), 446), (('in', 'the'), 326), (('How', 'many'), 316)]
Trigrams:

[(('What', 'is', 'the'), 551), (('What', 'is', 'a'), 151), (('What', 's', 'the'), 135), (('What', 'are', 'the'), 134), (('What', 'was', 'the'), 130)]
Pos Counts:

[(('What', 'WP'), 3245), (('How', 'WRB'), 763), (('Who', 'WP'), 559), (('many', 'JJ'), 332), (('Where', 'WRB'), 273)]


In [4]:
# Define a function to check if a value is numeric (int or float)
def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float)

# Define a list of column headers
header = ['Label', 'Text', 'Length', 'Unigram', 'Bigram', 'Trigram', 'POS']

# Define a class to represent a question
class Question:
    def __init__(self, col, value):
        # The column number in the header
        self.col = col 

        # Actual value of the object
        self.value = value 

    # Method to check if the attribute of the current question matches a given example
    def match(self, example):
        val = example[self.col]
        if is_numeric(val):
            return val <= self.value
        
        return self.value in val

    # Method to return the string representation of the object
    def __repr__(self):
        condition = "contains"
        return "Does %s %s %s?" % (
            header[self.col], condition, str(self.value))

In [5]:
# Function to calculate class counts in a set of rows
def class_counts(rows):
    counts = {}
    for row in rows:
        # Assuming the label is in the first column
        label = row[0]

        if label not in counts:
            counts[label] = 0
        
        counts[label] += 1
    return counts

# Function to calculate Gini impurity of a set of rows
def gini(rows):
    counts = class_counts(rows)
    impurity = 1
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        impurity -= prob_of_lbl**2
    return impurity

# Function to calculate misclassification error of a set of rows
def misclassifcation_error(rows):
    counts = class_counts(rows)
    max_prob = 0
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        if prob_of_lbl > max_prob:
            max_prob = prob_of_lbl
    return 1 - max_prob

# Function to calculate entropy of a set of rows
def entropy(rows):
    counts = class_counts(rows)
    impurity = 0
    for lbl in counts:
        prob_of_lbl = counts[lbl] / float(len(rows))
        impurity -= prob_of_lbl*log2(prob_of_lbl)
    return impurity

# Function to calculate information gain based on a given impurity function
def info_gain(left, right, current_uncertainty, func):
    p = float(len(left))/(len(left)+len(right))
    return current_uncertainty - p*func(left) - (1-p)*func(right)

In [6]:
# Define a class to represent a leaf node in a decision tree
class Leaf:
    def __init__(self, rows):
        # Store predictions for this leaf node, which are the class counts for the rows it represents
        self.predictions = class_counts(rows)

# Define a class to represent a decision node in a decision tree
class Decision_Node:
    def __init__(self, question, true_branch, false_branch):
        # Store the question used to split the data at this node
        self.question = question

        # Store the true branch, which is the subtree for examples that satisfy the question
        self.true_branch = true_branch

        # Store the false branch, which is the subtree for examples that do not satisfy the question
        self.false_branch = false_branch

In [7]:
# Initialize an empty list to store questions
questions = []

# Create questions based on the top unigrams
for x in unigram_counts:
    questions.append(Question(3, x[0]))

# Create questions based on the top bigrams
for x in bigram_counts:
    questions.append(Question(4, x[0]))

# Create questions based on the top trigrams    
for x in trigram_counts:
    questions.append(Question(5, x[0]))

# Create questions based on the top POS tags
for x in pos_counts:
    questions.append(Question(6, x[0]))

# Create a question based on the average length of questions    
questions.append(Question(2, avg_length))    

# Print the total number of questions and the first five questions    
print(len(questions))
print(questions[0:5])

1501
[Does Unigram contains ('the',)?, Does Unigram contains ('What',)?, Does Unigram contains ('is',)?, Does Unigram contains ('of',)?, Does Unigram contains ('in',)?]


In [8]:
# Function to partition a set of rows into two subsets based on a given question
def partition(rows, question):
    rows_true = []
    rows_false = []
    
    for r in rows:
        if question.match(r):
            rows_true.append(r)
        else:
            rows_false.append(r)
    
    return rows_true, rows_false

In [9]:
# Function to find the best split (question) among a list of questions for a set of rows
def find_best_split(rows, questions, func):   
    best_gain = 0
    best_question = None
    current_uncertainty = func(rows)
    
    for q in questions:
        # Split the rows based on the current question
        rows_true, rows_false = partition(rows, q)
        
        if len(rows_true) == 0 or len(rows_false) == 0:
            continue
        
        # Calculate the information gain for the current split
        gain = info_gain(rows_true, rows_false, current_uncertainty, func) 
        
        # Update the best gain and best question if the current gain is higher
        if gain >= best_gain:
            best_gain, best_question = gain, q
    
    return best_gain, best_question   

In [10]:
# Recursive function to form a decision tree using partitioning and a list of questions
def form_tree(rows, questions, func):
    # Find the best gain and best question to split the current set of rows
    gain, question = find_best_split(rows, questions, func)

    # If there's no more information gain (gain is 0), create a leaf node with predictions
    if gain == 0:
        return Leaf(rows)
    
    # Split the current set of rows based on the best question
    rows_true, rows_false = partition(rows, question)

    # Remove the used question from the list of available questions
    questions.remove(question)
    
    # Recursively build the true and false branches of the decision tree
    true_branch = form_tree(rows_true, questions, func)
    false_branch = form_tree(rows_false, questions, func)
    
    # Create a decision node with the best question and its branches
    return Decision_Node(question, true_branch, false_branch)

In [11]:
# Function to classify a single row using a decision tree node
def classify_row(node, row):
    if isinstance(node, Leaf):
        # Return the predictions stored in the leaf node
        return node.predictions
    
    # Check if the row satisfies the question at the current node
    if node.question.match(row):
        # Recursively classify the row in the true branch
        return classify_row(node.true_branch, row)
    else:
        # Recursively classify the row in the false branch
        return classify_row(node.false_branch, row)

In [12]:
# Function to train a decision tree using the provided data, list of questions, and impurity function
def train(data, questions, func):
    # Create and return the root node of the decision tree by calling the form_tree function
    return form_tree(data, deepcopy(questions), func)

# Function to classify a set of rows using a trained decision tree
def classify(root, rows):
    # Classify each row using the decision tree and return a list of predictions
    predictions = [max(classify_row(root, r).items(), key=operator.itemgetter(1))[0] for r in rows]
    
    return predictions

In [13]:
# Function to retrieve data entries at specified indices from a data list
def get_data_in_index(data, index):
    l = []
    for i in range(len(data)):
        if i in index:
            l.append(data[i])
    return l

# Function to extract actual labels from a list of data entries
def get_actual_labels(act_data):
    act_labels = []
    
    for d in act_data:
        # Assuming the label is in the first column
        act_labels.append(d[0])
    
    return act_labels

In [14]:
# Create a 10-fold cross-validation splitter
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Initialize lists to store precision, recall, and F1 score for each fold
precision,recall,f_score = [],[],[]

# Loop through the folds
for trainInd, testInd in kfold.split(data):
    # Get training data for the current fold
    train_data = get_data_in_index(data, trainInd)

    # Get test data for the current fold
    test_data = get_data_in_index(data, testInd)
    
    # Train a decision tree on the training data using the Gini index as the impurity function
    root = train(train_data, questions, gini)

    # Classify the test data using the trained decision tree
    prediction = classify(root, test_data)

    # Extract actual labels from the test data
    actual = get_actual_labels(test_data)

    predicted = prediction
    
    # Calculate precision, recall, and F1 score for the current fold and store them in the respective lists
    precision.append(precision_score(actual, predicted, average='macro'))
    recall.append(recall_score(actual, predicted, average='macro'))
    f_score.append(f1_score(actual, predicted, average='macro'))
     
    print("Training...")

# Print the average precision, recall, and F1 score across all folds
print('\nGini Index')
print("Precision Score = "+str(mean(precision)))
print("Recall Score = "+str(mean(recall)))
print("F Score = "+str(mean(f_score)))

Training...
Training...
Training...
Training...
Training...
Training...
Training...
Training...
Training...
Training...

Gini Index
Precision Score = 0.8040501395611818
Recall Score = 0.7487842295806157
F Score = 0.767009336109572


In [16]:
# Define a list of classes
classes = ['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM']

# Function to generate a classification report and other statistics based on the provided flags
def getReport(train_data, test_data, uniFlag=True, biFlag=True, triFlag=True, posFlag=True, lenFlag=True, func=gini):
    allQuestions = []
    
    # Generate questions based on the specified flags
    if uniFlag:
        for x in unigram_counts:
            allQuestions.append(Question(3, x[0]))

    if biFlag:
        for x in bigram_counts:
            allQuestions.append(Question(4, x[0]))

    if triFlag:
        for x in trigram_counts:
            allQuestions.append(Question(5, x[0]))

    if posFlag:
        for x in pos_counts:
            allQuestions.append(Question(6, x[0]))

    if lenFlag:
        allQuestions.append(Question(2, avg_length))    

    print("No of questions = " + str(len(allQuestions)))

    print("Training...")

    # Train a decision tree using specified data and questions
    root = train(train_data, allQuestions, func)
    
    print("Predicting...")

    # Classify the test data using the trained decision tree
    prediction = classify(root, test_data)        

    actual = get_actual_labels(test_data)
    
    print("Prediction done...")

    # Create a confusion matrix
    matrix = confusion_matrix(actual, prediction)

    # Generate a classification report
    class_report = classification_report(actual, prediction)

    acc = matrix.diagonal()/matrix.sum(axis=1)

    # Calculate and store accuracy for each class
    accuracy_report = dict(zip(classes, acc))
    
    return accuracy_report, class_report, root, prediction, actual

# Prepare test data
test_data = build_data('./test_data.txt')[0]
print(len(test_data))

500


In [17]:
# Call getReport to obtain the results
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data)

# Print the accuracy report and classification report
print(accuracy_report)
print(class_report)

No of questions = 1501
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.9710144927536232, 'ENTY': 0.723404255319149, 'HUM': 0.8461538461538461, 'LOC': 0.7037037037037037, 'NUM': 0.8141592920353983}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.76      0.97      0.85       138
        ENTY       0.68      0.72      0.70        94
         HUM       0.92      0.85      0.88        65
         LOC       0.89      0.70      0.79        81
         NUM       0.99      0.81      0.89       113

    accuracy                           0.82       500
   macro avg       0.85      0.79      0.81       500
weighted avg       0.84      0.82      0.82       500



In [18]:
# Call getReport with entropy as the impurity measure
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, func=entropy)

# Print the accuracy report and classification report
print(accuracy_report)
print(class_report)

No of questions = 1501
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.9710144927536232, 'ENTY': 0.5, 'HUM': 0.8615384615384616, 'LOC': 0.7283950617283951, 'NUM': 0.8053097345132744}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.66      0.97      0.79       138
        ENTY       0.69      0.50      0.58        94
         HUM       0.90      0.86      0.88        65
         LOC       0.88      0.73      0.80        81
         NUM       0.98      0.81      0.88       113

    accuracy                           0.79       500
   macro avg       0.83      0.76      0.78       500
weighted avg       0.81      0.79      0.78       500



In [19]:
# Call getReport with misclassifcation_error as the impurity measure
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, func=misclassifcation_error)

# Print the accuracy report and classification report
print(accuracy_report)
print(class_report)

No of questions = 1501
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.8260869565217391, 'ENTY': 0.7978723404255319, 'HUM': 0.8461538461538461, 'LOC': 0.691358024691358, 'NUM': 0.7876106194690266}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.77      0.83      0.79       138
        ENTY       0.57      0.80      0.66        94
         HUM       0.92      0.85      0.88        65
         LOC       0.92      0.69      0.79        81
         NUM       0.98      0.79      0.87       113

    accuracy                           0.79       500
   macro avg       0.83      0.77      0.79       500
weighted avg       0.82      0.79      0.80       500



In [20]:
# Call getReport without using the length feature
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, lenFlag=False)

# Print the accuracy report and classification report
print(accuracy_report)
print(class_report)

No of questions = 1500
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.9710144927536232, 'ENTY': 0.723404255319149, 'HUM': 0.8461538461538461, 'LOC': 0.7037037037037037, 'NUM': 0.8141592920353983}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.76      0.97      0.85       138
        ENTY       0.68      0.72      0.70        94
         HUM       0.92      0.85      0.88        65
         LOC       0.89      0.70      0.79        81
         NUM       0.99      0.81      0.89       113

    accuracy                           0.82       500
   macro avg       0.85      0.79      0.81       500
weighted avg       0.84      0.82      0.82       500



In [21]:
# Call getReport without using the length feature and with entropy as the impurity measure
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, lenFlag=False, func=entropy)

# Print the accuracy report and classification report
print(accuracy_report)
print(class_report)

No of questions = 1500
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.9710144927536232, 'ENTY': 0.5, 'HUM': 0.8615384615384616, 'LOC': 0.7283950617283951, 'NUM': 0.8053097345132744}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.66      0.97      0.79       138
        ENTY       0.69      0.50      0.58        94
         HUM       0.90      0.86      0.88        65
         LOC       0.88      0.73      0.80        81
         NUM       0.98      0.81      0.88       113

    accuracy                           0.79       500
   macro avg       0.83      0.76      0.78       500
weighted avg       0.81      0.79      0.78       500



In [22]:
# Call getReport without using the length feature and with misclassification error as the impurity measure
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, lenFlag=False, func=misclassifcation_error)

# Print the accuracy report and classification report
print(accuracy_report)
print(class_report)

No of questions = 1500
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.8260869565217391, 'ENTY': 0.7978723404255319, 'HUM': 0.8461538461538461, 'LOC': 0.691358024691358, 'NUM': 0.7787610619469026}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.77      0.83      0.79       138
        ENTY       0.56      0.80      0.66        94
         HUM       0.92      0.85      0.88        65
         LOC       0.92      0.69      0.79        81
         NUM       0.98      0.78      0.87       113

    accuracy                           0.79       500
   macro avg       0.83      0.77      0.79       500
weighted avg       0.82      0.79      0.80       500



In [23]:
# Call getReport without using the length feature and POS feature
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, lenFlag=False, posFlag=False)

# Print the accuracy report and classification report
print(accuracy_report)
print(class_report)

No of questions = 1000
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.9782608695652174, 'ENTY': 0.6276595744680851, 'HUM': 0.8461538461538461, 'LOC': 0.654320987654321, 'NUM': 0.7699115044247787}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.73      0.98      0.84       138
        ENTY       0.60      0.63      0.61        94
         HUM       0.87      0.85      0.86        65
         LOC       0.88      0.65      0.75        81
         NUM       1.00      0.77      0.87       113

    accuracy                           0.79       500
   macro avg       0.82      0.76      0.78       500
weighted avg       0.81      0.79      0.79       500



In [24]:
# Call getReport without using the length feature and POS feature, and with entropy as the impurity measure
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, lenFlag=False, posFlag=False, func=entropy)

# Print the accuracy report and classification report
print(accuracy_report)
print(class_report)

No of questions = 1000
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.427536231884058, 'ENTY': 0.648936170212766, 'HUM': 0.8769230769230769, 'LOC': 0.6296296296296297, 'NUM': 0.7699115044247787}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.57      0.43      0.49       138
        ENTY       0.35      0.65      0.45        94
         HUM       0.93      0.88      0.90        65
         LOC       0.82      0.63      0.71        81
         NUM       0.97      0.77      0.86       113

    accuracy                           0.64       500
   macro avg       0.75      0.67      0.69       500
weighted avg       0.71      0.64      0.66       500



In [25]:
# Call getReport without using the length feature and POS feature, and with misclassification error as the impurity measure
accuracy_report, class_report, root, prediction, actual = getReport(train_data=data, test_data=test_data, lenFlag=False, posFlag=False, func=misclassifcation_error)

# Print the accuracy report and classification report
print(accuracy_report)
print(class_report)

No of questions = 1000
Training...
Predicting...
Prediction done...
{'ABBR': 0.6666666666666666, 'DESC': 0.8188405797101449, 'ENTY': 0.7340425531914894, 'HUM': 0.8, 'LOC': 0.654320987654321, 'NUM': 0.7876106194690266}
              precision    recall  f1-score   support

        ABBR       0.86      0.67      0.75         9
        DESC       0.75      0.82      0.78       138
        ENTY       0.50      0.73      0.60        94
         HUM       0.96      0.80      0.87        65
         LOC       0.88      0.65      0.75        81
         NUM       0.98      0.79      0.87       113

    accuracy                           0.76       500
   macro avg       0.82      0.74      0.77       500
weighted avg       0.81      0.76      0.77       500



In [26]:
# Function to collect data entries with wrong predictions
def get_wrong_prediction(prediction, actual, dataset):
    data_list = [dataset[i] for i in range(len(prediction)) if prediction[i] != actual[i]]
    return data_list

# Get accuracy, class matrix, and decision tree for Gini impurity
_ , class_matrix, root_gini, prediction_gini, actual_gini  = getReport(train_data=data, test_data=test_data)
wrong_data = get_wrong_prediction(prediction_gini, actual_gini, test_data)

# Print the number of wrong predictions for Gini impurity
print('Len of wrong data for gini', len(wrong_data))

# Get accuracy, class matrix, and decision tree for Entropy impurity using wrong predictions from Gini
_ , class_matrix, root_entropy, prediction_entropy, actual_entropy  = getReport(train_data=data, test_data=wrong_data, func=entropy)
wrong_data_en = get_wrong_prediction(prediction_entropy, actual_entropy, wrong_data)

# Print the number of wrong predictions for Entropy impurity
print('Len of wrong data for entropy is', len(wrong_data_en))

# Get accuracy, class matrix, and decision tree for Misclassification Error impurity using wrong predictions from Gini
_ , class_matrix, root_mis, prediction_mis, actual_mis  = getReport(train_data=data, test_data=wrong_data, func=misclassifcation_error)
wrong_data_mis = get_wrong_prediction(prediction_entropy, actual_entropy, wrong_data)

# Print the number of wrong predictions for Misclassification Error impurity
print('Len of wrong data for misclassifcation_error is', len(wrong_data_mis))

No of questions = 1501
Training...
Predicting...
Prediction done...
Len of wrong data for gini 88
No of questions = 1501
Training...
Predicting...
Prediction done...
Len of wrong data for entropy is 78
No of questions = 1501
Training...
Predicting...
Prediction done...
Len of wrong data for misclassifcation_error is 78


In [27]:
# Print the number of instances correctly classified by different impurity measures compared to GINI
print('Entropy correctly classifies', (len(wrong_data) - len(wrong_data_en)), 'as compared to GINI metric')
print('Misclassification error correctly classifies', (len(wrong_data) - len(wrong_data_mis)), 'as compared to GINI metric')

Entropy correctly classifies 10 as compared to GINI metric
Misclassification error correctly classifies 10 as compared to GINI metric
