# In [None]:
# Train a decision tree classifier using sklearn module with entropy as splitting criterion

import numpy as np 
import pandas as pd 
from sklearn.metrics import confusion_matrix 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.model_selection import train_test_split
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
from sklearn import tree
from IPython.display import Image
import pylab as pl
import seaborn as sns
import matplotlib.pyplot as plt  

# importing Dataset 
def importdata(file_path=None):
    """Load the dataset from a comma-separated file and print a short summary.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of the
            CSV file to load. When omitted, the module-level variable
            ``your_file_path`` is used, which the caller must define before
            calling this function.

    Returns:
        pandas.DataFrame: The loaded dataset.
    """
    if file_path is None:
        # Backward-compatible fallback: callers that invoke importdata()
        # without arguments still read from the global placeholder.
        file_path = your_file_path

    balance_data = pd.read_csv(file_path, sep=',')

    # Quick sanity summary of what was loaded.
    print("Dataset Length: ", len(balance_data))
    print("Dataset Shape: ", balance_data.shape)
    print("Dataset: ", balance_data.head())
    return balance_data

# Splitting the dataset into training and testing sets in a 70:30 ratio.
# The "X" set consists of the predictor variables; the "Y" set consists of the outcome variable.
# The ".values" attribute converts the DataFrame into a NumPy array.
def splitdataset(balance_data):
    """Separate predictors from the outcome and split them 70/30.

    Columns 2 through 9 of the underlying array are taken as the predictor
    matrix and column 0 as the outcome vector.

    Args:
        balance_data: DataFrame holding the outcome and predictor columns.

    Returns:
        tuple: ``(X, Y, X_train, X_test, y_train, y_test)``.
    """
    raw = balance_data.values
    X, Y = raw[:, 2:10], raw[:, 0]

    # A fixed random_state keeps the split reproducible between runs.
    partitions = train_test_split(X, Y, test_size=0.3, random_state=42)
    return (X, Y, *partitions)

# Decision Trees - perform training with entropy. 
def tarin_using_entropy(X_train, X_test, y_train):
    """Fit a decision tree classifier that uses entropy as split criterion.

    Args:
        X_train: Training predictor matrix.
        X_test: Accepted for signature compatibility; not used here.
        y_train: Training outcome vector.

    Returns:
        DecisionTreeClassifier: The fitted classifier.
    """
    tree_params = {
        "criterion": "entropy",
        "random_state": 1,
        "max_depth": 3,  # maximum depth of the tree
        "min_samples_leaf": 5,
    }
    model = DecisionTreeClassifier(**tree_params)
    model.fit(X_train, y_train)
    return model

# Function to make predictions 
def prediction(X_test, clf_object):
    """Predict labels for ``X_test`` and print them with their count.

    Args:
        X_test: Samples to classify.
        clf_object: Any object exposing a ``predict`` method.

    Returns:
        Whatever ``clf_object.predict(X_test)`` returns.
    """
    predicted = clf_object.predict(X_test)
    print("Predicted values:", len(predicted), predicted)
    return predicted
    
# Function to calculate accuracy 
def cal_accuracy(y_test, y_pred):
    """Print and plot evaluation metrics for a three-class prediction.

    Prints the confusion matrix, the accuracy as a percentage and the
    classification report, and draws the confusion matrix as an annotated
    heatmap on the current matplotlib figure.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        None
    """
    # Single source of truth for the class display names; used for both the
    # heatmap tick labels and the classification report.
    target_names = ['Rating 1', 'Rating 2', 'Rating 3']

    print("Confusion Matrix of the classifier: ")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    # Visually represent the confusion matrix.
    ax = plt.subplot()
    # fmt="d": the cells are integer counts; seaborn's default ".2g" format
    # would render counts of 100 or more in scientific notation (1.2e+02).
    sns.heatmap(cm, annot=True, fmt="d", ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(target_names)
    ax.yaxis.set_ticklabels(target_names)

    print("Accuracy : ", accuracy_score(y_test, y_pred) * 100)
    print("Report : ")
    print(classification_report(y_test, y_pred, target_names=target_names))
       
# Driver code 
def main():
    """Run the full pipeline: load, split, train, evaluate and export the tree."""
    # Building phase: load the data and carve out the train/test partitions.
    dataset = importdata()
    _, _, X_train, X_test, y_train, y_test = splitdataset(dataset)

    # Train the entropy-based classifier.
    model = tarin_using_entropy(X_train, X_test, y_train)

    # Evaluate on the held-out partition.
    print("Prediction Results:")
    predictions = prediction(X_test, model)
    cal_accuracy(y_test, predictions)

    # Names of the feature columns, in the order they appear in the feature file.
    feature_cols = [
        'MergeComment',
        'CommentLength',
        'NumberofInstance',
        'IncludeBugNumber',
        'KeywordsList1',
        'KeywordsList2',
        'KeywordsList3',
        'NumberofSpecialWords',
    ]

    # Visualize the fitted tree: export to DOT, then render it as a PDF.
    dot_source = tree.export_graphviz(
        model,
        out_file=None,
        feature_names=feature_cols,
        class_names=['1', '2', '3'],
    )
    pydotplus.graph_from_dot_data(dot_source).write_pdf('entropy-tree.pdf')
                             
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()