In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [2]:
# Load the iris dataset, keeping the feature columns from index 2 onward
# as inputs and the dataset's target array as labels.
iris = load_iris()
X, y = iris.data[:, 2:], iris.target

In [3]:
# Fit a depth-2 decision tree on the iris features selected above.
# random_state is pinned because scikit-learn permutes the features at
# every split, so ties between equally good splits could otherwise be
# broken differently from one run to the next.
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X, y)

In [4]:
from sklearn.tree import export_graphviz

# Export the fitted tree as a Graphviz .dot file.
# The call used to be misspelled `export_graphiz`, which raised NameError,
# and it relied on an `image_path` helper that this notebook never defines;
# the file is now written to a plain relative path in the working directory.
export_graphviz(
    tree_clf,
    out_file="iris_tree.dot",
    feature_names=iris.feature_names[2:],  # same columns the tree was fitted on
    class_names=iris.target_names,
    rounded=True,
    filled=True,
)

In [None]:
from typing import List 
import math 
 
def entropy(class_probabilities: List[float]) -> float:
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        class_probabilities: One probability per class. Entries that are
            not strictly positive are skipped, so zero-probability classes
            contribute nothing to the result.

    Returns:
        The sum of ``-p * log2(p)`` over the positive entries; ``0`` when
        there are none (for example an empty list).
    """
    # math.log2 is the dedicated base-2 logarithm and is more accurate than
    # the two-step math.log(p, 2), which divides two rounded natural logs.
    return sum(-p * math.log2(p)
               for p in class_probabilities
               if p > 0)
 
# Sanity checks for entropy(); every line printed should read True.
for check in (
    entropy([1.0]) == 0,
    entropy([0.5, 0.5]) == 1,
    0.81 < entropy([0.25, 0.75]) < 0.82,
):
    print(check)



In [None]:
from typing import Any 
from collections import Counter 
def class_probabilities(labels: List[Any]) -> List[float]:
    """Return the relative frequency of each distinct label.

    The fractions are listed in the order in which each label first
    appears in ``labels``; an empty input yields an empty list.
    """
    n_labels = len(labels)
    tallies = Counter(labels)
    fractions = []
    for tally in tallies.values():
        fractions.append(tally / n_labels)
    return fractions
def data_entropy(labels: List[Any]) -> float:
    """Return the entropy of the class distribution observed in ``labels``."""
    probabilities = class_probabilities(labels)
    return entropy(probabilities)

# Sanity checks for data_entropy(); every line printed should read True.
for check in (
    data_entropy(['a']) == 0,
    data_entropy([True, False]) == 1,
    data_entropy([3, 4, 4, 4]) == entropy([0.25, 0.75]),
):
    print(check)

In [None]:
def partition_entropy(subsets: List[List[Any]]) -> float:
    """Return the size-weighted entropy of a partition of labels.

    Each subset contributes its own entropy multiplied by the share of all
    labels that fall into that subset.
    """
    total_count = sum(map(len, subsets))
    weighted_terms = [
        data_entropy(labels) * len(labels) / total_count
        for labels in subsets
    ]
    # Built-in sum is kept so the floating-point accumulation is unchanged.
    return sum(weighted_terms)

In [None]:
from typing import NamedTuple, Optional, Dict, TypeVar, DefaultDict

In [None]:
class Candidate(NamedTuple): 
    """One record of the toy dataset used by the entropy examples.

    The last field, ``did_well``, defaults to None, so a Candidate can be
    constructed from the first four fields alone.
    """
    level: str                        # 'Senior', 'Mid' or 'Junior' in the sample data
    lang: str                         # 'Java', 'Python' or 'R' in the sample data
    tweets: bool 
    phd: bool 
    did_well: Optional[bool] = None   # label attribute; None when not supplied

In [None]:
# Labelled sample data: 14 Candidate records.
# Positional fields are: level, lang, tweets, phd, did_well.
inputs = [Candidate('Senior', 'Java',   False, False, False), 
          Candidate('Senior', 'Java',   False, True,  False), 
          Candidate('Mid',    'Python', False, False, True), 
          Candidate('Junior', 'Python', False, False, True), 
          Candidate('Junior', 'R',      True,  False, True), 
          Candidate('Junior', 'R',      True,  True,  False), 
          Candidate('Mid',    'R',      True,  True,  True), 
          Candidate('Senior', 'Python', False, False, False), 
          Candidate('Senior', 'R',      True,  False, True), 
          Candidate('Junior', 'Python', True,  False, True), 
          Candidate('Senior', 'Python', True,  True,  True), 
          Candidate('Mid',    'Python', False, True,  True), 
          Candidate('Mid',    'Java',   True,  False, True), 
          Candidate('Junior', 'Python', False, True,  False) 
         ]

In [None]:
T = TypeVar('T')

def partition_by(inputs: List[T], attribute: str) -> Dict[Any, List[T]]:
    """Group ``inputs`` by the value of one of their attributes.

    Args:
        inputs: Objects that all expose ``attribute``.
        attribute: Name of the attribute to group on; read via ``getattr``.

    Returns:
        A ``defaultdict(list)`` mapping each distinct attribute value to the
        inputs carrying that value. Keys appear in order of first occurrence
        and each list keeps the original input order.

    Raises:
        AttributeError: If an input has no attribute named ``attribute``.
    """
    # Imported locally so the function does not depend on another cell's
    # imports; typing.DefaultDict is an annotation alias and should not be
    # used as a constructor.
    from collections import defaultdict

    partitions: Dict[Any, List[T]] = defaultdict(list)
    for item in inputs:  # `item`, not `input`, to avoid shadowing the builtin
        partitions[getattr(item, attribute)].append(item)
    return partitions

In [None]:
def partition_entropy_by(inputs: List[Any], attribute: str, label_attribute: str) -> float:
    """Return the entropy of the split made by grouping ``inputs`` on ``attribute``.

    The inputs are grouped with ``partition_by``; each group is then reduced
    to the list of its members' ``label_attribute`` values, and those label
    lists are scored with ``partition_entropy``.
    """
    groups = partition_by(inputs, attribute)

    label_groups = []
    for members in groups.values():
        # partition_entropy only needs the labels, not the full records
        label_groups.append([getattr(member, label_attribute) for member in members])
    return partition_entropy(label_groups)

In [None]:
# Report the partition entropy obtained by splitting on each attribute.
for key in ('level', 'lang', 'tweets', 'phd'):
    split_entropy = partition_entropy_by(inputs, key, 'did_well')
    print(key, split_entropy)

# Range checks on the values reported above; each line should read True.
for lower, name, upper in [
    (0.69, 'level', 0.70),
    (0.86, 'lang', 0.87),
    (0.78, 'tweets', 0.79),
    (0.89, 'phd', 0.90),
]:
    print(lower < partition_entropy_by(inputs, name, 'did_well') < upper)

In [None]:
# Read DTS.csv. header=0 combined with names= tells pandas to treat the
# file's first row as a header and replace it with the six names below.
balance_data = pd.read_csv(
    'DTS.csv',
    sep=',',
    header=0,
    names=[
        'Initial Payments',
        'Last Payment',
        'Credit Score',
        'House Number',
        'Sum',
        'Result',
    ],
)

In [None]:
# Show the loaded frame as this cell's output.
balance_data

In [None]:
# Drop the 'Sum' column before modelling.
# errors='ignore' keeps the cell idempotent: re-running it after 'Sum' has
# already been removed no longer raises KeyError.
balance_data = balance_data.drop(columns='Sum', errors='ignore')

# Shape after the drop. It is placed last so the notebook actually displays
# it; as a non-final bare expression it was evaluated and discarded.
balance_data.shape

In [None]:
# Preview the first rows of the frame after the 'Sum' column was dropped.
balance_data.head()

In [None]:
# Split the frame by position: the first four columns are the model inputs
# and the fifth column is the target.
X, y = balance_data.values[:, :4], balance_data.values[:, 4]

# Hold out 30% of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=100, test_size=0.3
)

# Entropy-criterion tree, limited to depth 3 with at least 5 samples per leaf.
clf_entropy = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    min_samples_leaf=5,
    random_state=100,
)
clf_entropy.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out rows and display them as the cell output.
y_pred_en = clf_entropy.predict(X_test)
y_pred_en

In [None]:
# Accuracy of the entropy tree on the held-out rows, as a percentage.
print(
    'level of accuracy: ',
    100 * accuracy_score(y_test, y_pred_en),
)

In [None]:
# Show the first four columns as a raw array (the same slice used for X).
balance_data.values[:, 0:4]

In [None]:
# Show the fourth column (index 3) as a raw array.
balance_data.values[:, 3]

In [None]:
# NOTE(review): prints the literal 90 and nothing else; looks like a
# leftover scratch cell — confirm whether it can be removed.
print(90)