## Data Preprocessing

In [170]:
import pandas as pd
import numpy as np
import pprint
import random

In [171]:
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [172]:
df["SkinThickness"].unique()

array([35, 29,  0, 23, 32, 45, 19, 47, 38, 30, 41, 33, 26, 15, 36, 11, 31,
       37, 42, 25, 18, 24, 39, 27, 21, 34, 10, 60, 13, 20, 22, 28, 54, 40,
       51, 56, 14, 17, 50, 44, 12, 46, 16,  7, 52, 43, 48,  8, 49, 63, 99],
      dtype=int64)

### Checking if the column contains missing values

In [173]:
df["Glucose"].isnull().values.any()

False

In [174]:
def fill_nan_values(column):
    
    mean = df[column].mean()
    
    df[column].fillna(value =mean , inplace = True )

In [175]:
for column in df.columns[:-1]:
    
    check = df[column].isnull().values.any()
   
    if check == True:
        
        fill_nan_values(column)  # function to fill nan values
        
        print("NAN values has been filled")

In [176]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Helper Functions

In [177]:
# because the labels are in float
df["Outcome"] = df["Outcome"].astype("int")

In [178]:
def train_test_split(df,train_per):
    
    train_df = df.iloc[0:int(train_per*len(df)),:]
    
    test_df = df.iloc[int(train_per*len(df)):,:]
    
    return train_df, test_df

In [179]:
random.seed(0)
train_df,test_df = train_test_split(df,0.90)

In [180]:
print("shapes----> training data: ",train_df.shape,"testing data: ",test_df.shape)

shapes----> training data:  (691, 9) testing data:  (77, 9)


Note :- the data we are passing into the functions is an array 

In [181]:
def determine_feature_type(data):
    
    feature_type={}
     
    for column in data.columns[:-1]:
        
        if type(data[column]) == str or len(np.unique(data[column]))>12:
                
            feature_type[column]="continuous"
                
        else:
                
            feature_type[column]="categorical"
                
    return feature_type

In [182]:
# checks the purity of data if the data contains only one type of label then it returns True else False

def check_purity(data):

    labels = data[:,-1]  # labels
    
    labels_unique = np.unique(labels)
    
    if len(labels_unique) == 1:
        
        return True
    
    else:
        
        return False

In [183]:
# this function is used to assign a label if the data is pure means it conatins only one label then classify will 
# return that label else it will return the label which is present in maximum amount
# Returns the Class/label/outcome

def Classify(data):
    
    labels = data[:,-1]
    
    labels_unique , count  = np.unique(labels , return_counts=True)
   
    max_freq_index = np.argmax(count)   # gives the index where the classes are maximum
        
    class_val = labels_unique[max_freq_index]  # so extracting in labels the exact class
    
    return class_val

In [184]:
# This function is used to find the possible potential splits 
# returns a dict where keys represent col index and the key_value represent the possible unique values of that column

def get_potential_splits(data):
    
    potential_splits = {}
    
    for column in range(data.shape[1]-1):
        
           # making empty list as key values so that we can append later on
        
        unique_col_values = np.unique(data[:,column])
        feature_type_key = labels[column]
        type_of_feature = feature_type[feature_type_key]
        #print(column)
        if type_of_feature == "continuous":
            potential_splits[column]=[]
            for val in range(len(unique_col_values)):  # this loop is used to take the sum of unq values and take thier mean so that its easier to split them

                if val !=0:

                    previous_val = unique_col_values[val-1]

                    current_val = unique_col_values[val]

                    potential_splits[column].append((current_val+previous_val)/2)
                
        else:
            potential_splits[column]=unique_col_values
    return potential_splits

In [185]:
# after finding the potential splits and calculate the best split values we split the data into 2 parts data above and below

def split_data(data,split_col,split_value):
    
    split_column_values = data[:, split_col]   # columns which is taken in account
    
    feature_type_key = labels[split_col]
    type_of_feature = feature_type[feature_type_key]
    
    
    if type_of_feature == "continuous":
    
        data_below = data[split_column_values <= split_value]  # data which has values greater than split value
    
        data_above = data[split_column_values > split_value]  ## data which has values less than split value
    
    else:
        data_below = data[split_column_values == split_value]  # data which has values greater than split value
    
        data_above = data[split_column_values != split_value]
    return data_above,data_below  # returns the data 


now to pass split col and split value first we need to find the best split parameter and to find best split parameter we need to calculate overall entropy andfor overall entropy we need individual node entropy so basicaly<br>
first we need to write entropy function
then pass entropy to calculate overall entropy and then pass overall entropy to determine best feature.

### Entropy

$$ E =  \sum_{K=0}^{1} p^{i}(-log_{e}p^{i}) $$
$$\text{Overall Entropy} = \sum_{node=1}^{2} P E$$

In [186]:
def calculate_entropy(data):  # the data will be the splitted one either data_below or data_above
    
    labels = data[:,-1]   # labels of splitted data or labels of splitted node , first cal for node then overall
    
    _, counts = np.unique(labels,return_counts=True)
    
    probabilities = counts/counts.sum()
    
    entropy = sum(probabilities*(-np.log2(probabilities)) ) # array broadcasting will do the work of summation
    
    return entropy
    

In [187]:
def calculate_overall_entropy(data_above,data_below):
    
    entropy_data_above =  calculate_entropy(data_above)
    
    entropy_data_below =  calculate_entropy(data_below)
    
    n=len(data_above)+len(data_below)
    
    prob_above = len(data_above)/n
    
    prob_below = len(data_below)/n
    
    overall_entropy = (prob_above*entropy_data_above)+(prob_below*entropy_data_below)
    
    return overall_entropy


In [188]:
def determine_best_split(data,potential_splits):
    
    overall_entropy =9999
    
    for split_col in potential_splits:
    
        for split_val in (potential_splits[split_col]):
            
            data_above , data_below = split_data(data , split_col,split_val)
            
            calculated_overall_entropy = calculate_overall_entropy(data_above,data_below)
            
            if calculated_overall_entropy <= overall_entropy:
                
                overall_entropy = calculated_overall_entropy
                
                best_split_col = split_col
                
                best_split_val = split_val
                
    return best_split_col,best_split_val

## Decision Tree Algorithm

we will use and apply all the helper function in sequence to get a tree <br>
Here the tree will be in the format of dictionary whose values are in list and the 0 index of list is specifies the step if the ans to the question is true where as the 1 index of the list specifies the false step.
eg<br>


In [189]:
tree_representation = {"ques1":[{"ques2":["true_ans2","false_ans2"]},"false_ans1"]}
print(tree_representation)

{'ques1': [{'ques2': ['true_ans2', 'false_ans2']}, 'false_ans1']}


In [190]:
df.values

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [191]:
def Decision_Tree_Algorithm(df,counter=0,maximum_depth=12):
    
    #converting data into array as we have defined all the functions by assuming data as an array
    if counter==0:
        global labels
        global feature_type
        feature_type = determine_feature_type(df)
        labels = df.columns
        data = df.values  # first time a dataframe is passed so we need to convert it into array
       
    else:                 # after 1 st iteration it means recursive process and the output of other functions are being passed which is already is in array format
        data = df
        
        
    # Checking node purity, if the node is pure it means we do not need to split it we just need to perform classification
    # Node is pure it means the node already contains one class label so no need to split just classify to get label
    
    if check_purity(data) or (counter == maximum_depth): #or (len(data)<minimum_samples) :       # ways to reduce overfitting
        classified_label = Classify(data)
        print(classified_label)
        return classified_label
    else:                      #if means the node is not pure and we need to split the node furthur
        
        counter+=1
        
        #helper functions
        potential_splits = get_potential_splits(data)
        best_split_col , best_split_val = determine_best_split(data,potential_splits)
        data_above,data_below = split_data(data,best_split_col,best_split_val)
        
        
        #generatig tree
        best_split_feature = labels[best_split_col]
        
        type_of_feature = feature_type[best_split_feature]
        if type_of_feature == "continuous":
            question = str(best_split_feature)+" <= "+str(best_split_val)
            
        else:
            question = str(best_split_feature)+" = "+str(best_split_val)
            
        sub_tree = {question:[]}
        
        #getting answers using recursion
        yes_ans = Decision_Tree_Algorithm(data_below,counter,maximum_depth)
        no_ans = Decision_Tree_Algorithm(data_above,counter,maximum_depth)
        
        if yes_ans==no_ans:
            sub_tree[question].append(yes_ans)
            
        else:
            sub_tree[question].append(yes_ans)
            sub_tree[question].append(no_ans)
            
        return sub_tree

In [192]:
tree = Decision_Tree_Algorithm(train_df,maximum_depth=12)

0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
1.0
0.0
0.0
1.0
1.0
0.0
1.0
1.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
0.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
0.0


In [193]:
tree

{'Glucose <= 127.5': [{'BMI <= 26.45': [{'BloodPressure <= 94.0': [{'DiabetesPedigreeFunction <= 0.6755': [0.0,
        {'DiabetesPedigreeFunction <= 0.7055': [1.0, 0.0]}]},
      {'Age <= 44.0': [0.0, 1.0]}]},
    {'Age <= 25.5': [{'Glucose <= 94.5': [0.0,
        {'BMI <= 31.3': [0.0,
          {'Insulin <= 76.0': [{'SkinThickness <= 38.0': [{'Age <= 24.5': [{'SkinThickness <= 33.5': [{'SkinThickness <= 25.5': [{'BMI <= 44.099999999999994': [{'BMI <= 32.0': [0.0,
                        1.0]},
                      0.0]},
                    0.0]},
                  1.0]},
                1.0]},
              0.0]},
            {'DiabetesPedigreeFunction <= 0.1345': [1.0,
              {'Pregnancies <= 4.5': [0.0, 1.0]}]}]}]}]},
      {'DiabetesPedigreeFunction <= 0.625': [{'Glucose <= 99.5': [{'Age <= 26.5': [{'SkinThickness <= 43.5': [1.0,
              0.0]},
            {'Glucose <= 28.5': [1.0,
              {'Glucose <= 82.5': [0.0,
                {'DiabetesPedigreeFunction <=

In [194]:
tree.values

<function dict.values>

## Testing Data Classification

In [195]:
def classify_example(example,tree):     # example here depicts 1 single example
    
    question = list(tree.keys())[0]     # 1st question
    split_feature, comparision_operator,split_val = question.split()
    if comparision_operator == "<=":
        if example[split_feature] <= float(split_val):
            answer = tree[question][0]                  # if example feature value is less then it goes to true ans in list

        else:
            answer= tree[question][1]                   # else 1 index is for false answer
        
        
    else:
        if example[split_feature] == split_val:
            answer = tree[question][0]                  # if example feature value is less then it goes to true ans in list
        
        else:
            answer= tree[question][1]   
        
    # base case
    if not isinstance(answer, dict):
        return answer
    
    # recursive part
    else:
        residual_tree = answer
        return classify_example(example, residual_tree)

In [196]:

def calculate_accuracy(df, tree):

    df["classification"] = df.apply(classify_example, axis=1, args=(tree,))
    df["classification_correct"] = df["classification"] == df["Outcome"]
    
    accuracy = df["classification_correct"].mean()
    
    return accuracy

In [197]:
accuracy = calculate_accuracy(test_df, tree)
accuracy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


0.7532467532467533

In [198]:
accuracy = calculate_accuracy(train_df, tree)
accuracy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


0.9927641099855282