In [1]:
from __future__ import print_function
import random 
import sys

# Lift Python's recursion cap to one million frames (presumably for the
# recursive tree-building and tree-walking functions defined in later cells).
# NOTE(review): the CPython docs warn that a limit this high can overflow the
# C stack and crash the kernel instead of raising RecursionError -- confirm
# that such a large value is really needed.
sys.setrecursionlimit(1000000)

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('../GALEX_data-extended-feats.csv')

In [4]:
# Build the column order used by the rest of the notebook: the first column of
# `df` is taken as the label, the next eight columns are kept as features, and
# the label name is appended last so that it ends up as the final column.
columns = []
count = 0
for col in df.columns:
    if(count == 0):
        label = col  # remember the first column's name; it is appended after the loop
        count += 1
    elif(count<9):
        columns.append(col)  # columns 2..9 become the feature list
        count += 1
    # any column past the ninth is skipped
columns.append(label)

In [5]:
columns

['ra', 'dec', 'u', 'g', 'r', 'i', 'z', 'nuv_mag', 'class']

In [6]:
df = df[columns]

In [7]:
df = df.rename(columns={"class":"label"})

In [8]:
# Overwrite the last entry of the global `columns` list with 'label' so it
# matches the renamed DataFrame column; the tree builders index into this list
# when they word their split questions.
columns[-1] = 'label'

In [9]:
df

Unnamed: 0,ra,dec,u,g,r,i,z,nuv_mag,label
0,184.070888,20.253472,22.557287,21.054333,20.820534,20.792364,20.572094,24.287556,0
1,184.368007,20.716907,23.667465,22.849096,21.240696,19.869997,19.308306,24.398521,1
2,184.285775,20.736396,23.121962,20.563263,18.869553,18.267902,17.888218,23.293421,1
3,183.545455,20.992294,22.300468,20.999990,19.390633,18.621809,18.136063,24.048527,1
4,183.702883,21.154447,22.177027,21.408657,21.410803,21.424093,21.027111,24.103436,2
5,183.469467,21.139612,22.841330,21.124943,19.417271,18.719162,18.323605,23.136250,1
6,183.489113,21.028830,21.995867,20.823610,20.994688,20.947767,20.737692,23.360886,2
7,183.837879,21.013788,20.911606,20.432686,20.289907,20.177324,19.817312,23.002201,2
8,183.287500,20.557827,22.567287,20.677015,18.971281,18.349138,17.995657,23.603683,1
9,184.180088,37.692273,26.119047,22.022085,20.746704,19.892279,19.146225,23.813169,1


In [10]:
def train_test_split(df,test_size):
    """Randomly partition ``df`` into a training frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split. Rows are selected by index label.
    test_size : int or float
        Number of rows to put in the test frame. A float is read as a
        fraction of ``len(df)`` and rounded to the nearest whole row count.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        ``test_df`` holds ``test_size`` rows drawn without replacement via the
        ``random`` module; ``train_df`` holds the remaining rows. When
        ``test_size`` exceeds the number of rows, every row goes to
        ``test_df`` and ``train_df`` is an empty frame with the same columns.
    """
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    if test_size > len(df):
        # Not enough rows to sample from: hand everything to the test side.
        # Return an empty frame (not a bare 0) so callers can keep treating
        # the first element as a DataFrame, e.g. call len() on it.
        return df.iloc[0:0], df

    indices = df.index.tolist()
    test_indices = random.sample(population=indices, k=test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)

    return train_df, test_df

In [42]:
def check_purity(data):
    """Tell whether every row of ``data`` carries the same label.

    ``data`` is a 2-D array whose last column holds the labels. Returns True
    when exactly one distinct label is present, False otherwise (including
    when there are no rows at all).
    """
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

In [43]:
def classify_data(data):
    """Return the most frequent label found in the last column of ``data``.

    Ties go to the smallest label: ``np.unique`` returns the labels sorted and
    ``argmax`` picks the first maximum.
    """
    labels, frequencies = np.unique(data[:, -1], return_counts=True)
    return labels[frequencies.argmax()]

In [44]:
def get_potential_splits(data):
    """Collect candidate thresholds for every feature column of ``data``.

    For each column except the last (the label), the candidates are the
    midpoints between consecutive distinct values of that column.

    Returns a dict mapping column index -> list of midpoints; a column with
    fewer than two distinct values maps to an empty list.
    """
    _, n_columns = data.shape
    candidates = {}

    for feature in range(n_columns - 1):  # the last column is the label
        distinct = np.unique(data[:, feature])  # sorted ascending
        # Pair every value with its predecessor and average the two.
        midpoints = (distinct[1:] + distinct[:-1]) / 2
        candidates[feature] = list(midpoints)

    return candidates

In [45]:
def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` on one feature column.

    Returns ``(data_below, data_above)``: the rows whose value in
    ``split_column`` is <= ``split_value``, and the rows whose value is
    strictly greater. Each side is tested separately, so a row satisfying
    neither comparison appears in neither output.
    """
    feature = data[:, split_column]
    at_or_below = feature <= split_value
    strictly_above = feature > split_value
    return data[at_or_below], data[strictly_above]

In [46]:
def calculate_entropy(data):
    """Shannon entropy, in bits, of the labels in the last column of ``data``."""
    _, class_counts = np.unique(data[:, -1], return_counts=True)
    class_share = class_counts / class_counts.sum()

    # Each class contributes p * -log2(p); the terms are added with the
    # built-in sum, one after another.
    surprisal = -np.log2(class_share)
    return sum(class_share * surprisal)

In [47]:
def calculate_overall_entropy(data_below, data_above):
    """Size-weighted average of the label entropies of the two partitions.

    Each side's entropy (from ``calculate_entropy``) is weighted by the share
    of rows that fell on that side. Raises ZeroDivisionError when both
    partitions are empty.
    """
    n_below, n_above = len(data_below), len(data_above)
    n_total = n_below + n_above

    share_below = n_below / n_total
    share_above = n_above / n_total

    weighted_below = share_below * calculate_entropy(data_below)
    weighted_above = share_above * calculate_entropy(data_above)
    return weighted_below + weighted_above

In [48]:
def determine_best_split(data, potential_splits):
    """Pick the (column, threshold) pair with the lowest weighted entropy.

    ``potential_splits`` maps column index -> list of candidate thresholds.
    Every candidate is tried with ``split_data`` and scored with
    ``calculate_overall_entropy``. Because the comparison is ``<=``, a later
    candidate replaces an earlier one with the same score.

    Returns ``(best_split_column, best_split_value)``. When no candidate is
    accepted (e.g. there are none), the sentinels ``(1000, -1)`` are returned.
    """
    lowest_entropy = 9999   # starting bound that the first scored candidate is compared against
    best = (1000, -1)       # sentinel column / value pair

    for column_index, thresholds in potential_splits.items():
        for threshold in thresholds:
            below, above = split_data(data, split_column=column_index, split_value=threshold)
            candidate_entropy = calculate_overall_entropy(below, above)

            # TODO from the original author: compare "<" against "<=" here
            # empirically and keep whichever gives the better tree.
            if candidate_entropy <= lowest_entropy:
                lowest_entropy = candidate_entropy
                best = (column_index, threshold)

    return best

In [49]:
def decision_tree_algorithm(df, counter=0, feature_names=None):
    """Recursively grow a decision tree with no depth limit.

    Parameters
    ----------
    df : pandas.DataFrame or numpy.ndarray
        Rows to fit, with the label in the last column. The top-level call
        normally passes a DataFrame; recursive calls pass the 2-D array.
    counter : int, default 0
        Recursion depth; incremented once per split level and passed down.
    feature_names : sequence of str, optional
        Names used to word each question. Defaults to the columns of ``df``
        when it is a DataFrame, otherwise to the notebook-level ``columns``
        list.

    Returns
    -------
    dict, label or None
        ``{"<feature> <= <threshold>": [yes_branch, no_branch]}`` for an
        internal node; a bare label when the rows reduce to one answer (pure
        data, no usable split, or both branches agreeing); ``None`` when
        there are no rows.
    """
    # data preparation: accept a DataFrame at any depth, not only when counter == 0
    if isinstance(df, pd.DataFrame):
        if feature_names is None:
            feature_names = list(df.columns)
        data = df.values
    else:
        data = df
    if feature_names is None:
        feature_names = columns  # notebook-level list of column names

    # nothing to classify
    if len(data) == 0:
        return None

    # base case
    if check_purity(data):
        return classify_data(data)

    # recursive part
    counter += 1
    potential_splits = get_potential_splits(data)

    # Impure rows whose features are all identical cannot be separated:
    # answer with the majority label instead of returning None.
    if not any(potential_splits.values()):
        return classify_data(data)

    split_column, split_value = determine_best_split(data, potential_splits)
    if split_column not in potential_splits:
        # No candidate was accepted; fall back to the majority label.
        return classify_data(data)

    data_below, data_above = split_data(data, split_column, split_value)

    # A midpoint that rounds onto one of its neighbours can leave one side
    # empty; recursing on the unchanged other side would never terminate.
    if len(data_below) == 0 or len(data_above) == 0:
        return classify_data(data)

    question = "{} <= {}".format(feature_names[split_column], split_value)

    # find answers (recursion)
    yes_answer = decision_tree_algorithm(data_below, counter, feature_names)
    no_answer = decision_tree_algorithm(data_above, counter, feature_names)

    # Both branches giving the same answer makes the question redundant.
    if yes_answer == no_answer:
        return yes_answer
    return {question: [yes_answer, no_answer]}
    
    
 

In [50]:
def classify_example(example,tree):
    """Walk ``tree`` for one example and return the label it ends on.

    Parameters
    ----------
    example : mapping-like (e.g. a pandas Series or dict)
        Looked up by feature name.
    tree : dict or leaf
        Either ``{"<feature> <= <threshold>": [yes_branch, no_branch]}`` or a
        bare leaf value. A bare leaf is returned unchanged, which covers
        trees that reduced to a single answer.

    Returns
    -------
    The leaf value reached by following the yes-branch whenever
    ``example[feature] <= threshold`` and the no-branch otherwise.
    """
    node = tree
    # Iterative descent: no dependence on the interpreter's recursion limit.
    while isinstance(node, dict):
        question = list(node.keys())[0]
        # Split from the right so feature names containing spaces stay intact.
        feature_name, _comparison, threshold = question.rsplit(" ", 2)
        branches = node[question]
        if example[feature_name] <= float(threshold):
            node = branches[0]
        else:
            node = branches[1]
    return node
   

In [54]:
def accuracy(df, tree):
    """Score a single tree on ``df`` and return the fraction classified correctly.

    Side effect: ``df`` gains (or has overwritten) two columns,
    ``classification`` with the tree's answer for each row and
    ``classification_correct`` with whether that answer equals ``label``.
    """
    predictions = df.apply(classify_example, axis=1, args=(tree,))
    df["classification"] = predictions

    matches = df["classification"] == df["label"]
    df["classification_correct"] = matches

    return df["classification_correct"].mean()

In [55]:
def recall(df,tree):
    """Per-class recall of a single tree on ``df`` for the classes 0, 1 and 2.

    Recall for a class is the number of rows labelled with that class and
    also classified as it, divided by the number of rows labelled with it.

    Side effect: overwrites ``df["classification"]`` with the tree's answers.

    Returns
    -------
    (recall_0, recall_1, recall_2) : tuple of float
        An entry is NaN (instead of raising ZeroDivisionError) when ``df``
        has no row labelled with that class.
    """
    df["classification"] = df.apply(classify_example, axis=1, args=(tree,))

    scores = []
    for class_label in (0, 1, 2):
        is_actual = df["label"] == class_label
        is_predicted = df["classification"] == class_label

        true_positives = int((is_actual & is_predicted).sum())
        actual_total = int(is_actual.sum())

        scores.append(true_positives / actual_total if actual_total else float("nan"))

    return tuple(scores)

In [66]:
def precision(df,tree):
    """Per-class precision of a single tree on ``df`` for the classes 0, 1 and 2.

    Precision for a class is the number of rows labelled with that class and
    also classified as it, divided by the number of rows classified as it.

    Side effect: overwrites ``df["classification"]`` with the tree's answers.

    Returns
    -------
    (precision_0, precision_1, precision_2) : tuple of float
        An entry is NaN (instead of raising ZeroDivisionError) when the tree
        never predicts that class on ``df``.
    """
    df["classification"] = df.apply(classify_example, axis=1, args=(tree,))

    scores = []
    for class_label in (0, 1, 2):
        is_actual = df["label"] == class_label
        is_predicted = df["classification"] == class_label

        true_positives = int((is_actual & is_predicted).sum())
        predicted_total = int(is_predicted.sum())

        scores.append(true_positives / predicted_total if predicted_total else float("nan"))

    return tuple(scores)

In [79]:

# Hold out 20% of the rows, fit one tree with no depth limit on the rest, and
# score it on the held-out rows.
# NOTE(review): no random seed is set in the cells shown, so the split (and the
# accuracy displayed for this cell) will differ from run to run.
train_df,test_df = train_test_split(df, 0.2)
tree = decision_tree_algorithm(train_df)
accuracy(test_df,tree)

0.8661511005353956

In [83]:
precision(test_df,tree)

(0.5449438202247191, 0.9510240427426536, 0.7657894736842106)

In [84]:
recall(test_df,tree)

(0.5843373493975904, 0.9527207850133809, 0.7385786802030457)

In [62]:
def calculate_accuracy_random_forest(df, trees):
    """Score a list of trees on ``df`` and return the fraction classified correctly.

    Each row is classified by ``classify_example_random_forest``.

    Side effect: ``df`` gains (or has overwritten) the columns
    ``classification`` and ``classification_correct``.
    """
    forest_answers = df.apply(classify_example_random_forest, axis=1, args=(trees,))
    df["classification"] = forest_answers

    matches = df["classification"] == df["label"]
    df["classification_correct"] = matches

    return df["classification_correct"].mean()

In [72]:
def recall_RF(df,trees):
    """Per-class recall of a list of trees on ``df`` for the classes 0, 1 and 2.

    Rows are classified with ``classify_example_random_forest``. Recall for a
    class is the number of rows labelled with that class and also classified
    as it, divided by the number of rows labelled with it.

    Side effect: overwrites ``df["classification"]`` with the forest's answers.

    Returns
    -------
    (recall_0, recall_1, recall_2) : tuple of float
        An entry is NaN (instead of raising ZeroDivisionError) when ``df``
        has no row labelled with that class.
    """
    df["classification"] = df.apply(classify_example_random_forest, axis=1, args=(trees,))

    scores = []
    for class_label in (0, 1, 2):
        is_actual = df["label"] == class_label
        is_predicted = df["classification"] == class_label

        true_positives = int((is_actual & is_predicted).sum())
        actual_total = int(is_actual.sum())

        scores.append(true_positives / actual_total if actual_total else float("nan"))

    return tuple(scores)

In [73]:
def precision_RF(df,trees):
    """Per-class precision of a list of trees on ``df`` for the classes 0, 1 and 2.

    Rows are classified with ``classify_example_random_forest``. Precision
    for a class is the number of rows labelled with that class and also
    classified as it, divided by the number of rows classified as it.

    Side effect: overwrites ``df["classification"]`` with the forest's answers.

    Returns
    -------
    (precision_0, precision_1, precision_2) : tuple of float
        An entry is NaN (instead of raising ZeroDivisionError) when the
        forest never predicts that class on ``df``.
    """
    df["classification"] = df.apply(classify_example_random_forest, axis=1, args=(trees,))

    scores = []
    for class_label in (0, 1, 2):
        is_actual = df["label"] == class_label
        is_predicted = df["classification"] == class_label

        true_positives = int((is_actual & is_predicted).sum())
        predicted_total = int(is_predicted.sum())

        scores.append(true_positives / predicted_total if predicted_total else float("nan"))

    return tuple(scores)

In [69]:
def classify_example_random_forest(example,trees):
    """Classify one example by majority vote over ``trees``.

    Each tree votes through ``classify_example``; only votes equal to 0, 1 or
    2 are counted, anything else is ignored. Returns the class index (0, 1 or
    2) with the most votes; ties go to the lowest index, so 0 is returned
    when nothing was counted.
    """
    votes = [0, 0, 0]
    for tree in trees:
        prediction = classify_example(example, tree)
        for class_label in range(3):
            if prediction == class_label:
                votes[class_label] += 1
                break

    return votes.index(max(votes))

In [68]:
def random_forest(df,test_size,num_trees):
    """Fit ``num_trees`` trees, each on its own slice of the training rows.

    ``df`` is first split into training and test rows with
    ``train_test_split(df, test_size)``. The training rows are then carved up
    by repeatedly drawing ``round(len(training rows) / num_trees)`` rows out
    of whatever is still left, once per tree, and one tree is fitted per draw
    with ``decision_tree_algorithm``.

    Returns ``(test_df, trees)``. Before returning, the trees are scored on
    ``test_df`` through ``calculate_accuracy_random_forest``; the score
    itself is discarded, but that call writes the ``classification`` and
    ``classification_correct`` columns into ``test_df``.
    """
    remaining_rows, test_df = train_test_split(df, test_size)
    rows_per_tree = round(len(remaining_rows) / num_trees)

    # Draw every training slice first, then fit: same order as drawing and
    # fitting in two separate passes.
    slices = []
    for _ in range(num_trees):
        remaining_rows, tree_rows = train_test_split(remaining_rows, rows_per_tree)
        slices.append(tree_rows)

    trees = [decision_tree_algorithm(tree_rows) for tree_rows in slices]

    # Kept for its effect on test_df (see docstring); the value is not used.
    calculate_accuracy_random_forest(test_df, trees)

    return test_df, trees

In [95]:
test_df, trees = random_forest(df,0.2,16)

In [96]:
calculate_accuracy_random_forest(test_df,trees)*100

88.93515764425936

In [97]:
precision_RF(test_df,trees)

(0.671875, 0.9480069324090121, 0.7894736842105263)

In [98]:
recall_RF(test_df,trees)

(0.5149700598802395, 0.9794091316025068, 0.7934508816120907)

In [88]:
trees

[{'z <= 19.665997505': [{'z <= 18.115825655000002': [{'u <= 18.179059979999998': [{'nuv_mag <= 19.4827919': [1.0,
         0.0]},
       1.0]},
     {'g <= 20.721447945': [{'i <= 19.135770800000003': [{'u <= 19.600082399999998': [{'nuv_mag <= 18.356031415': [0.0,
             2.0]},
           {'ra <= 182.89887385': [2.0,
             {'z <= 18.551968575': [{'nuv_mag <= 21.206502915': [1.0, 2.0]},
               1.0]}]}]},
         {'u <= 21.15413284': [2.0, {'z <= 19.39643478': [2.0, 0.0]}]}]},
       {'nuv_mag <= 21.35088062': [2.0,
         {'z <= 18.9136343': [{'g <= 21.123537065': [{'g <= 21.102749825000004': [1.0,
               0.0]},
             1.0]},
           {'z <= 19.15937233': [{'z <= 19.14511204': [{'ra <= 194.5637349': [1.0,
                 {'ra <= 197.76695654999997': [2.0,
                   {'r <= 20.20472431': [{'u <= 22.47370052': [1.0, 2.0]},
                     1.0]}]}]},
               2.0]},
             {'nuv_mag <= 21.738060949999998': [2.0,
             

In [89]:
len(trees)

16

In [86]:
def decision_tree_algorithm_depth(df, counter=0 ,max_depth = -1, feature_names=None):
    """Recursively grow a decision tree, optionally capped at ``max_depth``.

    Parameters
    ----------
    df : pandas.DataFrame or numpy.ndarray
        Rows to fit, with the label in the last column. The top-level call
        normally passes a DataFrame; recursive calls pass the 2-D array.
    counter : int, default 0
        Current depth; incremented once per split level and passed down.
    max_depth : int, default -1
        Splitting stops (majority label is returned) once ``counter`` equals
        this value. ``counter`` starts at 0 and only grows, so the default of
        -1 never matches and leaves the depth unlimited.
    feature_names : sequence of str, optional
        Names used to word each question. Defaults to the columns of ``df``
        when it is a DataFrame, otherwise to the notebook-level ``columns``
        list.

    Returns
    -------
    dict, label or None
        ``{"<feature> <= <threshold>": [yes_branch, no_branch]}`` for an
        internal node; a bare label when the rows reduce to one answer (pure
        data, depth cap reached, no usable split, or both branches agreeing);
        ``None`` when there are no rows.
    """
    # data preparation: accept a DataFrame at any depth, not only when counter == 0
    if isinstance(df, pd.DataFrame):
        if feature_names is None:
            feature_names = list(df.columns)
        data = df.values
    else:
        data = df
    if feature_names is None:
        feature_names = columns  # notebook-level list of column names

    # nothing to classify
    if len(data) == 0:
        return None

    # base case: pure node or depth cap reached
    if check_purity(data) or (counter == max_depth):
        return classify_data(data)

    # recursive part
    counter += 1
    potential_splits = get_potential_splits(data)

    # Impure rows whose features are all identical cannot be separated:
    # answer with the majority label instead of returning None.
    if not any(potential_splits.values()):
        return classify_data(data)

    split_column, split_value = determine_best_split(data, potential_splits)
    if split_column not in potential_splits:
        # No candidate was accepted; fall back to the majority label.
        return classify_data(data)

    data_below, data_above = split_data(data, split_column, split_value)

    # A midpoint that rounds onto one of its neighbours can leave one side
    # empty; recursing on the unchanged other side would never terminate
    # when no depth cap is set.
    if len(data_below) == 0 or len(data_above) == 0:
        return classify_data(data)

    question = "{} <= {}".format(feature_names[split_column], split_value)

    # find answers (recursion)
    yes_answer = decision_tree_algorithm_depth(data_below, counter, max_depth, feature_names)
    no_answer = decision_tree_algorithm_depth(data_above, counter, max_depth, feature_names)

    # Both branches giving the same answer makes the question redundant.
    if yes_answer == no_answer:
        return yes_answer
    return {question: [yes_answer, no_answer]}
    
    
 

In [87]:
train_df1,test_df1 = train_test_split(df, 0.2)
tree1 = decision_tree_algorithm_depth(train_df1,0,6)
accuracy(test_df1,tree1)
    

0.8471148126115408

In [90]:
precision(test_df1,tree1)

(0.5, 0.937888198757764, 0.7216748768472906)

In [91]:
recall(test_df1,tree1)

(0.4134078212290503, 0.9644160583941606, 0.7216748768472906)

In [99]:
tree2 = decision_tree_algorithm_depth(train_df,0,6)

In [103]:
accuracy(test_df,tree2)*100

87.09101725163593

In [101]:
precision(test_df,tree2)

(0.5454545454545454, 0.9425878320479862, 0.77088948787062)

In [102]:
recall(test_df,tree2)

(0.46706586826347307, 0.9847806624888094, 0.7204030226700252)

In [104]:

# NOTE(review): going by the execution counts, `train_df` still comes from the
# split made in In [79], while `test_df` was rebound in In [95] by
# random_forest(), which draws its own random split of `df`. The rows scored
# here can therefore overlap the rows the tree was fitted on, so the accuracy
# shown for this cell is presumably inflated by train/test leakage -- confirm
# by re-splitting before fitting.
tree = decision_tree_algorithm(train_df)
accuracy(test_df,tree)

0.9714455681142177

In [107]:
test_df

Unnamed: 0,ra,dec,u,g,r,i,z,nuv_mag,label,classification,classification_correct
2394,192.792880,31.149498,22.155714,20.918449,19.030468,18.392145,17.909557,22.292679,1,1.0,True
3649,180.448011,22.759893,20.723701,20.063883,20.034578,20.047285,19.781160,22.929361,2,0.0,False
5096,196.127906,23.808310,21.618732,21.117651,21.103891,20.924841,20.355553,21.451908,2,2.0,True
6609,195.132270,21.201915,19.225283,17.948242,17.336678,16.924942,16.697229,20.394958,1,1.0,True
7372,179.456667,24.084580,24.070499,22.178625,20.746027,19.725760,19.433147,23.048000,1,1.0,True
6804,197.837939,32.369771,19.264006,18.062311,17.441936,17.085070,16.883898,19.970732,1,1.0,True
2123,177.326842,31.451225,20.531731,19.473818,19.406977,19.457613,19.562534,21.377195,0,0.0,True
7439,202.548614,19.646154,22.076038,21.451361,21.426132,21.026794,20.525230,21.272669,2,2.0,True
2964,197.954959,21.588874,19.018629,18.053761,17.737740,17.570768,17.463654,20.260857,1,1.0,True
6399,203.814522,21.302630,21.866787,21.249002,21.195812,21.075668,20.818718,23.390537,0,0.0,True
