In [3]:
from __future__ import print_function
import random 
import sys

sys.setrecursionlimit(1000000)

In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('GALEX_data-extended-feats.csv')

In [6]:
columns = []
count = 0
for col in df.columns:
    if(count == 0):
        label = col
        count += 1
    elif(count<9):
        columns.append(col)
        count += 1
columns.append(label)

In [7]:
columns

['ra', 'dec', 'u', 'g', 'r', 'i', 'z', 'nuv_mag', 'class']

In [8]:
df = df[columns]

In [9]:
df = df.rename(columns={"class":"label"})

In [10]:
columns[-1] = 'label'

In [11]:
df

Unnamed: 0,ra,dec,u,g,r,i,z,nuv_mag,label
0,184.070888,20.253472,22.557287,21.054333,20.820534,20.792364,20.572094,24.287556,0
1,184.368007,20.716907,23.667465,22.849096,21.240696,19.869997,19.308306,24.398521,1
2,184.285775,20.736396,23.121962,20.563263,18.869553,18.267902,17.888218,23.293421,1
3,183.545455,20.992294,22.300468,20.999990,19.390633,18.621809,18.136063,24.048527,1
4,183.702883,21.154447,22.177027,21.408657,21.410803,21.424093,21.027111,24.103436,2
5,183.469467,21.139612,22.841330,21.124943,19.417271,18.719162,18.323605,23.136250,1
6,183.489113,21.028830,21.995867,20.823610,20.994688,20.947767,20.737692,23.360886,2
7,183.837879,21.013788,20.911606,20.432686,20.289907,20.177324,19.817312,23.002201,2
8,183.287500,20.557827,22.567287,20.677015,18.971281,18.349138,17.995657,23.603683,1
9,184.180088,37.692273,26.119047,22.022085,20.746704,19.892279,19.146225,23.813169,1


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8406 entries, 0 to 8405
Data columns (total 9 columns):
ra         8406 non-null float64
dec        8406 non-null float64
u          8406 non-null float64
g          8406 non-null float64
r          8406 non-null float64
i          8406 non-null float64
z          8406 non-null float64
nuv_mag    8406 non-null float64
label      8406 non-null int64
dtypes: float64(8), int64(1)
memory usage: 591.1 KB


In [13]:
def train_test_split(df,test_size):
    if isinstance(test_size,float):
        test_size = round(test_size * len(df))
        
    if(test_size > len(df)):
        return 0, df

    indices = df.index.tolist()
    test_indices = random.sample(population=indices,k = test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    
    return train_df, test_df

In [14]:

train_df, test_df = train_test_split(df,0.2)

In [41]:
data = train_df.values

In [42]:
data

array([[184.0708878 ,  20.25347237,  22.55728722, ...,  20.57209396,
         24.28755569,   0.        ],
       [184.3680069 ,  20.71690691,  23.66746521, ...,  19.30830574,
         24.39852142,   1.        ],
       [184.2857749 ,  20.73639638,  23.12196159, ...,  17.88821793,
         23.29342079,   1.        ],
       ...,
       [177.1139859 ,  31.31678132,  20.51153183, ...,  19.79179573,
         22.30870438,   2.        ],
       [177.2425272 ,  31.09678109,  21.5286274 , ...,  19.92852974,
         22.0302887 ,   2.        ],
       [177.9771951 ,  20.99629456,  21.81986046, ...,  20.88772202,
         23.21204376,   0.        ]])

In [43]:
def check_purity(data):
    label_column = data[:,-1]
    unique_classes = np.unique(label_column)

    if(len(unique_classes) == 1):
        return True
    else:
        return False

In [44]:
check_purity(train_df.values)

False

In [45]:
def classify_data(data):
    label_column = data[: , -1]
    unique_classes, counts_unique_classes = np.unique(label_column, return_counts= True)

    index = counts_unique_classes.argmax()
    classification  = unique_classes[index]
    return classification

In [46]:
def get_potential_splits(data):
    potential_splits = {} #Key is indices of the columns
    _ , n_columns = data.shape

    for column_index in range(n_columns-1): #col - 1 because we need to exclude the last col
        potential_splits[column_index] = []
        values = data[:, column_index]
        unique_values = np.unique(values)

        for index in range(len(unique_values)):
            if(index != 0):
                current_value = unique_values[index]
                previous_value = unique_values[index-1]
                potential_split = (current_value + previous_value)/2

                potential_splits[column_index].append(potential_split)
    
    return potential_splits

In [47]:
potential_splits = get_potential_splits(train_df.values)
    


In [48]:
def split_data(data, split_column, split_value):
    split_column_values = data[:,split_column]

    data_below = data[split_column_values <= split_value]
    data_above = data[split_column_values > split_value]
    
    return data_below, data_above

In [49]:
split_column = 3
split_value = 18.1



In [50]:
data_below, data_above = split_data(data, split_column, split_value)

In [51]:
len(data_below)

811

In [52]:
def calculate_entropy(data):
    label_column = data[:, -1]
    _, counts = np.unique(label_column,return_counts =True)

    probabilities = counts / counts.sum()

    entropy = sum(probabilities * -np.log2(probabilities))

    return entropy

In [53]:
def calculate_overall_entropy(data_below, data_above):
    n_data_points = len(data_below) + len(data_above)

    p_data_below = len(data_below) / n_data_points
    p_data_above = len(data_above) / n_data_points

    overall_entropy = (p_data_below)*calculate_entropy(data_below) + (p_data_above)*calculate_entropy(data_above)

    return overall_entropy

In [54]:
calculate_overall_entropy(data_below,data_above)

1.1732438189205814

In [55]:
calculate_entropy(data_above)

1.288878899189575

In [56]:
def determine_best_split(data, potential_splits):
    overall_entropy = 9999
    best_split_column = 1000
    best_split_value = -1
    for column_index in potential_splits:

        for value in potential_splits[column_index]:
            data_below, data_above = split_data(data,split_column=column_index,split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below,data_above)

            if current_overall_entropy <= overall_entropy: #PLOT GRAPH AND CHECK FOR BOTH < AND <= CONDITIONS AND CHOOSE THE BETTER ONE
                overall_entropy = current_overall_entropy
                best_split_column = column_index
                best_split_value = value
                
    return best_split_column, best_split_value

In [57]:
potential_splits = get_potential_splits(data)

In [58]:
#determine_best_split(data, potential_splits)

In [59]:
df.values

array([[184.0708878 ,  20.25347237,  22.55728722, ...,  20.57209396,
         24.28755569,   0.        ],
       [184.3680069 ,  20.71690691,  23.66746521, ...,  19.30830574,
         24.39852142,   1.        ],
       [184.2857749 ,  20.73639638,  23.12196159, ...,  17.88821793,
         23.29342079,   1.        ],
       ...,
       [177.2425272 ,  31.09678109,  21.5286274 , ...,  19.92852974,
         22.0302887 ,   2.        ],
       [177.9771951 ,  20.99629456,  21.81986046, ...,  20.88772202,
         23.21204376,   0.        ],
       [182.558651  ,  18.9860032 ,  18.3395443 , ...,  17.17955399,
         20.08953094,   0.        ]])

In [60]:
#sub_tree = {question : [yes_answer, no_answer]}

In [61]:
def decision_tree_algorithm(df, counter=0):
    
    #data preperations
    if(counter == 0):
        data = df.values
    else:
        data = df
        
    #base case
    if(check_purity(data)):
        classification = classify_data(data)
        return classification
    
    #recursive part
    else:
        counter += 1
        #print(counter)
        #helper functions
        
        potential_splits = get_potential_splits(data)
        split_column, split_value = determine_best_split(data,potential_splits)
        
        if(split_value == -1):
            return
        
        data_below, data_above = split_data(data, split_column, split_value)
        
        #instantiate sub-tree
        question = "{} <= {}".format(columns[split_column], split_value)
        sub_tree = {question: []}
        
        #find answers (recursion)
        
        yes_answer = decision_tree_algorithm(data_below, counter)
        no_answer = decision_tree_algorithm(data_above, counter)
        
        if yes_answer == no_answer:
            sub_tree = yes_answer
        else:
            sub_tree[question].append(yes_answer)
            sub_tree[question].append(no_answer)

        return sub_tree
    
    
 

In [62]:
def classify_example(example,tree):
        question = list(tree.keys())[0]
        feature_name, comparison, value = question.split()
        if example[feature_name] <= float(value):
            answer = tree[question][0]
        else:
            answer = tree[question][1]

        if not isinstance(answer,dict):
            return answer

        else:
            answer_tree = answer
            return classify_example(example, answer_tree)
   

In [63]:
def calculate_accuracy(df, tree):

    df["classification"] = df.apply(classify_example, axis=1, args=(tree,))
    df["classification_correct"] = df["classification"] == df["label"]
    
    accuracy = df["classification_correct"].mean()
    
    return accuracy

In [57]:

train_df2, test_df2 = train_test_split(df,0.2)

In [None]:
tree2 = decision_tree_algorithm(train_df2)

In [None]:
calculate_accuracy(test_df2,tree2)

In [None]:

train_df3,test_df3 = train_test_split(df, 0.3)

In [None]:
tree3 = decision_tree_algorithm(train_df3)

In [None]:
calculate_accuracy(test_df3,tree3)

In [None]:
train_df3

In [190]:

train_df4,test_df4 = train_test_split(df, 0.1)
tree4 = decision_tree_algorithm(train_df4)
calculate_accuracy(test_df4,tree4)

0.8466111771700356

In [None]:

train_df5,test_df5 = train_test_split(df, 0.5)
tree5 = decision_tree_algorithm(train_df5)
calculate_accuracy(test_df5,tree5)

In [None]:
tree

In [None]:
tree2

In [None]:
tree3

In [None]:
tree4

In [64]:
def calculate_accuracy_random_forest(df, trees):

    df["classification"] = df.apply(classify_example_random_forest, axis=1, args=(trees,))
    df["classification_correct"] = df["classification"] == df["label"]
    
    accuracy = df["classification_correct"].mean()
    
    return accuracy

In [65]:
def classify_example_random_forest(example,trees):
    count0 = count1 = count2 = 0
    for tree in trees:
        classification =(classify_example(example,tree))
        if(classification == 0):
            count0 += 1
        elif(classification == 1):
            count1 += 1
        elif(classification == 2):
            count2 += 1
    
    label = [count0,count1,count2]
    
    
    fin_class = label.index(max(label))
    return (fin_class)

In [66]:
def random_forest(df,test_size,num_trees):
    total_training_data, test_df = train_test_split(df,test_size)
    training_data = []
    individual_train_data = round(len(total_training_data) / num_trees)
    #print(individual_train_data)
    for i in range(num_trees):
        total_training_data, train_df = train_test_split(total_training_data, individual_train_data)
        
        training_data.append(train_df)
    
    trees = []
    for i in range(num_trees):
        tree = decision_tree_algorithm(training_data[i])
        trees.append(tree)
        
    accuracy = calculate_accuracy_random_forest(test_df, trees)

    #training_data.append(total_training_data)
    return test_df, trees

In [73]:
test_df, trees = random_forest(df,0.2,9)

In [74]:
calculate_accuracy_random_forest(test_df,trees)

0.8709101725163593

In [75]:
tree_accuracy = {}
tree_test_data = {}
for num_trees in range(1,501):
    test_df, trees = random_forest(df,0.2,num_trees)
    accuracy = calculate_accuracy_random_forest(test_df,trees)
    tree_accuracy[num_trees] = accuracy
    tree_test_data[num_trees] = test_df
    print(num_trees,' : ', accuracy)

1  :  0.8643664485425342
2  :  0.8405710886377157
3  :  0.8720999405116002
4  :  0.8720999405116002
5  :  0.8720999405116002
6  :  0.8673408685306365
7  :  0.8786436644854253
8  :  0.8792385484830458
9  :  0.883402736466389
10  :  0.8703152885187388
11  :  0.8822129684711482
12  :  0.8536585365853658
13  :  0.8625817965496728
14  :  0.8572278405710886
15  :  0.8685306365258775
16  :  0.8822129684711482
17  :  0.8661511005353956
18  :  0.8655562165377751
19  :  0.8685306365258775
20  :  0.8655562165377751
21  :  0.8602022605591909
22  :  0.8572278405710886
23  :  0.8697204045211184
24  :  0.8607971445568114
25  :  0.8732897085068412
26  :  0.8590124925639501
27  :  0.8786436644854253
28  :  0.8715050565139798
29  :  0.8607971445568114
30  :  0.8584176085663295
31  :  0.8792385484830458
32  :  0.8625817965496728
33  :  0.8661511005353956
34  :  0.8560380725758477
35  :  0.8447352766210589
36  :  0.8572278405710886
37  :  0.8536585365853658
38  :  0.8703152885187388
39  :  0.8536585365853

TypeError: object of type 'int' has no len()

In [1]:
tree_accuracy

NameError: name 'tree_accuracy' is not defined

In [90]:
example = test_df.iloc[4]
classify_example_random_forest(example,trees)

1

In [91]:
1 == 1.0

True

In [92]:
def calculate_accuracy_random_forest(df, trees):

    df["classification"] = df.apply(classify_example_random_forest, axis=1, args=(trees,))
    df["classification_correct"] = df["classification"] == df["label"]
    
    accuracy = df["classification_correct"].mean()
    
    return accuracy

In [93]:
calculate_accuracy_random_forest(test_df,trees)

0.8958953004164188