In [4]:
import pandas as pd
import numpy as np
from sklearn import model_selection

In [5]:
# Used To Format Questions
class Question:
    def __init__(self, column, value, root=pd.DataFrame, information_gain=float, truthTable=pd.DataFrame, falseTable=pd.DataFrame, truthQuestion=None, falseQuestion=None):
        self.column = column
        self.value = value

        self.depth = 0
        self.information_gain = information_gain
        
        self.root = root
        self.truthTable = truthTable
        self.falseTable = falseTable

        self.truthQuestion = truthQuestion
        self.falseQuestion = falseQuestion
    
    # Checks if x makes question True or False *used only in prediction*
    def result(self, x=pd.DataFrame):

        if (isinstance(self.value, (int, np.integer, np.floating))):
            if x[x.columns[self.column]][0] >= self.value:
                return self.truthQuestion, self.truthTable["Group"]
            else:
                return self.falseQuestion, self.falseTable["Group"]

        else:
            if x[x.columns[self.column]][0] == self.value:
                return self.truthQuestion, self.truthTable["Group"]
            else:
                return self.falseQuestion, self.falseTable["Group"]


In [6]:
class Tree:
    def __init__(self, df):
        self.root = df
        self.questionTree = []
        self.question = None

        self.max_depth = 10

        self.grow_tree(self.root)
    
    # Generates the best question to ask at every node
    def grow_tree(self, root):
        best_question = self.find_best_question(root)


        if best_question != -1:
            self.questionTree.append(best_question)

            best_question.truthQuestion = self.grow_tree(best_question.truthTable)

            best_question.falseQuestion = self.grow_tree(best_question.falseTable) 
        
            return self.questionTree.index(best_question)
        self.question = self.questionTree[0] 

    # Splits a root based on a question into True Vales, and False Values

    def partition(self, rows=pd.DataFrame, question=Question):

        if (isinstance(question.value, (int, np.integer, np.floating))) == False:
            true_values = rows[rows[rows.columns[question.column]] == question.value]    
            false_values = rows[rows[rows.columns[question.column]] != question.value]
        else:
            true_values = rows[rows[rows.columns[question.column]] >= question.value]    
            false_values = rows[rows[rows.columns[question.column]] < question.value]

        return true_values, false_values

    # Calculate Impurity
    def impurity(self, rows=pd.DataFrame):
        columnLen = len(rows["Group"])
        impurity = 1
        for i in rows["Group"].unique():
            impurity -= ((rows["Group"].value_counts()[i]) / columnLen)**2
        return impurity

    # Check how good a question is by comparing it to the previous node
    def information_gain(self, rootTable=pd.DataFrame, trueTable=pd.DataFrame, falseTable=pd.DataFrame, question=Question):
        rootImpurity = self.impurity(rootTable)

        trueImpurity = self.impurity(trueTable)
        falseImpurity = self.impurity(falseTable)

        AvgImpurity = (trueImpurity * (trueTable.shape[0] / rootTable.shape[0])) + (falseImpurity * (falseTable.shape[0] / rootTable.shape[0])) 
        AvgImpurity /= 2

        return rootImpurity - AvgImpurity

    # Combines Last 3 to find the best question at a single node
    def find_best_question(self, root=pd.DataFrame):
        best_question = -1
        best_information_gain = -1
        pdroot = root
        for column in range(0, len(root.drop(columns=["Group"]).columns)):
            for value in root[root.columns[column]].unique():
                question = Question(column, value, root=pdroot)

                question.truthTable, question.falseTable = self.partition(root, question)

                if question.truthTable.shape[0] == 0 or question.falseTable.shape[0] == 0:
                    continue

                question.information_gain = self.information_gain(root, question.truthTable, question.falseTable, question)
                
                if best_information_gain == None or question.information_gain > best_information_gain:
                    # print("Is ", question.column, " equal too", question.value)
                    best_information_gain = question.information_gain
                    best_question = question

        if len(self.questionTree) != 0:
            if type(best_question) == Question:
                    for i in self.questionTree:
                        if i.truthTable.equals(root) or i.falseTable.equals(root):
                            best_question.depth = i.depth + 1
                            if best_question.depth > self.max_depth:
                                print("THAT RIGHT THERE WAS TOO MUCH, GET IT GONNEN")
                                return -1
            else:
                return -1

        if best_information_gain == 0:
            return -1
        return best_question
    
    # Just to help visualize
    def tree_ToString(self):
        for pos, q in enumerate(self.questionTree):
            print("Question #",pos, ": ", self.root.columns[q.column], " is ", q.value, " ----> Question #", q.truthQuestion, "or Question #", q.falseQuestion)
    
    # Uses the treeQuestions list to predict a x row
    def predict(self, x):
        global prediction
        # print(self.questionTree.index(self.question))
        if self.question.result(x)[0] != None:
               self.question = self.questionTree[self.question.result(x)[0]]
               self.predict(x)
        else:
            prediction = self.question.result(x)[1].value_counts().sort_values(ascending=False).index[0]
        self.question = self.questionTree[0]
             

In [7]:
df = pd.read_csv("alzheimer.csv")

In [8]:
df["M/F"].value_counts().sort_values(ascending=False).index[0]

'F'

In [9]:
df.head(5)

Unnamed: 0,Group,M/F,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,Nondemented,M,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,Nondemented,M,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,Demented,M,75,12,,23.0,0.5,1678,0.736,1.046
3,Demented,M,76,12,,28.0,0.5,1738,0.713,1.01
4,Demented,M,80,12,,22.0,0.5,1698,0.701,1.034


In [10]:
# CLEANING

In [11]:
# remove nan
df = df[df["SES"].isin([x for x in range(0, 6)])]

In [12]:
# Changes Male and Female to ones and zeros (unnecessary but i did it)
df["M/F"].replace("M", 0, inplace=True)
df["M/F"].replace("F", 1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["M/F"].replace("M", 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["M/F"].replace("F", 1, inplace=True)
  df["M/F"].replace("F", 1, inplace=True)


In [13]:
# Basically, theres too much nonDemented so we split it...

demented = df[df["Group"] == "Demented"]
nonDemented = df[df["Group"] == "Nondemented"]

demented.reset_index(inplace=True, drop=True)
nonDemented.reset_index(inplace=True, drop=True)

In [14]:
difference = nonDemented.shape[0] - demented.shape[0]
difference

63

In [15]:
# ... Then splice demented to make it match demented
nonDemented = nonDemented[nonDemented.index.isin([x for x in range(0, nonDemented.shape[0] - difference)])]

In [16]:
# Represents demented and nondemented ans 1, and 0 
demented.replace("Demented", 1, inplace=True)
nonDemented.replace("Nondemented", 0, inplace=True)

  demented.replace("Demented", 1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demented.replace("Demented", 1, inplace=True)
  nonDemented.replace("Nondemented", 0, inplace=True)


In [17]:
# Rejoins the split
df = pd.concat([demented, nonDemented], axis=0)

In [18]:
# Cleanup
df.reset_index(inplace=True)
df.drop(columns=["index"], inplace=True)

In [19]:
# Including CDR makes the bot perfect (cuz it becomes obvious), so we dont use it
df.drop(columns=["CDR"], inplace=True)

In [20]:
# MAKING TRAINING DATA

In [21]:
# features are the things we feed to the bot
feature = df.drop(columns=["Group"])

# classifiers are the answers
classifier = df["Group"]

In [22]:
# Separates the data into training data (given to bot with answers), and testing data (given to bot without answers to test accuracy)
x_train, x_test, y_train, y_test = model_selection.train_test_split(feature, classifier, test_size=0.2)

In [23]:
# merge training data
x_train["Group"] = y_train
x_train

Unnamed: 0,M/F,Age,EDUC,SES,MMSE,eTIV,nWBV,ASF,Group
195,0,67,12,4.0,30.0,1440,0.727,1.219,0
72,0,84,15,3.0,29.0,1497,0.686,1.172,1
32,1,83,15,2.0,20.0,1476,0.750,1.189,1
214,1,69,18,2.0,29.0,1536,0.719,1.143,0
106,1,75,12,2.0,18.0,1479,0.657,1.187,1
...,...,...,...,...,...,...,...,...,...
16,0,90,12,3.0,21.0,1307,0.679,1.342,1
127,0,87,14,2.0,27.0,1987,0.696,0.883,0
97,1,68,16,1.0,29.0,1344,0.733,1.305,1
43,0,80,14,3.0,29.0,1324,0.695,1.326,1


In [24]:
# Creates the Model
tree = Tree(x_train)

THAT RIGHT THERE WAS TOO MUCH, GET IT GONNEN
THAT RIGHT THERE WAS TOO MUCH, GET IT GONNEN


In [25]:
# Visualization (if it leads to None, then we can predict from there)
tree.tree_ToString()

Question # 0 :  MMSE  is  29.0  ----> Question # 1 or Question # 18
Question # 1 :  Age  is  69  ----> Question # 2 or Question # 13
Question # 2 :  nWBV  is  0.696  ----> Question # 3 or Question # 11
Question # 3 :  eTIV  is  1353  ----> Question # 4 or Question # 10
Question # 4 :  eTIV  is  1891  ----> Question # 5 or Question # 6
Question # 5 :  Age  is  75  ----> Question # None or Question # None
Question # 6 :  M/F  is  1  ----> Question # None or Question # 7
Question # 7 :  eTIV  is  1660  ----> Question # None or Question # 8
Question # 8 :  Age  is  85  ----> Question # None or Question # 9
Question # 9 :  eTIV  is  1651  ----> Question # None or Question # None
Question # 10 :  Age  is  75  ----> Question # None or Question # None
Question # 11 :  EDUC  is  16  ----> Question # None or Question # 12
Question # 12 :  EDUC  is  12  ----> Question # None or Question # None
Question # 13 :  nWBV  is  0.771  ----> Question # 14 or Question # 15
Question # 14 :  M/F  is  1  ----

In [26]:
# Prepares testing data to be assessed by the bot
x_test.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

In [27]:
# Tests the bot
accuracy = 0
for qNumber in range(x_test.shape[0]):
    row = x_test[x_test.index == qNumber]
    row.reset_index(inplace=True, drop=True)
    print(row)
    prediction = None
    tree.predict(row)
    print(prediction)
    if prediction == y_test[qNumber]:
        accuracy += 1


   M/F  Age  EDUC  SES  MMSE  eTIV   nWBV    ASF
0    1   76    18  2.0  30.0  1379  0.757  1.273
0
   M/F  Age  EDUC  SES  MMSE  eTIV   nWBV    ASF
0    0   75    12  4.0  28.0  1511  0.739  1.162
1
   M/F  Age  EDUC  SES  MMSE  eTIV   nWBV   ASF
0    1   73    12  4.0  26.0  1451  0.757  1.21
1
   M/F  Age  EDUC  SES  MMSE  eTIV   nWBV    ASF
0    0   81    12  3.0  27.0  1814  0.759  0.968
0
   M/F  Age  EDUC  SES  MMSE  eTIV  nWBV    ASF
0    1   88    18  3.0  28.0  1215  0.71  1.444
0
   M/F  Age  EDUC  SES  MMSE  eTIV   nWBV  ASF
0    1   78     8  5.0  23.0  1462  0.691  1.2
1
   M/F  Age  EDUC  SES  MMSE  eTIV   nWBV    ASF
0    1   72    12  4.0  26.0  1453  0.777  1.208
1
   M/F  Age  EDUC  SES  MMSE  eTIV   nWBV   ASF
0    1   70    13  4.0  30.0  1361  0.783  1.29
0
   M/F  Age  EDUC  SES  MMSE  eTIV   nWBV    ASF
0    1   78    16  2.0  29.0  1333  0.748  1.316
0
   M/F  Age  EDUC  SES  MMSE  eTIV   nWBV    ASF
0    0   78    14  3.0  30.0  1315  0.707  1.335
0
   M/F  Ag

In [28]:
# Bot Accuracy
accuracy / x_test.shape[0]

0.8235294117647058