In [47]:
import pandas as pd
import numpy as np

In [48]:
# Location of the drug-classification CSV.
# NOTE(review): absolute path on a local D: drive; the notebook will not run on
# another machine until this is adjusted.
DRUG_CSV_PATH = 'D:/ml/Projects/drug200.csv'
dataset = pd.read_csv(DRUG_CSV_PATH)

In [49]:
# Preview the first five rows of the freshly loaded frame.
dataset.head(n=5)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [50]:
# Number of missing values in each column.
dataset.isna().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [51]:
# Binary indicator: 0 where Sex is exactly 'F', 1 for every other value.
is_female = dataset["Sex"] == 'F'
dataset["Sex_male"] = np.where(is_female, 0, 1)

In [52]:
# Draw three column names at random (with replacement, numpy's default).
# No seed is set, so the result changes from run to run.
np.random.choice(dataset.columns, size=3)

array(['Na_to_K', 'Age', 'BP'], dtype=object)

In [53]:
# Keep a handle on the Age column.
# NOTE(review): in the visible cells `y` is reassigned before it is ever read.
y = dataset["Age"]

In [54]:
# Remove the text column "Sex"; the numeric "Sex_male" indicator created earlier
# replaces it. The frame is reassigned rather than mutated with inplace=True so
# the cell reads as an explicit "old frame -> new frame" step.
dataset = dataset.drop(columns="Sex")
dataset.head()

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,Drug,Sex_male
0,23,HIGH,HIGH,25.355,drugY,0
1,47,LOW,HIGH,13.093,drugC,1
2,47,LOW,HIGH,10.114,drugC,1
3,28,NORMAL,HIGH,7.798,drugX,0
4,61,LOW,HIGH,18.043,drugY,0


In [55]:
# Ordinal codes for the blood-pressure levels, listed from lowest to highest.
bp_encode = {"LOW": 1, "NORMAL": 2, "HIGH": 3}

# How often each BP level occurs (still the raw string labels at this point).
dataset["BP"].value_counts()

BP
HIGH      77
LOW       64
NORMAL    59
Name: count, dtype: int64

In [56]:
# Replace the BP labels with their ordinal codes from bp_encode.
# Values that are not keys of bp_encode become NaN, so running this cell a
# second time on the already-encoded column wipes it out.
dataset["BP"] = dataset["BP"].map(bp_encode)
dataset.head()

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,Drug,Sex_male
0,23,3,HIGH,25.355,drugY,0
1,47,1,HIGH,13.093,drugC,1
2,47,1,HIGH,10.114,drugC,1
3,28,2,HIGH,7.798,drugX,0
4,61,1,HIGH,18.043,drugY,0


In [57]:
# Inspect the second row (position 1) across all columns.
dataset.iloc[1]

Age                47
BP                  1
Cholesterol      HIGH
Na_to_K        13.093
Drug            drugC
Sex_male            1
Name: 1, dtype: object

In [58]:
# Cholesterol levels ordered from least to most frequent.
# Despite its name, `cholesterol_count` holds the level labels (an Index),
# not the counts themselves.
cholesterol_freq = dataset["Cholesterol"].value_counts()
cholesterol_count = cholesterol_freq.sort_values().index
cholesterol_count

Index(['NORMAL', 'HIGH'], dtype='object', name='Cholesterol')

In [59]:
# Explicit ordinal encoding for cholesterol: NORMAL < HIGH.
# Previously the codes were derived from the frequency ranking of the levels,
# so the meaning of 0/1 would flip whenever the class counts changed. Pinning
# the mapping keeps the encoding stable; on this dataset it is the same mapping.
cholesterol_map = {"NORMAL": 0, "HIGH": 1}

In [60]:
# Show the label -> code mapping that will be applied to the Cholesterol column.
print(cholesterol_map)

{'NORMAL': 0, 'HIGH': 1}


In [61]:
# Swap the Cholesterol labels for their integer codes; labels missing from
# cholesterol_map would turn into NaN.
dataset["Cholesterol"] = dataset["Cholesterol"].map(cholesterol_map)

In [62]:
# Check the frame after the Cholesterol column has been encoded.
dataset.head(n=5)

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,Drug,Sex_male
0,23,3,1,25.355,drugY,0
1,47,1,1,13.093,drugC,1
2,47,1,1,10.114,drugC,1
3,28,2,1,7.798,drugX,0
4,61,1,1,18.043,drugY,0


In [63]:
# Distinct drug labels in sorted label order.
drug_freq = dataset["Drug"].value_counts()
drug_types = drug_freq.sort_index().index
drug_types

Index(['drugA', 'drugB', 'drugC', 'drugX', 'drugY'], dtype='object', name='Drug')

In [64]:
# Assign each drug label its position in the sorted label list as class id.
drug_map = {drug_name: code for code, drug_name in enumerate(drug_types)}
drug_map

{'drugA': 0, 'drugB': 1, 'drugC': 2, 'drugX': 3, 'drugY': 4}

In [65]:
# Encode the target column with the integer class ids from drug_map.
dataset["Drug"] = dataset["Drug"].map(drug_map)
dataset.head()

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,Drug,Sex_male
0,23,3,1,25.355,4,0
1,47,1,1,13.093,2,1
2,47,1,1,10.114,2,1
3,28,2,1,7.798,3,0
4,61,1,1,18.043,4,0


In [66]:
# Pick one column position at random (unseeded) and list the distinct values
# found in that column.
n_columns = dataset.shape[1]
feats = np.random.choice(n_columns, size=1, replace=True)
print(feats)
np.unique(dataset.iloc[:, feats])

[1]


array([1, 2, 3], dtype=int64)

In [67]:
from sklearn.model_selection import train_test_split

In [68]:
# 80/20 shuffled split with a fixed seed; "Drug" is the target and every other
# column is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.drop("Drug", axis=1),
    dataset.Drug,
    test_size=0.2,
    shuffle=True,
    random_state=23,
)

In [69]:
# Frequency of each age in the training split, ordered by age.
X_train["Age"].value_counts().sort_index()

Age
15    3
16    3
17    1
18    2
19    2
20    4
22    5
23    4
24    3
25    1
26    2
28    6
29    1
30    1
31    4
32    4
34    4
35    3
36    4
37    2
38    3
39    6
40    1
41    4
42    2
43    4
45    2
46    2
47    8
48    2
49    5
50    4
51    4
52    1
53    2
54    1
55    2
56    3
57    1
58    5
59    3
60    3
61    3
62    1
63    1
64    1
65    3
66    2
67    3
68    4
69    3
70    3
72    4
73    1
74    4
Name: count, dtype: int64

In [70]:
# Flag training rows by age: 0 where Age > 45, 1 otherwise.
X_col = X_train['Age']
com = np.where(X_col > 45, 0, 1)
com


array([0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 1])

In [71]:
# Index labels of training rows with Age > 45, then the age distribution
# inside that subset.
over_45 = X_train['Age'] > 45
idx = X_train.loc[over_45].index
X_train.loc[idx, 'Age'].value_counts()

Age
47    8
49    5
58    5
68    4
51    4
74    4
72    4
50    4
60    3
67    3
56    3
69    3
59    3
65    3
61    3
70    3
46    2
48    2
53    2
66    2
55    2
52    1
54    1
64    1
57    1
63    1
62    1
73    1
Name: count, dtype: int64

In [72]:
# Split the training index on Age at 45 and sanity-check the upper group.
X_col = X_train["Age"]

# Labels of rows with Age > 45; their max and min confirm the boundary.
idx = X_col[X_col > 45].index
older_ages = X_col[idx]
# Vectorised Series reductions instead of Python's builtin max()/min(), which
# iterate over the Series element by element.
print(older_ages.max())
print(older_ages.min())

# Labels of the complementary group (Age <= 45).
idx1 = X_col[X_col <= 45].index

74
46


In [73]:
# Look up targets and ages for the Age <= 45 group. Only the last expression
# of a cell is displayed, so the target lookup on the first line is evaluated
# and then discarded.
y_train.loc[idx1]
X_train.loc[idx1, "Age"]

172    39
28     39
69     18
180    22
17     43
       ..
123    36
39     15
91     41
182    20
83     38
Name: Age, Length: 81, dtype: int64

In [74]:
# Age column of the training split (same binding as in the earlier cell).
X_col = X_train["Age"]

In [75]:
# Number of training rows with Age strictly below 45.
# NOTE(review): the earlier cells split at > 45 / <= 45, while this one uses
# < 45 — confirm which threshold is intended.
len(X_col.loc[X_col < 45])


79

In [76]:
# Number of training rows with Age of 45 or more (complement of the < 45 group).
len(X_col.loc[X_col >= 45])

81

In [77]:
# 0/1 membership vector for the Age < 45 group; its sum is the group size.
below_45 = X_col < 45
l_idx = np.where(below_45, 1, 0)
l_idx.sum()

79

In [78]:
# Project-local decision-tree classifier (module source not shown in this
# notebook), constructed with its default arguments.
from DecisionTree import DecisionTree
clf = DecisionTree()

In [79]:
# Train the custom tree on the drug training split produced by train_test_split.
clf.fit(X_train,y_train)

In [80]:
# Predict drug classes for the held-out split.
# NOTE(review): y_predict is never compared with y_test in the visible cells.
y_predict = clf.predict(X_test)

In [81]:
# Second experiment: a different project-local tree (DecisionTree1) on
# scikit-learn's breast-cancer dataset.
from decisiontree1 import DecisionTree1
from sklearn import datasets
from sklearn.model_selection import train_test_split
# load_breast_cancer(return_X_y=True) returns a (features, target) pair.
X,y = datasets.load_breast_cancer(return_X_y=True)
# NOTE: this rebinds X_train / X_test / y_train / y_test, discarding the drug
# split made earlier; cells run after this point operate on breast-cancer data.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)
# Tree configured with the 'gini' criterion; not fitted yet.
dt = DecisionTree1(criteria='gini')


In [86]:
# Wrap the split outputs in DataFrames (columns become 0..n-1).
# NOTE(review): y_train becomes a one-column DataFrame, not a Series; the
# fit cell's recorded KeyError may be related — check what DecisionTree1.fit
# expects for its target argument.
X_train, y_train, X_test = (
    pd.DataFrame(split_part) for split_part in (X_train, y_train, X_test)
)
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,17.99,20.66,117.80,991.7,0.10360,0.13040,0.120100,0.088240,0.1992,0.06069,...,21.080,25.41,138.10,1349.0,0.14820,0.37350,0.33010,0.19740,0.3060,0.08503
1,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.198000,0.104300,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.40000,0.16250,0.2364,0.07678
2,9.00,14.40,56.36,246.3,0.07005,0.03116,0.003681,0.003472,0.1788,0.06833,...,9.699,20.07,60.90,285.5,0.09861,0.05232,0.01472,0.01389,0.2991,0.07804
3,12.21,14.09,78.78,462.0,0.08108,0.07823,0.068390,0.025340,0.1646,0.06154,...,13.130,19.29,87.65,529.9,0.10260,0.24310,0.30760,0.09140,0.2677,0.08824
4,12.34,14.95,78.29,469.1,0.08682,0.04571,0.021090,0.020540,0.1571,0.05708,...,13.180,16.85,84.11,533.1,0.10480,0.06744,0.04921,0.04793,0.2298,0.05974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450,19.79,25.12,130.40,1192.0,0.10150,0.15890,0.254500,0.114900,0.2202,0.06113,...,22.630,33.58,148.70,1589.0,0.12750,0.38610,0.56730,0.17320,0.3305,0.08465
451,10.75,14.97,68.26,355.3,0.07793,0.05139,0.022510,0.007875,0.1399,0.05688,...,11.950,20.72,77.79,441.2,0.10760,0.12230,0.09755,0.03413,0.2300,0.06769
452,17.20,24.52,114.20,929.4,0.10710,0.18300,0.169200,0.079440,0.1927,0.06487,...,23.320,33.82,151.60,1681.0,0.15850,0.73940,0.65660,0.18990,0.3313,0.13390
453,14.03,21.25,89.79,603.4,0.09070,0.06945,0.014620,0.018960,0.1517,0.05835,...,15.330,30.28,98.27,715.5,0.12870,0.15130,0.06231,0.07963,0.2226,0.07617


In [87]:
# Fit DecisionTree1 on the breast-cancer training data.
# NOTE(review): the recorded output is "KeyError: '[1] not in index'", so this
# cell does not currently run cleanly; the cause lies inside DecisionTree1.fit
# or in the input types passed here — TODO investigate.
dt.fit(X_train,y_train)

KeyError: '[1] not in index'

In [88]:
# Store whatever dt.plot_tree() returns in a one-row frame under column 'op'.
# NOTE(review): the recorded output shows text printed by the call and an empty
# 'op' cell, so plot_tree() presumably prints the tree and returns nothing
# useful — verify before relying on `t`.
t = pd.DataFrame({'op':[dt.plot_tree()]})
t.head()

1 		 0 		 0
0 		 13 		 24.72
1 		 0 		 0


Unnamed: 0,op
0,


In [85]:
# Predict with the DecisionTree1 model; this overwrites the earlier y_predict.
# NOTE(review): the recorded AttributeError ('numpy.ndarray' ... 'iloc') carries
# execution count 85, i.e. it ran before the cell numbered 86 wrapped X_test in
# a DataFrame — the cell order in this notebook does not match execution order.
y_predict = dt.predict(X_test)

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [None]:
# Display the predictions.
# NOTE(review): the saved output is a list of 40 zeros, which does not match a
# 20% breast-cancer test split — it looks like stale output from another run.
y_predict

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

2 		 4 		 0
1 		 0 		 23
2 		 2 		 0
0 		 3 		 13.934
2 		 4 		 0
1 		 1 		 1
2 		 2 		 0


Unnamed: 0,op
0,


In [None]:
# Inspect the `feature` attribute stored on the fitted tree's root node
# (presumably the split column index — confirm against DecisionTree1).
dt.root.feature

3

In [None]:
# Inspect the `thr` attribute of the root node (presumably the split
# threshold — confirm against DecisionTree1).
print(dt.root.thr)

13.934
