In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold

In [2]:
# Load the raw mushroom records. The file has no header row, so pandas labels
# the columns with their integer positions.
# Column 16 is dropped because (per the original author's note) every row holds
# the same value 'p' there. Dropping removes the label 16 from the column index;
# positional (.iloc) access still runs contiguously over the remaining columns.
df = pd.read_csv(
    "data/agaricus-lepiota.data", header=None, index_col=None
).drop(columns=[16])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,s,w,w,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,s,w,w,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,s,w,w,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,s,w,w,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,s,w,w,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,s,o,o,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,s,o,o,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,s,o,o,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,s,k,w,w,w,o,e,w,v,l


In [3]:
# For every column of df (addressed by position), record the distinct values in
# order of first appearance together with how many there are:
#     datum_list[i] == (n_unique, [value, value, ...])
# Series.unique() preserves first-seen order, so the result matches a manual
# row-by-row scan while avoiding one scalar .iloc lookup per cell.
datum_list = []
for i in range(df.shape[1]):
    uniq_datums = df.iloc[:, i].unique().tolist()
    datum_list.append((len(uniq_datums), uniq_datums))

In [4]:
# Inspect the per-column (count, distinct values) pairs; the position of a value
# inside its list is the slot it occupies in that column's one-hot vector.
datum_list

[(2, ['p', 'e']),
 (6, ['x', 'b', 's', 'f', 'k', 'c']),
 (4, ['s', 'y', 'f', 'g']),
 (10, ['n', 'y', 'w', 'g', 'e', 'p', 'b', 'u', 'c', 'r']),
 (2, ['t', 'f']),
 (9, ['p', 'a', 'l', 'n', 'f', 'c', 'y', 's', 'm']),
 (2, ['f', 'a']),
 (2, ['c', 'w']),
 (2, ['n', 'b']),
 (12, ['k', 'n', 'g', 'p', 'w', 'h', 'u', 'e', 'b', 'r', 'y', 'o']),
 (2, ['e', 't']),
 (5, ['e', 'c', 'b', 'r', '?']),
 (4, ['s', 'f', 'k', 'y']),
 (4, ['s', 'f', 'y', 'k']),
 (9, ['w', 'g', 'p', 'n', 'b', 'e', 'o', 'c', 'y']),
 (9, ['w', 'p', 'g', 'b', 'n', 'e', 'y', 'o', 'c']),
 (4, ['w', 'n', 'o', 'y']),
 (3, ['o', 't', 'n']),
 (5, ['p', 'e', 'l', 'f', 'n']),
 (9, ['k', 'n', 'u', 'h', 'w', 'r', 'o', 'y', 'b']),
 (6, ['s', 'n', 'a', 'v', 'y', 'c']),
 (7, ['u', 'g', 'm', 'd', 'p', 'w', 'l'])]

In [5]:
def get_feature_OHE(feature, value):
    """Return the one-hot encoding of ``value`` for column ``feature``.

    Parameters
    ----------
    feature : int
        Position of the column in ``datum_list``.
    value : str
        The raw (single-character) category found in that column.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``datum_list[feature][0]`` that is all zeros
        except for a 1 at the position of ``value`` within
        ``datum_list[feature][1]``.

    Raises
    ------
    ValueError
        If ``value`` is not one of the categories recorded for ``feature``.
    IndexError
        If ``feature`` is outside the range of ``datum_list``.
    """
    n_values, values = datum_list[feature]
    # Look the category up once and reuse the position.
    idx = values.index(value)
    ret_val = np.zeros(n_values)
    ret_val[idx] = 1

    return ret_val

In [6]:
# Sanity check: 'g' is the fourth value recorded for column 3, so the single 1
# should land at index 3 of a length-10 vector.
get_feature_OHE(3, 'g')

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])

In [7]:
# Total width of the encoded matrix: the sum of the distinct-value counts
# (first element of each datum_list entry) over all columns.
feature_num = sum(n_values for n_values, _ in datum_list)

In [8]:
# Total number of one-hot slots across every column of df (label column included).
feature_num

118

In [9]:
# Pre-allocate the encoded matrix: one row per record in df and one column per
# one-hot slot (feature_num in total), all zeros to start with.
NN_input = np.zeros(shape=(len(df.index), feature_num))

In [10]:
# Fill NN_input one record at a time. Each column's one-hot vector is written
# into its own contiguous slice; slices are laid out left to right in column
# order, so the write cursor restarts at 0 for every row.
n_rows, n_cols = df.shape
for i in range(n_rows):
    start = 0
    for j in range(n_cols):
        one_hot = get_feature_OHE(j, df.iloc[i, j])
        # The slice width for column j is its number of distinct values.
        stop = start + datum_list[j][0]
        NN_input[i, start:stop] = one_hot
        start = stop
        

In [11]:
# Label encoding: column 0 of NN_input is the 'p' indicator, so poisonous = 1 ; edible = 0
# NN_output = NN_input[:,:1]
# NN_input = NN_input[:,2:]

In [22]:
# Shuffle the encoded rows and hold out 25% of them for validation.
# train_test_split returns [train_part, test_part] for the single array passed in.
# NOTE(review): no random_state is passed, so the split changes on every run.
rand_data = train_test_split(NN_input, train_size=0.75)
train_rows, valid_rows = rand_data

# Columns 0-1 are the one-hot class label ('p', 'e'); column 0 alone is used as
# the target (poisonous = 1, edible = 0). Columns 2 onward are the encoded
# features. Each split gets its own inputs (features) and outputs (target);
# previously the names were crossed between the two splits.
train_inputs = train_rows[:, 2:]
train_outputs = train_rows[:, :1]
valid_inputs = valid_rows[:, 2:]
valid_outputs = valid_rows[:, :1]

print(train_inputs.shape)
print(train_outputs.shape)
print(valid_inputs.shape)
print(valid_outputs.shape)

(6093, 116)
(6093, 1)
(2031, 116)
(2031, 1)


In [46]:
def split_in_out(NN_input):
    """Split an encoded 2-D array into its first column and its feature columns.

    Returns a 2-tuple ``(first_column, rest)`` where ``first_column`` is column 0
    kept two-dimensional (shape ``(n, 1)``) and ``rest`` is every column from
    index 2 onward. Column 1 is not returned.

    NOTE(review): the name suggests ``(inputs, outputs)``, but the tuple is
    ordered with the single leading column first and the remaining columns
    second -- confirm callers unpack it in that order.
    """
    first_column = NN_input[:, :1]
    rest = NN_input[:, 2:]
    return (first_column, rest)

In [57]:
# Number of cross-validation folds.
K = 7
# shuffle=True randomizes the row order before the folds are cut.
# NOTE(review): no random_state is set, so fold membership changes on every run.
kf = KFold(n_splits=K, shuffle=True)

# One entry per fold. Each entry is the tuple returned by split_in_out for the
# rows of that fold: (column 0 as an (n, 1) array, columns 2 onward).
train_data_set = []
test_data_set = []
for train_index, test_index in kf.split(NN_input):
    print("TRAIN:", train_index.shape, "TEST:", test_index.shape)
    train_data_set.append(split_in_out(NN_input[train_index, :]))
    test_data_set.append(split_in_out(NN_input[test_index, :]))

TRAIN: (6963,) TEST: (1161,)
TRAIN: (6963,) TEST: (1161,)
TRAIN: (6963,) TEST: (1161,)
TRAIN: (6963,) TEST: (1161,)
TRAIN: (6964,) TEST: (1160,)
TRAIN: (6964,) TEST: (1160,)
TRAIN: (6964,) TEST: (1160,)


(6964, 116)