# ２値GA
- SVMでクラス分類
    - dataset wine 全体の60%をtest、残り40%をtrainが80% validが20%に分割
    - errorを最小に
    - かつ特徴を減らす


In [1]:
import random
import numpy as np
import copy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_wine

# Load the wine dataset and keep its feature matrix and class labels.
wine = load_wine()
data = wine.data
label = wine.target
# Trailing bare expression: the notebook displays the feature-matrix shape.
data.shape

(178, 13)

# 初期化

In [242]:

def init(num_features = 10, num_instances = 100):
    '''
    Create a random initial population of binary feature masks.

    Parameters
    ----------
    num_features : int
        Number of genes (candidate features) per individual.
    num_instances : int
        Number of individuals in the population.

    Returns
    -------
    np.ndarray of bool, shape (num_instances, num_features)
        An entry is True when its uniform draw from [0, 1) is >= 0.5.
    '''
    # One uniform draw per gene; thresholding at 0.5 is a fair coin flip.
    proba = np.random.uniform(size=(num_instances, num_features))
    return proba >= 0.5

# 評価

In [276]:

def eval(X, X_train,X_valid,y_train,y_valid) -> np.ndarray:
    '''
    Score every individual of the population X by SVM accuracy.

    For each boolean feature mask in X, an SVC is fitted on the selected
    columns of X_train and scored on the same columns of X_valid.
    The score is an accuracy, so larger is better.

    NOTE: the name shadows the builtin eval(); it is kept unchanged so that
    existing callers keep working.

    Parameters
    ----------
    X : np.ndarray of bool, shape (num_instances, num_features)
        Population; each row is used as a boolean column mask.
    X_train, y_train :
        Data and labels used to fit the classifier.
    X_valid, y_valid :
        Data and labels used to measure accuracy.

    Returns
    -------
    np.ndarray, shape (X.shape[0], 1)
        Accuracy of each individual. An individual that selects no feature
        at all gets 0.
    '''
    accu_list = np.zeros((X.shape[0],1))
    for i, mask in enumerate(X):
        # An all-False mask selects a zero-column matrix, which the SVC
        # cannot be fitted on; leave the worst possible score (0) in place.
        if not np.any(mask):
            continue
        m = SVC()
        m.fit(X_train[:, mask], y_train)
        predict = m.predict(X_valid[:, mask])
        accu_list[i] = accuracy_score(y_valid, predict)
    return accu_list

# 選択

In [234]:

def election(X, accu_array):
    '''
    Pick two distinct individuals by fitness-proportional (roulette) selection.

    Parameters
    ----------
    X :
        Population. Not used by the selection itself; kept in the signature
        for existing callers.
    accu_array : array-like
        Fitness of every individual (any shape; it is flattened).

    Returns
    -------
    (choice1, choice2)
        Two different indices into the flattened fitness array.
    '''
    weights = np.asarray(accu_array, dtype=float).ravel()
    total = weights.sum()
    # If every fitness is 0 the normalisation would be 0/0 (NaN) and
    # np.random.choice would raise; draw uniformly in that case.
    p = None if total == 0 else weights / total
    choice1, choice2 = np.random.choice(weights.size, 2, replace=False, p=p)
    return choice1, choice2

# 交叉

In [235]:

def closs_over(x,y):
    '''
    One-point crossover of two individuals.

    A cut position is drawn from 0 .. x.size - 1 and the tails starting at
    that position are exchanged. The inputs are left untouched.

    Returns
    -------
    (child_x, child_y)
        child_x is x with its tail taken from y, child_y is y with its tail
        taken from x.
    '''
    cut = np.random.choice(x.size)

    # Work on copies so the parents are not modified.
    child_x, child_y = x.copy(), y.copy()
    child_x[cut:] = y[cut:]
    child_y[cut:] = x[cut:]
    # (Stacking the head of one parent onto the tail of the other with
    # hstack would do just as well.)

    return child_x, child_y

## 突然変異
mutation rate にしたがって個体群から選び、反転させる

In [236]:

def mutation_(X, mutation_rate = 0.01):
    '''
    Mutate the population in place and return it.

    Every individual gets one uniform draw from [0, 1); those whose draw is
    <= mutation_rate have exactly one randomly chosen gene inverted.
    '''
    num_instances, num_features = X.shape
    draws = np.random.random(size = num_instances)
    # Row indices of the individuals selected for mutation, in ascending order.
    for row in np.flatnonzero(draws <= mutation_rate):
        col = np.random.choice(num_features)
        X[row, col] = not X[row, col]
    return X

In [376]:

def svm_ga(X_train,y_train,X_valid,y_valid,X_test,y_test ,init_num_features = 10 , init_num_instances = 100, num_generations = 100):
    '''
    Feature selection for an SVM classifier with a binary genetic algorithm.

    The data are standardised with a scaler fitted on X_train only. A random
    population of boolean feature masks is then evolved: each generation
    scores the population on the validation split, selects two parents in
    proportion to that score, replaces them with their one-point-crossover
    children and applies mutation.

    Parameters
    ----------
    X_train, y_train : training data / labels (used to fit scaler and SVMs).
    X_valid, y_valid : validation data / labels (fitness during evolution).
    X_test, y_test : test data / labels (scored once, after evolution).
    init_num_features : int
        Length of each mask. Each mask is used as a boolean column index, so
        this has to equal the number of columns of X_train.
    init_num_instances : int
        Population size.
    num_generations : int
        Number of generations to run (default 100).

    Returns
    -------
    X : final population, bool array (init_num_instances, init_num_features)
    accu : validation accuracy of the final population, shape (n, 1)
    accu_test : test accuracy of the final population, shape (n, 1)
    pena_list : shape (n,), accu - (1 - accu) * (selected features / all
        features); it equals 1 only when the validation accuracy is 1.
    '''
    sc = StandardScaler()
    sc.fit(X_train)
    X_train = sc.transform(X_train)
    X_valid = sc.transform(X_valid)
    X_test = sc.transform(X_test)

    X = init(init_num_features, init_num_instances)
    for _ in range(num_generations):
        accu = eval(X, X_train, X_valid, y_train, y_valid)
        # Parents are drawn in proportion to their validation accuracy.
        ele1,ele2 = election(X, accu)
        X[[ele1,ele2]]=closs_over(X[ele1], X[ele2])
        mutation_(X)

    # Re-score after the last crossover/mutation so that the returned scores
    # describe the returned population row for row.
    accu = eval(X, X_train, X_valid, y_train, y_valid)
    accu_test = eval(X,X_train,X_test,y_train,y_test)

    valid_accu = accu.ravel()
    selected_ratio = X.sum(axis=1) / X.shape[1]
    pena_list = valid_accu - (1 - valid_accu) * selected_ratio
    return X,accu,accu_test,pena_list

In [377]:
# Hold out 60% of the wine samples as the test split ...
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.6, random_state=None )
# ... then divide the remaining 40% into 80% train / 20% validation.
# random_state=None: the splits are not seeded.
X_train, X_valid, y_train, y_valid = train_test_split(X_train,y_train,test_size=0.2, random_state = None)

In [383]:

# Run the GA on the wine splits: masks of length 13 (data.shape above is
# (178, 13)) and a population of 100 individuals.
X,accu,accu_test,pena_list = svm_ga(X_train,y_train,X_valid,y_valid,X_test,y_test,13,100)

In [384]:
# List every individual whose penalised score is exactly 1: how many
# features it selects and its row index in the population.
for idx in np.flatnonzero(pena_list == 1):
    print("feature = %d" % X[idx].sum())
    print("index = %d" % idx)
    

feature = 6
index = 3
feature = 6
index = 20
feature = 6
index = 54


In [380]:
# Feature mask of individual 12 and its accuracy on the test split.
# NOTE(review): this cell's execution count (380) precedes the svm_ga run
# above (383), so the output presumably belongs to an earlier population - confirm.
print(X[12])
print(accu_test[12])

[ True False False False  True False  True False False False  True  True
  True]
[0.96261682]


In [385]:
# Feature mask of individual 3 (reported above) and its test accuracy.
print(X[3])
print(accu_test[3])

[ True False  True  True False False  True  True False False  True False
 False]
[0.97196262]


In [386]:
# Feature mask of individual 20 (reported above) and its test accuracy.
print(X[20])
print(accu_test[20])

[ True False  True False False  True  True  True False False  True False
 False]
[0.94392523]


In [387]:
# Feature mask of individual 54 (reported above) and its test accuracy.
print(X[54])
print(accu_test[54])

[False  True False False  True False  True False  True  True False False
  True]
[0.92523364]


In [391]:
from sklearn.datasets import load_breast_cancer
# Same experiment on the breast-cancer dataset.
cancer = load_breast_cancer()
data_new = cancer.data
label_new = cancer.target

# 60% test; the remaining 40% is divided into 80% train / 20% validation.
# The wine splits held in X_train, X_valid, ... are overwritten here.
X_train, X_test, y_train, y_test = train_test_split(data_new, label_new, test_size=0.6, random_state=None )
X_train, X_valid, y_train, y_valid = train_test_split(X_train,y_train,test_size=0.2, random_state = None)

In [393]:
# Run the GA on the breast-cancer splits with masks of length 30 (the masks
# printed below have 30 entries) and a population of 100 individuals.
X,accu,accu_test,pena_list = svm_ga(X_train,y_train,X_valid,y_valid,X_test,y_test,30,100)

In [394]:
# List every individual whose penalised score is exactly 1: how many
# features it selects and its row index in the population.
for idx in np.flatnonzero(pena_list == 1):
    print("feature = %d" % X[idx].sum())
    print("index = %d" % idx)
    

feature = 17
index = 5
feature = 14
index = 10
feature = 12
index = 20
feature = 20
index = 21
feature = 10
index = 33
feature = 10
index = 37
feature = 17
index = 60
feature = 15
index = 69
feature = 13
index = 79


In [395]:
# Feature mask of individual 10 (reported above) and its test accuracy.
print(X[10])
print(accu_test[10])

[False False  True False False False  True  True False  True False False
 False False False  True False  True False False  True  True  True False
  True  True False  True  True  True]
[0.95614035]


In [396]:
# Feature mask of individual 5 (reported above) and its test accuracy.
print(X[5])
print(accu_test[5])

[False False False  True  True  True  True  True  True  True  True False
  True  True False  True  True False False False False  True False False
  True False  True  True  True False]
[0.9502924]


In [404]:
# Reload the wine data for the sequential-selection comparison.
# Note that X is rebound here: it no longer refers to the GA population.
wine = load_wine()
X= wine.data
y = wine.target

In [405]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Baseline for comparison: sequential forward selection of 6 features with
# the same kind of classifier (SVC with default parameters).
svm = SVC()

# NOTE(review): cv=0 presumably disables cross-validation in mlxtend, in which
# case the reported score is measured on the data used for fitting - confirm.
sfs1 = SFS(svm, 
           k_features=6, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='accuracy',
           cv=0)
feature_names = ("Alcohol","Malic acid","Ash","Alcalinity of ash","Magnesium",
                 "Total phenols","Flavanoids","Nonflavanoid phenols","Proanthocyanins",
                 "Color intensity","Hue","OD280/OD315 of diluted wines","Proline")
# Standardise using all samples (no train/test split is made in this cell).
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
    

sfs1 = sfs1.fit(X, y, custom_feature_names=feature_names)



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    0.0s finished

[2020-08-29 18:51:23] Features: 1/6 -- score: 0.8146067415730337[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.0s finished

[2020-08-29 18:51:23] Features: 2/6 -- score: 0.949438202247191[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.0s finished

[2020-08-29 18:51:23] Features: 3/6 -- score: 0.9775280898876404[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 o

In [406]:
# Names of the features kept by the selector stored in sfs1.
sfs1.k_feature_names_


('Alcohol',
 'Ash',
 'Alcalinity of ash',
 'Flavanoids',
 'Nonflavanoid phenols',
 'Hue')

In [407]:
# Score (scoring='accuracy') of the selected feature subset.
# NOTE(review): the selector was built with cv=0, so this is presumably not a
# cross-validated figure - confirm.
sfs1.k_score_

1.0

In [408]:
# Sequential backward selection (forward=False) down to 6 features, with the
# same estimator and settings as the forward run.
# NOTE(review): cv=0 presumably disables cross-validation in mlxtend, in which
# case the reported score is measured on the data used for fitting - confirm.
sbs1 = SFS(svm,
           k_features=6,
           forward=False,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=0)
feature_names = ("Alcohol","Malic acid","Ash","Alcalinity of ash","Magnesium",
                 "Total phenols","Flavanoids","Nonflavanoid phenols","Proanthocyanins",
                 "Color intensity","Hue","OD280/OD315 of diluted wines","Proline")
# Keep the fitted backward selector under its own name. Assigning the result
# to sfs1 would silently replace the forward-selection result.
sbs1 = sbs1.fit(X, y, custom_feature_names=feature_names)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    0.0s finished

[2020-08-29 18:57:22] Features: 12/6 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.0s finished

[2020-08-29 18:57:22] Features: 11/6 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.0s finished

[2020-08-29 18:57:22] Features: 10/6 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:  

In [409]:
# Names of the 6 features kept by the backward selector.
sbs1.k_feature_names_

('Alcohol',
 'Ash',
 'Alcalinity of ash',
 'Flavanoids',
 'Nonflavanoid phenols',
 'Color intensity')

In [410]:
# Score (scoring='accuracy') of the subset kept by the backward selector.
# NOTE(review): the selector was built with cv=0, so this is presumably not a
# cross-validated figure - confirm.
sbs1.k_score_

1.0