# １０月４日

In [1]:
from IPython.display import Image
import re

# 機械学習関連のライブラリ群

from sklearn.cross_validation import train_test_split # 訓練データとテストデータに分割
from sklearn.metrics import confusion_matrix # 混合行列

from sklearn.decomposition import PCA #主成分分析
from sklearn.linear_model import LogisticRegression # ロジスティック回帰
from sklearn.neighbors import KNeighborsClassifier # K近傍法
from sklearn.svm import SVC # サポートベクターマシン
from sklearn.tree import DecisionTreeClassifier # 決定木
from sklearn.ensemble import RandomForestClassifier # ランダムフォレスト
from sklearn.ensemble import AdaBoostClassifier # AdaBoost
from sklearn.naive_bayes import GaussianNB # ナイーブ・ベイズ
from sklearn.lda import LDA # 線形判別分析
from sklearn.qda import QDA # 二次判別分析

# 数値計算やデータフレーム操作に関するライブラリをインポートする
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
# 図やグラフを図示するためのライブラリをインポートする。
%matplotlib inline
import matplotlib.pyplot as plt
from pandas.tools import plotting

In [2]:
from kcfconvoy.Library import Library

In [3]:
# Path of the KEGG BRITE hierarchy file that is parsed further down.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
brite = '/Users/kot/Dropbox/kegg/br08007.keg'

In [None]:
# Library instance that the entries listed in the BRITE file are loaded into.
library = Library()

In [None]:
# Parse the KEGG BRITE hierarchy file: load every compound/drug entry into
# `library` and record, per hierarchy level, which entry IDs fall under the
# category that is currently open at that level.
#
# `alphabet` stays a list on purpose: `'' in some_list` is False (so empty
# lines are skipped), whereas `'' in some_string` would be True.
alphabet = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')  # level markers at column 0
hierarchy = {}  # level letter -> name of the category currently open at that level
group = {}      # "<level>:<category name>" -> list of entry IDs under it
_id = ''
with open(brite) as f:
    for line in f:
        head = line[0:1]
        if head not in alphabet:
            continue  # header, comment or blank line
        if head not in hierarchy:
            hierarchy[head] = ''
        # Entry lines carry a KEGG compound (Cnnnnn) or drug (Dnnnnn) ID.
        is_entry = re.search(r'[CD]\d{5}', line) is not None
        if is_entry:
            _id = line.split()[1]
            #print(_id)
            library.input_from_kegg(_id)
        else:
            hierarchy[head] = line[1:].strip()
        for level, name in hierarchy.items():
            key = level + ":" + name
            members = group.setdefault(key, [])
            # Only entry lines contribute an ID. Appending on category lines
            # as well would put the previous entry (or '') into the category
            # that has just been opened.
            if is_entry:
                members.append(_id)
        # NOTE(review): deeper levels keep their old category name when a
        # higher-level category changes; an entry listed directly under the
        # new category is then also filed under the stale deeper one. Confirm
        # whether the input file can contain such entries.

In [None]:
len(library.cpds)

In [None]:
group_names = sorted(list(group.keys()))
for i, group_name in enumerate(group_names):
    if len(group[group_name]) < 10:
        continue
    print(i, group_name, len(group[group_name]))

In [None]:
group_names[3]

In [None]:
classes = [1 if id in group[group_names[3]] else 0 for id in library.names]

In [None]:
print(classes)

In [None]:
# Classifiers to compare, each paired with its display name. Keeping the
# pairs together guarantees that `names` and `classifiers` stay aligned.
_named_classifiers = [
    ("Logistic Regression", LogisticRegression()),
    ("Nearest Neighbors", KNeighborsClassifier()),
    ("Linear SVM", SVC(kernel="linear")),
    ("Polynomial SVM", SVC(kernel="poly")),
    ("RBF SVM", SVC(kernel="rbf")),
    ("Sigmoid SVM", SVC(kernel="sigmoid")),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB()),
    ("Linear Discriminant Analysis", LDA()),
    ("Quadratic Discriminant Analysis", QDA()),
]

names = [label for label, _ in _named_classifiers]

classifiers = [model for _, model in _named_classifiers]

In [None]:
# Compute the fingerprints with variant x=0 (meaning of `x` is defined by
# Library.calc_fingerprints — TODO confirm) and expand each one into a row of
# 0/1 integers, one column per bit.
library.calc_fingerprints(x=0)
# ToBitString() is evaluated once per fingerprint instead of once per bit.
X = [[int(bit) for bit in fp.ToBitString()] for fp in library.fps]
y = classes

In [None]:
!date
result = []
for trial in range(10): # 10 回繰り返す
    print(trial)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) # 訓練データ・テストデータの生成
    for name, clf in zip(names, classifiers): # 指定した複数の分類機を順番に呼び出す
        try:
            clf.fit(X_train, y_train) # 学習
            score1 = clf.score(X_train, y_train) # 正解率（train）の算出
            score2 = clf.score(X_test, y_test) # 正解率（test）の算出
            result.append([name, score1, score2]) # 結果の格納
        except:
            continue

df_result = pd.DataFrame(result, columns=['classifier', 'train', 'test']) # 今回はまだ並べ替えはしない
!date
# 分類器 (classifier) 毎にグループ化して正解率の平均を計算し、test の正解率の平均の大きい順に並べる
df_result_mean = df_result.groupby('classifier').mean().sort_values('test', ascending=False)
# エラーバーの表示に用いる目的で、標準偏差を計算する
errors = df_result.groupby('classifier').std()
# 平均値と標準偏差を用いて棒グラフを描画
df_result_mean.plot(kind='bar', alpha=0.5, grid=True, yerr=errors, ylim=[0, 1])

In [None]:
df_result_mean

In [None]:
errors

In [None]:
library.calc_fingerprints(x=1)
X = [[int(fp.ToBitString()[i:i+1]) for i in range(len(fp.ToBitString()))] for fp in library.fps]
y = classes
!date
result = []
for trial in range(10): # 10 回繰り返す
    print(trial)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) # 訓練データ・テストデータの生成
    for name, clf in zip(names, classifiers): # 指定した複数の分類機を順番に呼び出す
        try:
            clf.fit(X_train, y_train) # 学習
            score1 = clf.score(X_train, y_train) # 正解率（train）の算出
            score2 = clf.score(X_test, y_test) # 正解率（test）の算出
            result.append([name, score1, score2]) # 結果の格納
        except:
            continue

df_result = pd.DataFrame(result, columns=['classifier', 'train', 'test']) # 今回はまだ並べ替えはしない
!date
# 分類器 (classifier) 毎にグループ化して正解率の平均を計算し、test の正解率の平均の大きい順に並べる
df_result_mean = df_result.groupby('classifier').mean().sort_values('test', ascending=False)
# エラーバーの表示に用いる目的で、標準偏差を計算する
errors = df_result.groupby('classifier').std()
# 平均値と標準偏差を用いて棒グラフを描画
df_result_mean.plot(kind='bar', alpha=0.5, grid=True, yerr=errors, ylim=[0, 1])

In [None]:
df_result_mean

In [None]:
errors

In [None]:
library.calc_fingerprints(x=2)
X = [[int(fp.ToBitString()[i:i+1]) for i in range(len(fp.ToBitString()))] for fp in library.fps]
y = classes
!date
result = []
for trial in range(10): # 10 回繰り返す
    print(trial)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) # 訓練データ・テストデータの生成
    for name, clf in zip(names, classifiers): # 指定した複数の分類機を順番に呼び出す
        try:
            clf.fit(X_train, y_train) # 学習
            score1 = clf.score(X_train, y_train) # 正解率（train）の算出
            score2 = clf.score(X_test, y_test) # 正解率（test）の算出
            result.append([name, score1, score2]) # 結果の格納
        except:
            continue

df_result = pd.DataFrame(result, columns=['classifier', 'train', 'test']) # 今回はまだ並べ替えはしない
!date
# 分類器 (classifier) 毎にグループ化して正解率の平均を計算し、test の正解率の平均の大きい順に並べる
df_result_mean = df_result.groupby('classifier').mean().sort_values('test', ascending=False)
# エラーバーの表示に用いる目的で、標準偏差を計算する
errors = df_result.groupby('classifier').std()
# 平均値と標準偏差を用いて棒グラフを描画
df_result_mean.plot(kind='bar', alpha=0.5, grid=True, yerr=errors, ylim=[0, 1])

In [None]:
df_result_mean

In [None]:
errors

In [None]:
library.calc_fingerprints(x=3)
X = [[int(fp.ToBitString()[i:i+1]) for i in range(len(fp.ToBitString()))] for fp in library.fps]
y = classes
!date
result = []
for trial in range(10): # 10 回繰り返す
    print(trial)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) # 訓練データ・テストデータの生成
    for name, clf in zip(names, classifiers): # 指定した複数の分類機を順番に呼び出す
        try:
            clf.fit(X_train, y_train) # 学習
            score1 = clf.score(X_train, y_train) # 正解率（train）の算出
            score2 = clf.score(X_test, y_test) # 正解率（test）の算出
            result.append([name, score1, score2]) # 結果の格納
        except:
            continue

df_result = pd.DataFrame(result, columns=['classifier', 'train', 'test']) # 今回はまだ並べ替えはしない
!date
# 分類器 (classifier) 毎にグループ化して正解率の平均を計算し、test の正解率の平均の大きい順に並べる
df_result_mean = df_result.groupby('classifier').mean().sort_values('test', ascending=False)
# エラーバーの表示に用いる目的で、標準偏差を計算する
errors = df_result.groupby('classifier').std()
# 平均値と標準偏差を用いて棒グラフを描画
df_result_mean.plot(kind='bar', alpha=0.5, grid=True, yerr=errors, ylim=[0, 1])

In [None]:
df_result_mean

In [None]:
errors

In [None]:
import kcf.converter as kcfco
#from nxrd.Compound import Compound
!date
kcfvecs = [kcfco.kcf_vec(cpd) for cpd in library.cpds]
!date

In [None]:
!date
kcfvecs_strs = list(set([item for kcfvec in kcfvecs for item in kcfvec.strs]))
!date

In [None]:
!date
kcfvecs2 = []
for kcfvec in kcfvecs:
    vec = np.zeros(len(kcfvecs_strs))
    for _str in enumerate(kcfvec.strs):
        vec[kcfvecs_strs.index(_str[1])] = kcfvec.counts[_str[0]]
    kcfvecs2.append(vec)
!date

In [None]:
X = kcfvecs2
y = classes

In [None]:
!date
result = []
for trial in range(10): # 10 回繰り返す
    print(trial)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) # 訓練データ・テストデータの生成
    for name, clf in zip(names, classifiers): # 指定した複数の分類機を順番に呼び出す
        try:
            clf.fit(X_train, y_train) # 学習
            score1 = clf.score(X_train, y_train) # 正解率（train）の算出
            score2 = clf.score(X_test, y_test) # 正解率（test）の算出
            result.append([name, score1, score2]) # 結果の格納
        except:
            continue

df_result = pd.DataFrame(result, columns=['classifier', 'train', 'test']) # 今回はまだ並べ替えはしない
!date
# 分類器 (classifier) 毎にグループ化して正解率の平均を計算し、test の正解率の平均の大きい順に並べる
df_result_mean = df_result.groupby('classifier').mean().sort_values('test', ascending=False)
# エラーバーの表示に用いる目的で、標準偏差を計算する
errors = df_result.groupby('classifier').std()
# 平均値と標準偏差を用いて棒グラフを描画
df_result_mean.plot(kind='bar', alpha=0.5, grid=True, yerr=errors, ylim=[0, 1])

In [None]:
df_result_mean

In [None]:
errors

In [None]:
kcfvecs2

In [None]:
!date
kcfvecs3 = np.array([])
for kcfvec in kcfvecs:
    vec = np.zeros(len(kcfvecs_strs))
    for _str in enumerate(kcfvec.strs):
        vec[kcfvecs_strs.index(_str[1])] = kcfvec.counts[_str[0]]
    kcfvecs3 = np.append(kcfvecs3, vec)
!date

時間かかりすぎ
!date
# Alternative attempt that walks the vocabulary instead of the compounds: for
# each string in kcfvecs_strs, collect its count in every compound (0 when the
# compound does not contain it).
# NOTE(review): np.append() is called without `axis`, so the per-string
# vectors are flattened into one 1-D `array`, and the array is copied on every
# iteration. Nothing later in the visible notebook reads `array`.
array = np.array([])
accepted_strs = []  # only referenced by the commented-out filter below
for _str in kcfvecs_strs:
    #ary = []
    #for kcfvec in kcfvecs:
        #print(_str)
    #    if _str[1] in kcfvec.strs:
    ##        ary.append(kcfvec.counts[　.strs.index(_str[1])])
    #        print(kcfvec.counts[kcfvec.strs.index(_str[1])])
    #    else:
    #        ary.append(0)
    # Counts of this string across all compounds, in kcfvecs order.
    ary = np.array([kcfvec.counts[kcfvec.strs.index(_str)] if _str in kcfvec.strs else 0 for kcfvec in kcfvecs])
    #if np.count_nonzero(ary) > len(kcfvecs_strs) / 20:
    array = np.append(array, ary)
    #    accepted_strs.append(_str[1])
    #if len(array) > 10:
    #    break
!date

In [None]:
!date
kcfvecs3 = np.zeros((len(kcfvecs), len(kcfvecs_strs)))
for i, kcfvec in enumerate(kcfvecs):
    #vec = np.zeros(len(kcfvecs_strs))
    for j, _str in enumerate(kcfvec.strs):
        kcfvecs3[i][kcfvecs_strs.index(_str)] = kcfvec.counts[j]
        #print(i, j, kcfvecs_strs.index(_str), kcfvec.counts[j], _str)
    #np.append(kcfvecs3, vec)
    #print(i)
    if i%50 == 0:
        !date
        print(i)
    #    break
!date

In [None]:
pd.DataFrame(kcfvecs3)

In [None]:
kcfvecs3T = kcfvecs3.T

In [None]:
np.where(kcfvecs3T == 0)

In [None]:
kcfvecs3T

In [None]:
plt.hist(kcfvecs3.std(axis = 0), bins=100, range=(0, 0.2))
plt.grid()
plt.show()

In [None]:
plt.hist(kcfvecs3.mean(axis = 0), bins=100, range=(0, 0.02))
plt.grid()
plt.show()

In [None]:
pd.DataFrame(kcfvecs3T)

In [None]:
min_cpd = len(kcfvecs3) / 100
mask_array = np.array([len(np.where(a!=0)[0]) > min_cpd for a in kcfvecs3T])
kcfvecs3T2 = kcfvecs3T[mask_array]

In [None]:
pd.DataFrame(kcfvecs3T2)

In [None]:
kcfvecs4 = kcfvecs3T2.T

In [None]:
pd.DataFrame(kcfvecs4)

In [None]:
X = kcfvecs4
y = classes

In [None]:
!date
result = []
for trial in range(10): # 10 回繰り返す
    print(trial)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) # 訓練データ・テストデータの生成
    for name, clf in zip(names, classifiers): # 指定した複数の分類機を順番に呼び出す
        try:
            clf.fit(X_train, y_train) # 学習
            score1 = clf.score(X_train, y_train) # 正解率（train）の算出
            score2 = clf.score(X_test, y_test) # 正解率（test）の算出
            result.append([name, score1, score2]) # 結果の格納
        except:
            continue

df_result = pd.DataFrame(result, columns=['classifier', 'train', 'test']) # 今回はまだ並べ替えはしない
!date
# 分類器 (classifier) 毎にグループ化して正解率の平均を計算し、test の正解率の平均の大きい順に並べる
df_result_mean = df_result.groupby('classifier').mean().sort_values('test', ascending=False)
# エラーバーの表示に用いる目的で、標準偏差を計算する
errors = df_result.groupby('classifier').std()
# 平均値と標準偏差を用いて棒グラフを描画
df_result_mean.plot(kind='bar', alpha=0.5, grid=True, yerr=errors, ylim=[0, 1])

In [None]:
df_result_mean

In [None]:
errors

In [None]:
# Feature selection: keep only the vocabulary strings that occur (non-zero
# count) in more than 1/200 of the compounds.
min_cpd = len(kcfvecs3) / 200
# Vectorised count of non-zero entries per row of kcfvecs3T (= per string);
# always a boolean mask, also when there are no rows at all.
mask_array = (kcfvecs3T != 0).sum(axis=1) > min_cpd
kcfvecs3T2 = kcfvecs3T[mask_array]
kcfvecs4 = kcfvecs3T2.T  # back to compounds-as-rows orientation
pd.DataFrame(kcfvecs4)

In [None]:
X = kcfvecs4
y = classes
!date
result = []
for trial in range(10): # 10 回繰り返す
    print(trial)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) # 訓練データ・テストデータの生成
    for name, clf in zip(names, classifiers): # 指定した複数の分類機を順番に呼び出す
        try:
            clf.fit(X_train, y_train) # 学習
            score1 = clf.score(X_train, y_train) # 正解率（train）の算出
            score2 = clf.score(X_test, y_test) # 正解率（test）の算出
            result.append([name, score1, score2]) # 結果の格納
        except:
            continue

df_result = pd.DataFrame(result, columns=['classifier', 'train', 'test']) # 今回はまだ並べ替えはしない
!date
# 分類器 (classifier) 毎にグループ化して正解率の平均を計算し、test の正解率の平均の大きい順に並べる
df_result_mean = df_result.groupby('classifier').mean().sort_values('test', ascending=False)
# エラーバーの表示に用いる目的で、標準偏差を計算する
errors = df_result.groupby('classifier').std()
# 平均値と標準偏差を用いて棒グラフを描画
df_result_mean.plot(kind='bar', alpha=0.5, grid=True, yerr=errors, ylim=[0, 1])

In [None]:
df_result_mean

In [None]:
errors

In [None]:
# Feature selection: keep only the vocabulary strings that occur (non-zero
# count) in more than 1/400 of the compounds.
min_cpd = len(kcfvecs3) / 400
# Vectorised count of non-zero entries per row of kcfvecs3T (= per string);
# always a boolean mask, also when there are no rows at all.
mask_array = (kcfvecs3T != 0).sum(axis=1) > min_cpd
kcfvecs3T2 = kcfvecs3T[mask_array]
kcfvecs4 = kcfvecs3T2.T  # back to compounds-as-rows orientation
pd.DataFrame(kcfvecs4)

In [None]:
X = kcfvecs4
y = classes
!date
result = []
for trial in range(10): # 10 回繰り返す
    print(trial)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) # 訓練データ・テストデータの生成
    for name, clf in zip(names, classifiers): # 指定した複数の分類機を順番に呼び出す
        try:
            clf.fit(X_train, y_train) # 学習
            score1 = clf.score(X_train, y_train) # 正解率（train）の算出
            score2 = clf.score(X_test, y_test) # 正解率（test）の算出
            result.append([name, score1, score2]) # 結果の格納
        except:
            continue

df_result = pd.DataFrame(result, columns=['classifier', 'train', 'test']) # 今回はまだ並べ替えはしない
!date
# 分類器 (classifier) 毎にグループ化して正解率の平均を計算し、test の正解率の平均の大きい順に並べる
df_result_mean = df_result.groupby('classifier').mean().sort_values('test', ascending=False)
# エラーバーの表示に用いる目的で、標準偏差を計算する
errors = df_result.groupby('classifier').std()
# 平均値と標準偏差を用いて棒グラフを描画
df_result_mean.plot(kind='bar', alpha=0.5, grid=True, yerr=errors, ylim=[0, 1])

In [None]:
df_result_mean

In [None]:
errors

In [None]:
# Feature selection: keep only the vocabulary strings that occur (non-zero
# count) in more than 1/800 of the compounds.
min_cpd = len(kcfvecs3) / 800
# Vectorised count of non-zero entries per row of kcfvecs3T (= per string);
# always a boolean mask, also when there are no rows at all.
mask_array = (kcfvecs3T != 0).sum(axis=1) > min_cpd
kcfvecs3T2 = kcfvecs3T[mask_array]
kcfvecs4 = kcfvecs3T2.T  # back to compounds-as-rows orientation
pd.DataFrame(kcfvecs4)

In [None]:
X = kcfvecs4
y = classes
!date
result = []
for trial in range(10): # 10 回繰り返す
    print(trial)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) # 訓練データ・テストデータの生成
    for name, clf in zip(names, classifiers): # 指定した複数の分類機を順番に呼び出す
        try:
            clf.fit(X_train, y_train) # 学習
            score1 = clf.score(X_train, y_train) # 正解率（train）の算出
            score2 = clf.score(X_test, y_test) # 正解率（test）の算出
            result.append([name, score1, score2]) # 結果の格納
        except:
            continue

df_result = pd.DataFrame(result, columns=['classifier', 'train', 'test']) # 今回はまだ並べ替えはしない
!date
# 分類器 (classifier) 毎にグループ化して正解率の平均を計算し、test の正解率の平均の大きい順に並べる
df_result_mean = df_result.groupby('classifier').mean().sort_values('test', ascending=False)
# エラーバーの表示に用いる目的で、標準偏差を計算する
errors = df_result.groupby('classifier').std()
# 平均値と標準偏差を用いて棒グラフを描画
df_result_mean.plot(kind='bar', alpha=0.5, grid=True, yerr=errors, ylim=[0, 1])

In [None]:
df_result_mean

In [None]:
errors