# predict olfactory
- 49人それぞれに対して'BAKERY', 'SWEET', 'FRUIT', 'FISH', 'GARLIC','SPICES', 'COLD', 'SOUR', 'BURNT', 'ACID', 'WARM', 'MUSKY', 'SWEATY','AMMONIA/URINOUS', 'DECAYED', 'WOOD', 'GRASS', 'FLOWER', 'CHEMICAL'の19種類に対して予測を行う
- 使用するデータは、Intensity,INTENSITY/STRENGTH,VALENCE/PLEASANTNESS,ECFP4(4096)

In [58]:
import pickle
import pandas as pd
import math

import pubchempy as pcp
from rdkit.Chem import AllChem
from rdkit import Chem

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error,mean_absolute_error,median_absolute_error,r2_score
from sklearn.metrics import make_scorer

from sklearn.metrics import roc_curve,auc
import matplotlib.pyplot as plt

import openpyxl

In [59]:
def rmse_score(y_true, y_pred):
    """Return the root of the mean squared error between y_true and y_pred."""
    return math.sqrt(mean_squared_error(y_true, y_pred))

def ROC(label_test,pred):
    """Plot the ROC curve of a classifier on the current matplotlib axes.

    Parameters
    ----------
    label_test : true labels, forwarded unchanged to ``roc_curve``.
    pred : predicted scores, forwarded unchanged to ``roc_curve``.

    Returns
    -------
    None. The curve, the chance diagonal, axis labels, title and legend are
    drawn through ``plt``; the caller decides when to show or save the figure.
    """
    # For classification problems:
    # false-positive and true-positive rates at each threshold
    fpr, tpr, _thresholds = roc_curve(label_test, pred)

    # Area under the curve. The result must not be bound to the name ``auc``:
    # that would make ``auc`` local to this function and the call itself
    # would fail with UnboundLocalError.
    roc_auc = auc(fpr, tpr)

    # Draw the ROC curve and the chance diagonal
    plt.plot(fpr, tpr, color='red', label='ROC Curve(area=%.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='black', linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('Reciver operating characteristic')
    # Without a legend the curve label (which carries the AUC) is never shown
    plt.legend(loc='lower right')
    
def str2int(s):
    """Convert ``s`` to an integer with the built-in ``int`` constructor."""
    converted = int(s)
    return converted

def value2onehot(v):
    """Binarise ``v``: return 0 when ``v`` equals 0, otherwise 1."""
    return 0 if v == 0 else 1

def make_data(df,person,label,compound_dic):
    """Build the feature matrix and target column for one subject and label.

    Parameters
    ----------
    df : DataFrame holding the training table. It must contain the columns
        'Odor', 'Replicate', 'Dilution', 'subject #', 'Intensity',
        'Compound Identifier', 'INTENSITY/STRENGTH' and
        'VALENCE/PLEASANTNESS'. Once 'Odor', 'Replicate', 'Dilution' and
        'subject #' are removed, the first two remaining columns are used as
        base features (one of them must be 'Compound Identifier') and every
        later column is treated as a candidate label.
    person : subject id; rows are kept where 'subject #' == str(person).
    label : name of the label column to predict.
    compound_dic : mapping from int compound id to an iterable fingerprint.

    Returns
    -------
    (X, Y, raw) where X holds the remaining base feature plus one column
    'f<i>' per fingerprint position, Y is the label column converted with
    ``int``, and raw is the label column as stored in the subject's table.

    Raises
    ------
    ValueError
        If no row belongs to ``person``.
    """
    # Drop the columns that are not used
    df = df.drop(['Odor', 'Replicate', 'Dilution'], axis=1)

    # Keep only the rows of the requested subject
    sdf = df[df["subject #"]==str(person)].reset_index(drop=True)
    if sdf.empty:
        # Fail with a clear message instead of an IndexError further down
        raise ValueError('no rows found for subject %r' % (person,))
    sdf = sdf.drop('subject #', axis=1)
    sdf['Intensity'] = sdf['Intensity'].map({'low ':0,'high ':1})

    # Convert to int
    sdf['INTENSITY/STRENGTH'] = sdf['INTENSITY/STRENGTH'].apply(int)
    sdf['VALENCE/PLEASANTNESS'] = sdf['VALENCE/PLEASANTNESS'].apply(int)

    # Separate the explanatory variables from the label data
    base = sdf.iloc[:,0:2]
    Ys = sdf.iloc[:,2:]

    # Look up the fingerprint of every row's compound
    ecfps = [list(compound_dic[int(cid)]) for cid in sdf["Compound Identifier"]]

    # Build all fingerprint columns in one frame and attach them with a
    # single concat. Inserting them one by one into a slice of ``sdf`` is
    # slow and makes pandas warn about chained assignment and fragmentation.
    width = len(ecfps[0])
    features = pd.DataFrame(
        {'f'+str(c): [fp[c] for fp in ecfps] for c in range(width)},
        index=base.index,
    )

    # The compound id was only needed for the lookup; it is not a feature
    X = pd.concat([base.drop('Compound Identifier', axis=1), features], axis=1)

    # Select the label column and convert it to int
    # (binarising it with value2onehot was left disabled in the original)
    Y = Ys[label].apply(int)

    return X,Y,Ys[label]

In [60]:
#モデルの評価関数 
def evaluate(clf,X,Y,k,label,make_df = 1):
    """Cross-validate ``clf`` on (X, Y) and tabulate the per-fold results.

    A shuffled ``k``-fold split with ``random_state=0`` is scored with four
    metrics: 'rmse' (built from ``rmse_score`` via ``make_scorer``) and the
    scikit-learn scorers 'neg_median_absolute_error', 'r2' and
    'neg_mean_absolute_error'.

    Returns a DataFrame with one row per fold and a final row named
    ``label + "_mean"`` holding the column means. ``make_df`` is accepted
    but not used.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=0)
    scoring = dict(
        rmse=make_scorer(rmse_score),
        MedAE='neg_median_absolute_error',
        R2='r2',
        MAE='neg_mean_absolute_error',
    )
    fold_scores = cross_validate(clf, X, Y, cv=splitter, scoring=scoring)

    results = pd.DataFrame(fold_scores)
    # Append the average over the folds as an extra, labelled row
    results.loc[label+"_mean"] = results.mean()
    return results

In [61]:
# Load the compound data (dictionary used by make_data to look up fingerprints)
# NOTE: pickle.load can execute arbitrary code; only load a trusted file.
with open('data/compound_dic.pickle','rb') as f:
    compound_dic = pickle.load(f)

In [62]:
# Load the training data: a tab-separated text file whose first row is the header
train_set = []
with open('data/TrainSet.txt') as file:
    for f in file:
        # Strip the trailing newline, then split the row into its fields
        line = f.rstrip('\n').split('\t')
        train_set.append(line)

# First parsed row supplies the column names, the rest are the records
df = pd.DataFrame(columns=train_set[0], data=train_set[1:])

In [63]:
# Choose the subject and the label column to model
person = 1
# Every column from position 6 onward is a candidate label
labels = df.columns[6:].tolist()
label = labels[1]
print(label)

VALENCE/PLEASANTNESS


In [68]:
# Number of candidate label columns
len(labels)

21

In [69]:
# Build the feature matrix and target for the chosen subject and label
X,Y,ori= make_data(df,person,label,compound_dic)
# Build the model
# max_features='sqrt' is the setting 'auto' stood for on classifiers; 'auto'
# was deprecated in scikit-learn 1.1 and removed in 1.3, where it is rejected.
# NOTE(review): a classifier is scored with regression metrics in evaluate();
# confirm that treating each rating value as a class is intended.
clf = RandomForestClassifier(n_estimators=10,max_features='sqrt',oob_score=False,n_jobs=1,random_state=0)

In [70]:
# Score the model with 5-fold cross-validation
evaluate_df = evaluate(clf, X, Y, k=5, label=label)

In [71]:
# Display the per-fold scores and their mean
evaluate_df

Unnamed: 0,fit_time,score_time,test_rmse,test_MedAE,test_R2,test_MAE
0,0.299402,0.013225,30.052269,-20.5,-0.720881,-23.333333
1,0.305121,0.01167,29.330525,-16.0,-0.682216,-22.013986
2,0.324269,0.011835,27.511091,-19.0,-0.474959,-21.41958
3,0.285286,0.012011,28.563021,-16.0,-0.397311,-21.776224
4,0.273284,0.012086,29.689653,-19.0,-0.569625,-23.405594
VALENCE/PLEASANTNESS_mean,0.297472,0.012165,29.029312,-18.1,-0.568998,-22.389744
