In [2]:
import import_ipynb
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

## For data
import pandas as pd
import numpy as np
import random

## For plotting
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns

## For statistical tests
import scipy
import statsmodels.formula.api as smf
import statsmodels.api as sm

## For machine learning
from sklearn import model_selection, preprocessing, feature_selection, ensemble, linear_model, metrics, decomposition
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import precision_recall_fscore_support

%matplotlib inline

## Chinese display
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False

## No warnings
import warnings
warnings.filterwarnings('ignore') 

In [30]:
# f_beta
def calculate_fscore(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.3,
                                                        stratify=Y,
                                                        random_state=42)
    clf = linear_model.LogisticRegression(class_weight='balanced',
                                          solver='lbfgs')
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    precision, recall, f_beta, support = precision_recall_fscore_support(
        y_true=y_test, y_pred=predicted,beta=1.5)
    return f_beta[1] 

## Feature selection
### 1. No feature selection
#### 1.1 One-hot encoding

In [31]:
df_file = "data/Diarrhea_fillna.tsv"
df1 = pd.read_csv(df_file, sep="\t", index_col=0, encoding="utf-8")

y = "细菌结果"
df1[y] = df1[y].map(lambda x: 0 if x == "阴性" else 1)

df1 = pd.get_dummies(df1, drop_first=True)
df1.shape

X_original = df1.drop(y, axis=1)
y_original = df1[y]
f_beta_original = calculate_fscore(X_original, y_original)

print("No feature selection: f_beta of one-hot encoding: ", f_beta_original)

(11600, 383)

No feature selection: f_beta of one-hot encoding:  0.5140648872112878


#### 1.2 Feature embedding

In [32]:
df_file = "data/Diarrhea_embed_219.tsv"
df = pd.read_csv(df_file, sep="\t", index_col=0, encoding="utf-8")
df.shape

y = "细菌结果"
X_embed = df.drop(y, axis=1)
y_embed = df[y]
f_beta_embed = calculate_fscore(X_embed, y_embed)

print("No feature selection: f_beta of feature embedding: ", f_beta_embed)

(11600, 220)

No feature selection: f_beta of feature embedding:  0.5069153207816542


#### 1.3 One-hot encoding + feature embedding

In [33]:
df_file = "data/Diarrhea_fillna.tsv"
df = pd.read_csv(df_file, sep="\t", index_col=0, encoding="utf-8")
y = "细菌结果"
df[y] = df[y].map(lambda x: 0 if x == "阴性" else 1)
df_onehot = df[df.dtypes[df.dtypes == 'object'].index]
df_onehot = pd.get_dummies(df_onehot, drop_first=True)
df_onehot = df_onehot.reset_index(drop=True)
df_onehot.shape

df_file = "data/Diarrhea_embed_219.tsv"
df_embed = pd.read_csv(df_file, sep="\t", index_col=0, encoding="utf-8")
df_embed.shape

df = pd.concat([df_onehot, df_embed], axis=1)
df.shape

df_combined = "data/Diarrhea_onehot_embed_593.tsv"
df.to_csv(df_combined, sep="\t", encoding="utf-8")

(11600, 374)

(11600, 220)

(11600, 594)

In [34]:
df_file = "data/Diarrhea_onehot_embed_593.tsv"
df = pd.read_csv(df_file, sep="\t", index_col=0, encoding="utf-8")
df.shape

y = "细菌结果"
X_embed = df.drop(y, axis=1)
y_embed = df[y]
df_onehot_embed = calculate_fscore(X_embed, y_embed)

print("No feature selection: f_beta of one-hot encoding and feature embedding: ", df_onehot_embed)

(11600, 594)

No feature selection: f_beta of one-hot encoding and feature embedding:  0.5064027939464494


### 2. Statistical-based methods: One-hot encoding 

In [35]:
df_file = "data/Diarrhea_fillna.tsv"
df2 = pd.read_csv(df_file, sep="\t", index_col=0, encoding="utf-8")

y = "细菌结果"
df2[y] = df2[y].map(lambda x: 0 if x == "阴性" else 1)

# relevant features from 'main.ipynb'
features_chi2_anova = [
    'age', '体温', '腹泻量', '腹泻频次', '腹泻天数', '细菌结果', '区县', '性别', '户籍', '职业', '首发症状',
    '发热', '腹胀', '恶心', '腹痛', '腹痛性质', '腹痛部位', '呕吐在腹泻___发生', '腹泻', '腹泻性质',
    '近6个月有无肠道疾病既往史', '进餐地点', '是否家中饲养或接触过宠物', '就诊前是否服用过抗生素',
    '诊断', '诊断类型', '临床处理', '本次就诊是否给予抗生素', '抗生素名称.1', '是否采集', '采样类型'
]

df2 = df2[features_chi2_anova]
df2 = pd.get_dummies(df2, drop_first=True)
df2.shape

X_chi2_anova = df2.drop(y, axis=1)
y_chi2_anova = df2[y]
f_beta_chi2_anova = calculate_fscore(X_chi2_anova, y_chi2_anova)
print("Statistical-based method: f_beta of one-hot encoding: ", f_beta_chi2_anova)

(11600, 358)

Statistical-based method: f_beta of one-hot encoding:  0.5150610583446404


### 3. Reinforcement Learning-based methods

In [2]:
df_file = "data/Diarrhea_fillna.tsv"
df = pd.read_csv(df_file, sep="\t", index_col=0, encoding="utf-8")
df.shape

(11600, 46)

In [3]:
y = "细菌结果"
df[y] = df[y].map(lambda x: 0 if x == "阴性" else 1)
df[y].value_counts()

0    9231
1    2369
Name: 细菌结果, dtype: int64

In [4]:
# one-hot encoding
df = pd.get_dummies(df, drop_first=True)
df.shape
# 382 features (excluding the label column)

(11600, 383)

In [5]:
y = '细菌结果'
names = df.columns
scaler = preprocessing.MinMaxScaler().fit(df)
df = scaler.transform(df)
df = pd.DataFrame(df, columns=names)

df_normal_file = "data/Diarrhea_onehot_382.tsv"
df.to_csv(df_normal_file, sep="\t", encoding="utf-8")

#### 3.1 One-hot encoding

To repeat the RL-based experiment, you can run it from here.

In [26]:
df_file = "data/Diarrhea_onehot_382.tsv"
df = pd.read_csv(df_file, sep="\t", index_col=0, encoding="utf-8")
df.shape

(11600, 383)

#### 3.2 Feature embedding

In [5]:
# df_file = "data/Diarrhea_embed_219.tsv"
# df = pd.read_csv(df_file, sep="\t", index_col=0, encoding="utf-8")
# df.shape

(11600, 220)

#### 3.3 One-hot encoding + feature embedding

In [24]:
# df_file = "data/Diarrhea_onehot_embed_593.tsv"
# df = pd.read_csv(df_file, sep="\t", index_col=0, encoding="utf-8")
# df.shape

(11600, 594)

Choose any one of the above three feature encoding methods.

In [27]:
# Stratified sampling
df_train, df_test = train_test_split(df, test_size=0.3,
                                     stratify=df[y], random_state=42)

# print info
print("X_train shape:", df_train.drop(y, axis=1).shape,
      "| X_test shape:", df_test.drop(y, axis=1).shape)
print("y_train mean:", round(
    np.mean(df_train[y]), 2), "| y_test mean:", round(np.mean(df_test[y]), 2))

print('-'*50)

print("Train set：")
print(df_train[y].value_counts() / len(df_train[y]))
print("Test set：")
print(df_test[y].value_counts() / len(df_test[y]))


X_train shape: (8120, 382) | X_test shape: (3480, 382)
y_train mean: 0.2 | y_test mean: 0.2
--------------------------------------------------
Train set：
0.0    0.795813
1.0    0.204187
Name: 细菌结果, dtype: float64
Test set：
0.0    0.79569
1.0    0.20431
Name: 细菌结果, dtype: float64


In [28]:
y = '细菌结果'
features_raw = df.drop(y, axis=1).columns.to_list()

X_train = df_train[features_raw]
y_train = df_train[y]

X_test = df_test[features_raw]
y_test = df_test[y]

In [8]:
# f_beta
def accuracy(input):
    # x_train = X_train[input]
    x_train = X_train.iloc[:, input]
    x_test = X_test.iloc[:, input]
    clf = linear_model.LogisticRegression(class_weight='balanced')
    clf.fit(x_train, y_train.values.ravel())
    predicted = clf.predict(x_test)
    precision, recall, f_beta, support = precision_recall_fscore_support(
        y_true=y_test, y_pred=predicted, beta=1.5)
    return f_beta[1]

In [9]:
## Initialize Q-table with -1
# Q_values = [[-1, -1]] * len(features_raw)
Q_values = [[-1, -1] for i in range(len(features_raw))]
Q_values[:3]

## How to add a priori weights to the features? 
# (run "Statistical-based methods: One-hot encoding" first and then run the following code)
# 1. get the column names of the columns of df2, or X_chi2_anova, so that there are no y label
# 2. find the subscripts i corresponding to these column names in df
# 3. set the second column of row i in Q_values to 0 > 1

## add a priori weights to the features
for ix, column in enumerate(X_train.columns):
    if column in X_chi2_anova.columns:
        Q_values[ix][1] = 0

Q_values[:3]

[[-1, -1], [-1, -1], [-1, -1]]

[[-1, 0], [-1, 0], [-1, -1]]

In [18]:
def get_reward(features):
    if len(features) == 0:
        return 0
    acc = accuracy(features) * 100
    tot_f = len(features)
    R = acc
    if tot_f > K:
        R = acc * K / tot_f
    return R

In [19]:
# epsilon-greedy policy
epsilon = 0.5
alpha = 0.2

# attenuation coefficient
epsilon_decay_rate = 0.995
alpha_decay_rate = 0.995

# Maximum number of features allowed
K = 400

In [20]:
all_rewards = []
num_episodes = 100

# Assign an agent to each feature
num_agents = len(features_raw)

# Initialize the reward matrix with 0
reward_store = {}
for i in range(num_agents + 1):
    reward_store[i] = 0

In [21]:
# Initialize the action space with 0
actions = [0] * num_agents

for episode in range(num_episodes):
    for agent in range(num_agents):
        rand_number = random.uniform(0, 1)
        if rand_number > epsilon:
            # actions[agent]  = Q_values[agent].index(max(Q_values[agent]))
            actions[agent] = np.argmax(Q_values[agent])
        else:
            actions[agent] = random.choice([0, 1])
    features = []
    for i, act in enumerate(actions):
        if act == 1:
            features.append(i)
    # print(features)
    R = get_reward(features)
    reward_store[len(features)] = max(reward_store[len(features)], R)
    # print(R)
    all_rewards.append(R)

    for agent in range(num_agents):
        actions[agent] = 1 - actions[agent]
        features = []
        for i, act in enumerate(actions):
            if act == 1:
                features.append(i)
        # CLEAN Reward：C_i = G(a-a_i+c_i) - G(a)
        C_agent = get_reward(features) - R
        Q_values[agent][actions[agent]] = Q_values[agent][actions[agent]] + alpha * (C_agent - Q_values[agent][actions[agent]])
    alpha = alpha * alpha_decay_rate
    epsilon = epsilon * epsilon_decay_rate

In [167]:
reward_store

{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 0, 18: 0, 19: 0, 20: 0, 21: 0, 22: 0, 23: 0, 24: 0, 25: 0, 26: 0, 27: 0, 28: 0, 29: 0, 30: 0, 31: 0, 32: 0, 33: 0, 34: 0, 35: 0, 36: 0, 37: 0, 38: 0, 39: 0, 40: 0, 41: 0, 42: 0, 43: 0, 44: 0, 45: 0, 46: 0, 47: 0, 48: 0, 49: 0, 50: 0, 51: 0, 52: 0, 53: 0, 54: 0, 55: 0, 56: 0, 57: 0, 58: 0, 59: 0, 60: 0, 61: 0, 62: 0, 63: 0, 64: 0, 65: 0, 66: 0, 67: 0, 68: 0, 69: 0, 70: 0, 71: 0, 72: 0, 73: 0, 74: 0, 75: 0, 76: 0, 77: 0, 78: 0, 79: 0, 80: 0, 81: 0, 82: 0, 83: 0, 84: 0, 85: 0, 86: 0, 87: 0, 88: 0, 89: 0, 90: 0, 91: 0, 92: 0, 93: 0, 94: 0, 95: 0, 96: 0, 97: 0, 98: 0, 99: 0, 100: 0, 101: 0, 102: 0, 103: 0, 104: 0, 105: 0, 106: 0, 107: 0, 108: 0, 109: 0, 110: 0, 111: 0, 112: 0, 113: 0, 114: 0, 115: 0, 116: 0, 117: 0, 118: 0, 119: 0, 120: 0, 121: 0, 122: 0, 123: 0, 124: 0, 125: 0, 126: 0, 127: 0, 128: 0, 129: 0, 130: 0, 131: 0, 132: 0, 133: 0, 134: 0, 135: 0, 136: 0, 137: 0, 138: 

In [168]:
# Feature dimension: 202
max(reward_store,key=reward_store.get)

202

In [37]:
print("RL-based method: f_beta of one-hot encoding: ", reward_store[max(reward_store,key=reward_store.get)])

53.14865948160684
