In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
from scipy.stats import norm
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, cross_validate, cross_val_score
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, balanced_accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedStratifiedKFold
from mlxtend.classifier import StackingCVClassifier, EnsembleVoteClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer

In [81]:
params = {'n_estimators': [300, 400, 500, 600, 700],
              'learning_rate': [0.01, 0.02, 0.03, 0.05, 0.07],
              'gamma': [0.5, 1, 1.5, 2, 5],
              'max_depth': [3, 4, 5, 6],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'min_child_weight': [1, 2, 3, 4, 5]}

seed = 42
st_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

def calc_scores(clf, X_test, y_test):
    y_pred = clf.predict(X_test)
    recall_0, recall_1 = recall_score(y_test, y_pred, pos_label=0), recall_score(y_test, y_pred, pos_label=1)
    precision_0, precision_1 =  precision_score(y_test, y_pred, pos_label=0), precision_score(y_test, y_pred, pos_label=1)
    acc = balanced_accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
    arr = np.array([[acc, precision_0, recall_0, precision_1, recall_1,auc_score]])
    return pd.DataFrame(data=arr, columns=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])

def recall_0(y_true, y_pred):
    return recall_score(y_true, y_pred, pos_label=0)

def precision_0(y_true, y_pred):
    return precision_score(y_true, y_pred, pos_label=0)

scoring = {"balanced_accuracy": make_scorer(balanced_accuracy_score),
           "recall_0": make_scorer(recall_0), "precision_0": make_scorer(precision_0),
           "recall_1": make_scorer(recall_score), "precision_1": make_scorer(precision_score), "auc": "roc_auc" }

#cross_validation

def print_score_comparison(raw_score, emb_score, target_feature="posOutcome",
                           header_1="Raw Score", header_2="Embedding Score"):
    print("\t\t{0}\n\t\t\t{1}\t\t{2}".format(target_feature, header_1, header_2))
    print("\t\t-------------------------------------------------------")
    print("balanced_accuracy:\t{0:.2%}\t\t\t\t{1:.2%}\n".format(raw_score["balanced_accuracy"].mean(), emb_score["balanced_accuracy"].mean()))
    print("recall_0:\t\t{0:.2%}\t\t\t\t{1:.2%}\n".format(raw_score["recall_0"].mean(), emb_score["recall_0"].mean()))
    print("precision_0:\t\t{0:.2%}\t\t\t\t{1:.2%}\n".format(raw_score["precision_0"].mean(), emb_score["precision_0"].mean()))
    print("recall_1:\t\t{0:.2%}\t\t\t\t{1:.2%}\n".format(raw_score["recall_1"].mean(), emb_score["recall_1"].mean()))
    print("precision_1:\t\t{0:.2%}\t\t\t\t{1:.2%}\n".format(raw_score["precision_1"].mean(), emb_score["precision_1"].mean()))
    print("auc:\t\t\t{0:.2%}\t\t\t\t{1:.2%}\n".format(raw_score["auc"].mean(), emb_score["auc"].mean()))

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time

    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

def param_tuning(X, y, n_folds=5, param_comb=25, scoring='roc_auc', jobs=12):
    xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    rand_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring=scoring, n_jobs=jobs,
                                   cv=skf.split(X, y), verbose=3, random_state=42)

    start_time = timer(None) # timing starts from this point for "start_time" variable
    rand_search.fit(X, y)
    timer(start_time)
    print("Best Score: {:.3%}".format(rand_search.best_score_))
    print(rand_search.best_params_)
    return rand_search

score_cols = ["test_balanced_accuracy","test_precision_0", "test_recall_0",
               "test_precision_1","test_recall_1", "test_auc"]

def get_scores(cv_results, score_keys=None, df_cols=None):
    if score_keys is None:
        score_keys = score_cols
    if df_cols is None:
        score_keys = score_cols
    scores = np.empty([1, len(score_keys)])
    for i, s in enumerate(score_keys):
        scores[0][i] = np.mean(cv_results[s])
    scores_df = pd.DataFrame(data=scores, columns=df_cols)
    return scores_df

def evaluate_embedding(path, outcome_df, target="posOutcome", merge_col="patient_ID", n_jobs=-1):
    emb_df = pd.read_csv(path, sep="\t")
    emb_outcome_df = pd.merge(outcome_df, emb_df, on=merge_col)
    X_emb, y_emb = emb_outcome_df[emb_outcome_df.columns.difference([merge_col, target])], emb_outcome_df[target]
    X_train_emb, X_test_emb, y_train_emb, y_test_emb = train_test_split(X_emb, y_emb, test_size=0.3, random_state=seed)
    rand_search_emb = param_tuning(X_train_emb, y_train_emb, jobs=n_jobs)
    params = rand_search_emb.best_params_
    clf_emb = rand_search_emb.best_estimator_
    cv_res = cross_validate(clf_emb, X_train_emb, y_train_emb, scoring=scoring, n_jobs=n_jobs, verbose=1, return_train_score=True,
                            cv=st_cv)
    cv_res_df = get_scores(cv_res)
    clf_emb.fit(X_train_emb, y_train_emb)
    test_scores_df = calc_scores(clf_emb, X_test_emb, y_test_emb)

    return params, cv_res_df, test_scores_df

def load_features(path):
    feats = []
    with open(path, "r") as fp:
        for line in fp.readlines():
            feats.append(line.strip())

    return feats
def evaluate_ge(x_train, y_train, x_test, y_test, outcome_cols=None, feats=None, jobs=-1,
                scoring=scoring, rand_scoring="roc_auc", target="posOutcome"):
    if feats is not None:
        if outcome_cols is not None:
            cols = outcome_cols + feats
        else:
            cols = feats
        x_train = x_train[cols]
        x_test = x_test[cols]
    rand_search = param_tuning(x_train, y_train, scoring=rand_scoring, jobs=jobs)
    params = rand_search.best_params_
    clf = XGBClassifier(**params)
    cv_res = cross_validate(clf, x_train, y_train,scoring=scoring, cv=st_cv, n_jobs=jobs)

    cv_res_df = get_scores(cv_res, score_cols, df_cols=["balanced_accuracy", "recall_0", "precision_0", "recall_1", "precision_1", "auc"])
    clf.fit(x_train, y_train)
    test_scores_df = calc_scores(clf, x_test, y_test)

    return params, clf, cv_res_df, test_scores_df

def discretize_dataset(X, features, bins_labels = None):
    if bins_labels is None:
	    bins_labels = [-1, 0, 1]
    X_disc = X[features]
    bin_dict = {}

    for ft in features:
        r1 = X_disc[ft].mean() - X_disc[ft].std() / 2
        r2 = X_disc[ft].mean() + X_disc[ft].std() / 2
        bin_dict[ft]= [-np.inf, r1, r2, np.inf]
    le = LabelEncoder()

    le.fit(bins_labels)

    for ft in bin_dict:
        X_disc[ft] = le.transform(pd.cut(X_disc[ft], bins=bin_dict[ft], labels=bins_labels))

    ohe = OneHotEncoder(handle_unknown="ignore")
    transformed = ohe.fit_transform(X_disc).toarray()
    X_disc = pd.DataFrame(transformed, columns=ohe.get_feature_names(features))
    return X_disc
from sklearn.metrics import mean_squared_error as rmse
def optimize_k_v1(df, target, exclude=None):
    if exclude is None:
        exclude = ["patient_ID"]
    df = df.drop(exclude, axis=1)
    data = df.to_numpy()
    errors = []
    for k in range(1, 20, 2):
        imputer = KNNImputer(n_neighbors=k)
        imputed = imputer.fit_transform(data)
        df_imputed = pd.DataFrame(imputed, columns=df.columns)

        X = df_imputed.drop(target, axis=1)
        y = df_imputed[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model = RandomForestClassifier()
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        error = rmse(y_test, preds)
        errors.append({'K': k, 'RMSE': error})

    return errors

from sklearn.metrics import mean_squared_error as rmse
def optimize_k_v2(df, target, exclude=None):
    if exclude is None:
        exclude = ["patient_ID", target]
    X = df.drop(exclude, axis=1)
    data = X.to_numpy()
    errors = []
    for k in range(1, 20, 2):
        imputer = KNNImputer(n_neighbors=k)
        imputed = imputer.fit_transform(data)
        df_imputed = pd.DataFrame(imputed, columns=X.columns)

        y = df[target]
        X_train, X_test, y_train, y_test = train_test_split(df_imputed, y, test_size=0.2, random_state=42)

        model = RandomForestClassifier()
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        error = rmse(y_test, preds)
        errors.append({'K': k, 'RMSE': error})

    print(errors)
    x = []
    y = []

    for i in errors:
        x.append(i["K"])
        y.append(i["RMSE"])

    x = np.array(x)
    y = np.array(y)
    fig, ax = plt.subplots()
    ax.plot(x, y)
    ax.set_xlabel("K")
    ax.set_ylabel("RMSE")
    ax.set_title("RMSE at d/t K values")
    plt.xticks(np.arange(min(x), max(x) + 1, 2))
    fig.show()

def impute_dataset(df, imputer, target="posOutcome"):
    X = df.drop(["patient_ID", target], axis=1)
    X_new = imputer.fit_transform(X)
    df_imputed = pd.DataFrame(X_new, columns=X.columns)
    p_outcome_df = df[["patient_ID", "posOutcome"]]
    df_imputed = pd.concat([p_outcome_df, df_imputed], axis=1)
    return df_imputed

def one_hot_encode(df, cat_features):
    for i in cat_features:
        df[i] = df[i].astype(dtype=np.int64)
    X_cats = df[cat_features]
    ohe = OneHotEncoder()
    X_ohe = ohe.fit_transform(X_cats).toarray()
    fts_names = ohe.get_feature_names(cat_features)
    ohe_df = pd.DataFrame(X_ohe, columns=fts_names)
    df_encoded = pd.concat([df.drop(cat_features, axis=1), ohe_df], axis=1)
    return df_encoded

def find_diff(df1, df2, index="patient_ID"):
    def highlight_diff(data, color='yellow'):
        attr = 'background-color: {}'.format(color)
        other = data.xs('First', axis='columns', level=-1)
        return pd.DataFrame(np.where(data.ne(other, level=0), attr, ''),
                            index=data.index, columns=data.columns)

    df_all = pd.concat([df1.set_index(index), df2.set_index(index)],
                   axis='columns', keys=['First', 'Second'])
    df_final = df_all.swaplevel(axis='columns')[df1.columns[1:]]
    df_final.style.apply(highlight_diff, axis=None)
    return df_final

In [67]:
#Load the datasets

ge_df = pd.read_csv("datasets/merged-combat15.csv")
state_df = pd.read_csv("datasets/state_and_outcome.csv")

In [68]:
#Feature engineering

state_df = state_df.drop(["series_id", "channel_count", "RFS", "DFS",
                          "pCR", "posOutcome2"], axis=1)

In [69]:
gpl_vals = state_df["gpl"].unique()
print(gpl_vals)
pam_subtypes = state_df["pam_coincide"].unique()
print(pam_subtypes)
p5_types = state_df["p5"].unique()
print(p5_types)
tumor_types = state_df["tumor"].unique()
print(tumor_types)

['GPL570' 'GPL96' 'GPL1708,GPL4133' 'GPL5049' 'GPL1223' 'GPL5325']
['LumB' 'Her2' 'Basal' nan 'LumA' 'Normal']
['k5' 'k3' 'k1' 'k2' 'k4' nan]
['T3' 'T2' 'T4' 'T1' nan 'T0']


In [70]:
state_df = state_df.dropna(axis=0, subset=["pam_coincide", "p5"])
state_df = state_df.reset_index(drop=True)

In [71]:
state_df["tumor"] = state_df["tumor"].astype("category").cat.codes
state_df["pam_coincide"] = pam_code_df = state_df["pam_coincide"].astype("category").cat.codes
state_df["p5"] = p5_code_df = state_df["p5"].astype("category").cat.codes
state_df["gpl"] = gpl_code = state_df["gpl"].astype("category").cat.codes

In [82]:
knn_imputer = KNNImputer(n_neighbors=9)
state_df_v2 = impute_dataset(state_df, knn_imputer)
state_df_v2 = state_df_v2.drop(["gpl"], axis=1)
state_df_v2 = one_hot_encode(state_df_v2, ["pam_coincide", "p5"])
state_df_v2 = state_df_v2.astype(int)

In [84]:
X_st_v2, y_st_v2 = state_df_v2.drop(["posOutcome"], axis=1), state_df_v2["posOutcome"]
X_st_v2_train, X_st_v2_test, y_st_v2_train, y_st_v2_test = train_test_split(X_st_v2, y_st_v2,                                            test_size=0.3, random_state=seed, stratify=y_st_v2)

X_st_v2_train.to_csv("datasets/train_st_knn.csv", index=False)
X_st_v2_test.to_csv("datasets/test_st_knn.csv", index=False)

X_st_v2_train = X_st_v2_train.drop(["patient_ID"], axis=1)
X_st_v2_test = X_st_v2_test.drop(["patient_ID"], axis=1)

In [36]:
params_st_v2, clf_st_v2, cv_scores_st_v2, test_scores_st_v2 = evaluate_ge(
    X_st_v2_train, y_st_v2_train, X_st_v2_test, y_st_v2_test)
cv_scores_st_v2.mean()

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 6.52 seconds.
Best Score: 76.221%
{'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.03, 'gamma': 2, 'colsample_bytree': 0.6}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    6.2s finished


balanced_accuracy    0.701849
recall_0             0.700725
precision_0          0.766789
recall_1             0.713096
precision_1          0.636909
auc                  0.762209
dtype: float64

In [37]:
test_scores_st_v2.mean()

balanced_accuracy    0.712924
recall_0             0.707379
precision_0          0.789773
recall_1             0.730909
precision_1          0.636076
auc                  0.785691
dtype: float64

In [41]:
ge_state_outcome_df_v2 = pd.merge(state_df_v2, ge_df, on="patient_ID")
X_st_v2_ge, y_st_v2_ge = ge_state_outcome_df_v2.drop(["patient_ID", "posOutcome"], axis=1), ge_state_outcome_df_v2["posOutcome"]

X_train_st_v2_ge, X_test_st_v2_ge, y_train_st_v2_ge, y_test_st_v2_ge = train_test_split(X_st_v2_ge, y_st_v2_ge, test_size=0.3, stratify=y_st_v2_ge, random_state=seed)
ft_mrmr50 = load_features("datasets/mrmr_ft50.txt")
X_train_st_v2_ge = X_train_st_v2_ge[ft_mrmr50]
X_test_st_v2_ge = X_test_st_v2_ge[ft_mrmr50]

In [42]:

params_st_v2_ge, clf_st_v2_ge, cv_scores_st_v2_ge, test_scores_st_v2_ge = evaluate_ge(
    X_train_st_v2_ge, y_train_st_v2_ge, X_test_st_v2_ge, y_test_st_v2_ge)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 23.56 seconds.
Best Score: 82.483%
{'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.07, 'gamma': 1.5, 'colsample_bytree': 0.6}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   22.2s finished


In [43]:
cv_scores_st_v2_ge.mean()

balanced_accuracy    0.760287
recall_0             0.773307
precision_0          0.775363
recall_1             0.749456
precision_1          0.745211
auc                  0.824833
dtype: float64

In [44]:
test_scores_st_v2_ge.mean()

balanced_accuracy    0.767459
recall_0             0.757106
precision_0          0.832386
recall_1             0.790036
precision_1          0.702532
auc                  0.834418
dtype: float64

In [73]:
sim_imputer_1 = SimpleImputer(strategy="constant", fill_value=0)
state_df_v3 = impute_dataset(state_df, sim_imputer_1)
state_df_v3 = state_df_v3.drop(["gpl"], axis=1)
state_df_v3 = one_hot_encode(state_df_v3, ["pam_coincide", "p5"])
state_df_v3 = state_df_v3.astype(int)

In [83]:
X_st_v3, y_st_v3 = state_df_v3.drop(["posOutcome"], axis=1), state_df_v3["posOutcome"]
X_st_v3_train, X_st_v3_test, y_st_v3_train, y_st_v3_test = train_test_split(X_st_v3, y_st_v3,                                            test_size=0.3, random_state=seed, stratify=y_st_v3)

X_st_v3_train.to_csv("datasets/train_st_sim.csv", index=False)
X_st_v3_test.to_csv("datasets/test_st_sim.csv", index=False)

X_st_v3_train = X_st_v3_train.drop(["patient_ID"], axis=1)
X_st_v3_test = X_st_v3_test.drop(["patient_ID"], axis=1)

In [48]:
params_st_v3, clf_st_v3, cv_scores_st_v3, test_scores_st_v3 = evaluate_ge(
    X_st_v3_train, y_st_v3_train, X_st_v3_test, y_st_v3_test)
cv_scores_st_v3.mean()

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 5.5 seconds.
Best Score: 76.270%
{'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.03, 'gamma': 2, 'colsample_bytree': 0.6}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:    5.2s finished


balanced_accuracy    0.703753
recall_0             0.702253
precision_0          0.769235
recall_1             0.715323
precision_1          0.638270
auc                  0.762701
dtype: float64

In [49]:
test_scores_st_v3.mean()


balanced_accuracy    0.702172
recall_0             0.696970
precision_0          0.784091
recall_1             0.720588
precision_1          0.620253
auc                  0.785511
dtype: float64

In [50]:
print_score_comparison(test_scores_st_v2, test_scores_st_v3, header_1="State KNN", header_2="State Simple")

		posOutcome
			State KNN		State Simple
		-------------------------------------------------------
balanced_accuracy:	71.29%				70.22%

recall_0:		70.74%				69.70%

precision_0:		78.98%				78.41%

recall_1:		73.09%				72.06%

precision_1:		63.61%				62.03%

auc:			78.57%				78.55%



In [53]:

ge_state_outcome_df_v3 = pd.merge(state_df_v3, ge_df, on="patient_ID")
X_st_v3_ge, y_st_v3_ge = ge_state_outcome_df_v3.drop(["patient_ID", "posOutcome"], axis=1), ge_state_outcome_df_v3["posOutcome"]

X_train_st_v3_ge, X_test_st_v3_ge, y_train_st_v3_ge, y_test_st_v3_ge = train_test_split(X_st_v3_ge, y_st_v3_ge, test_size=0.3, stratify=y_st_v3_ge, random_state=seed)
ft_mrmr50 = load_features("datasets/mrmr_ft50.txt")
X_train_st_v3_ge = X_train_st_v3_ge[ft_mrmr50]
X_test_st_v3_ge = X_test_st_v3_ge[ft_mrmr50]

In [54]:
params_st_v3_ge, clf_st_v3_ge, cv_scores_st_v3_ge, test_scores_st_v3_ge = evaluate_ge(
    X_train_st_v3_ge, y_train_st_v3_ge, X_test_st_v3_ge, y_test_st_v3_ge)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



 Time taken: 0 hours 0 minutes and 23.25 seconds.
Best Score: 82.483%
{'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.07, 'gamma': 1.5, 'colsample_bytree': 0.6}


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   21.9s finished


In [55]:
cv_scores_st_v3_ge.mean()

balanced_accuracy    0.760287
recall_0             0.773307
precision_0          0.775363
recall_1             0.749456
precision_1          0.745211
auc                  0.824833
dtype: float64

In [56]:
test_scores_st_v3_ge.mean()

balanced_accuracy    0.767459
recall_0             0.757106
precision_0          0.832386
recall_1             0.790036
precision_1          0.702532
auc                  0.834418
dtype: float64

In [74]:
find_diff(state_df_v2, state_df_v3)

Unnamed: 0_level_0,posOutcome,posOutcome,ER,ER,HER2,HER2,PR,PR,node,node,...,p5_0,p5_0,p5_1,p5_1,p5_2,p5_2,p5_3,p5_3,p5_4,p5_4
Unnamed: 0_level_1,First,Second,First,Second,First,Second,First,Second,First,Second,...,First,Second,First,Second,First,Second,First,Second,First,Second
patient_ID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
809184,0,0,1,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,1,1
809185,1,1,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,1,1
809186,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,1,1,0,0,0,0
809187,0,0,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,1
809188,0,0,1,1,0,0,1,1,1,1,...,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491199,1,1,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
491270,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,1,1
491200,1,1,0,0,1,1,0,0,1,1,...,0,0,1,1,0,0,0,0,0,0
491201,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
