# MIxS Triad Classification
#### Goal: Classify a GOLD biosample's `env_broad_scale`, `env_local_scale`, and `env_medium`.

In [1]:
import pandas as pd
from catboost import *
from catboost.utils import get_confusion_matrix, get_roc_curve
from sklearn.model_selection import train_test_split

## load biosample data before NER

In [2]:
# Read the raw biosample table, rename GOLD_ID to lower-case gold_id, and discard exact duplicate rows.
biosampleDf = (
    pd.read_csv('../../downloads/nmdc-gold-path-ner/nmdc-biosample-table-for-ner-20201016.tsv', sep='\t')
    .rename(columns={"GOLD_ID": "gold_id"})
    .drop_duplicates()
)
len(biosampleDf)

32236

#### drop rows where any of env_broad_scale, env_local_scale, or env_medium is null

In [3]:
# Keep only the biosamples that have all three MIxS triad values filled in.
biosampleDf = biosampleDf.dropna(subset=["ENV_BROAD_SCALE", "ENV_LOCAL_SCALE", "ENV_MEDIUM"])

In [4]:
# number of biosample rows currently held in biosampleDf
biosampleDf.shape[0]

26846

In [5]:
# preview the first five biosample rows
biosampleDf.head(n=5)

Unnamed: 0,gold_id,BIOSAMPLE_NAME,DESCRIPTION,HABITAT,IDENTIFIER,SAMPLE_COLLECTION_SITE,ECOSYSTEM,ECOSYSTEM_CATEGORY,ECOSYSTEM_TYPE,ECOSYSTEM_SUBTYPE,SPECIFIC_ECOSYSTEM,BROAD_SCALE_LABEL,LOCAL_SCALE_LABEL,MEDIUM_LABEL,ENV_BROAD_SCALE,ENV_LOCAL_SCALE,ENV_MEDIUM
0,Gb0173867,Freshwater microbial communities from Amazon R...,Freshwater microbial communities from Amazon R...,Freshwater,RCJ6,river water,Environmental,Aquatic,Freshwater,River,Unclassified,freshwater river biome,river,river water,ENVO_01000253,ENVO_00000022,ENVO_01000599
1,Gb0173872,Freshwater microbial communities from Amazon R...,Freshwater microbial communities from Amazon R...,Freshwater,RCJ3,river water,Environmental,Aquatic,Freshwater,River,Unclassified,freshwater river biome,river,river water,ENVO_01000253,ENVO_00000022,ENVO_01000599
2,Gb0173903,Lake sediment microbial communtites from St. P...,Lake sediment microbial communtites from St. P...,Lake sediment,PH082_579,Lake sediment,Environmental,Aquatic,Freshwater,Lake,Sediment,freshwater lake biome,freshwater lake,lake sediment,ENVO_01000252,ENVO_00000021,ENVO_00000546
3,Gb0173935,Lake sediment microbial communtites from St. P...,Lake sediment microbial communtites from St. P...,Lake sediment,PH-EC31_na,Lake sediment,Environmental,Aquatic,Freshwater,Lake,Sediment,freshwater lake biome,freshwater lake,lake sediment,ENVO_01000252,ENVO_00000021,ENVO_00000546
4,Gb0173942,Freshwater microbial communities from thermoka...,Freshwater microbial communities from thermoka...,Freshwater,,Thermokarst lake,Environmental,Aquatic,Freshwater,Lake,Unclassified,freshwater lake biome,thermokarst lake,lake water,ENVO_01000252,ENVO_03000082,ENVO_04000007


## load runNER output
`runNER` was used to perform NER on the biosample_name, description, habitat, and sample_collection_site fields  
cf. biosample-analysis issue [#47](https://github.com/INCATools/biosample-analysis/issues/47)

In [6]:
# Load the tab-separated runNER output table.
nerDf = pd.read_csv(
    '../../downloads/nmdc-gold-path-ner/runner/runNER_Output.tsv',
    sep='\t',
)

In [7]:
# preview the first five NER matches
nerDf.head(n=5)

Unnamed: 0,DOCUMENT ID,TYPE,START POSITION,END POSITION,MATCHED TERM,PREFERRED FORM,ENTITY ID,ZONE,SENTENCE ID,ORIGIN,UMLS CUI,SENTENCE
0,Gb0173867,biolink:OntologyClass,0,10,Freshwater,fresh water,ENVO:00002011_SYNONYM,,S1,envo.json,CUI-less,Freshwater microbial communities from Amazon R...
1,Gb0173867,biolink:OntologyClass,45,50,River,river,ENVO:00000022,,S1,envo.json,CUI-less,Freshwater microbial communities from Amazon R...
2,Gb0173867,biolink:OntologyClass,67,77,Freshwater,fresh water,ENVO:00002011_SYNONYM,,S2,envo.json,CUI-less,Freshwater microbial communities from Amazon R...
3,Gb0173867,biolink:OntologyClass,112,117,River,river,ENVO:00000022,,S2,envo.json,CUI-less,Freshwater microbial communities from Amazon R...
4,Gb0173867,biolink:OntologyClass,127,137,Freshwater,fresh water,ENVO:00002011_SYNONYM,,S3,envo.json,CUI-less,Freshwater.


## load one-hot-encoded runNER output

In [8]:
# Load the one-hot-encoded runNER output and discard exact duplicate rows.
onehotDf = pd.read_csv('../../target/nmdc-biosample-one-hot.tsv', sep='\t').drop_duplicates()
len(onehotDf)

25015

In [9]:
# preview the first five one-hot-encoded rows
onehotDf.head(n=5)

Unnamed: 0,gold_id,fresh water,river,river water,water,liquid water,saline evaporation pond,lake bed,container of an intermittent saline lake,bayou,...,cave entrance,house,flume,irrigation canal,canalized stream,drainage canal,irrigation ditch,canal,chernozem,Earth
0,Gb0173867,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Gb0173872,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Gb0173903,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,Gb0173935,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,Gb0173942,1,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


## There are fewer rows in the one-hot output than in the biosample data. So, subset the biosample data.
note: We end up with fewer rows than the one-hot-encoded data because we dropped null values from the biosample data above.

In [10]:
# gold ids that have one-hot-encoded NER output
onehotIds = onehotDf["gold_id"].tolist()
len(onehotIds)

25015

In [11]:
# Keep only the biosamples that also appear in the one-hot-encoded NER output.
# drop_duplicates() is chained (instead of being called with inplace=True on the
# filtered frame) so that pandas does not raise SettingWithCopyWarning and subsetDf
# is an independent DataFrame rather than a slice of biosampleDf.
subsetDf = biosampleDf[biosampleDf["gold_id"].isin(onehotIds)].drop_duplicates()
len(subsetDf)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


20220

## create target sets

In [12]:
# Targets, named y_* per the sklearn convention: one single-column DataFrame per MIxS triad field.
y_broad, y_local, y_medium = (
    subsetDf[[triad_col]]
    for triad_col in ("ENV_BROAD_SCALE", "ENV_LOCAL_SCALE", "ENV_MEDIUM")
)

## create feature set by adding gold path info to one-hot-encoded data

In [13]:
# list of biosample columns to use (note: need to include gold_id for merging)
gold_paths = ["gold_id", "ECOSYSTEM", "ECOSYSTEM_CATEGORY", "ECOSYSTEM_TYPE", "ECOSYSTEM_SUBTYPE", "SPECIFIC_ECOSYSTEM"]

# the sklearn convention is to use X for the feature set
# subsetDf is the LEFT frame on purpose: an inner merge keeps the order of the left keys,
# so the rows of X come out in the same order as subsetDf -- and therefore in the same
# order as the y_* targets taken from subsetDf. train_test_split pairs X and y purely by
# row position, so a different row order would silently attach labels to the wrong features.
X = pd.merge(subsetDf[gold_paths], onehotDf, how="inner", on="gold_id")
assert len(X) == len(subsetDf) # verify that the lengths are the same
assert (X["gold_id"].values == subsetDf["gold_id"].values).all() # verify row-for-row alignment with subsetDf

# drop the gold_id and keep the column layout: one-hot columns first, then the gold path columns
X = X[[col for col in onehotDf.columns if col != "gold_id"] + gold_paths[1:]]

## classify env_broad_scale

In [14]:
# GOLD ecosystem path columns; these are string-valued and are passed to CatBoost as categorical features.
cat_features = [
    "ECOSYSTEM",
    "ECOSYSTEM_CATEGORY",
    "ECOSYSTEM_TYPE",
    "ECOSYSTEM_SUBTYPE",
    "SPECIFIC_ECOSYSTEM",
]

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_broad,
    random_state=42,
    test_size=0.2,
)

#### run classifier for 10 iterations

In [15]:
# Classifier limited to 10 iterations; all other CatBoost settings are left at their defaults.
model_10 = CatBoostClassifier(iterations=10)

In [16]:
# Train on the env_broad_scale training split, treating the columns in cat_features as categorical.
# set logging_level='Silent' to turn off output; set plot=True to see figure
model_10.fit(X_train, y_train, cat_features=cat_features, verbose=False)

<catboost.core.CatBoostClassifier at 0x14889a550>

#### run classifier for 100 iterations

In [133]:
# model_100 = CatBoostClassifier(iterations=100)

In [17]:
# set logging_level='Silent' to turn off output; set plot=True to see figure
# model_100.fit(X_train, y_train, cat_features=cat_features, verbose=False)