## OptimAL text clasiffier

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

---
### LabeledIn dataset

In [2]:
data_LabeledIn = pd.read_csv('https://ftp.ncbi.nlm.nih.gov/pub/lu/LabeledIn/Crowdsourcing/Crowdsourced_Results.txt', sep='|', index_col=False, 
                             names=['study_drug_label_ID', 'DailyMed_SPL_ID', 'UMLS_CUI', 'IN_RXCUI','SCDF_RXCUI','SCD_RXCUI','Other_SCDF_RXCUI','Other_SCD_RXCUI','Label'])

In [3]:
#records
len(data_LabeledIn)

3004

In [6]:
data_LabeledIn['Label'].unique()

array(['yes', 'no-not-disease', 'no-contra', 'no-char_risk',
       'no-unrelated', 'uncertain'], dtype=object)

- no-contra (contraindication)  
- 'yes' (indication treatment)

In [4]:
subset = data_LabeledIn[['DailyMed_SPL_ID','UMLS_CUI','Label']]

In [5]:
dataset = pd.read_csv('../data/FinalProduct.csv')

In [6]:
dataset['DailyMed_SPL_ID'] = [x.strip('.xml') for x in dataset['Label ID']]

In [7]:
merged_set = pd.merge(dataset, subset, on='DailyMed_SPL_ID', how = 'inner') #left_on = 'Label ID', right_

In [8]:
merged_set

Unnamed: 0.1,Unnamed: 0,Label ID,Drug Brand Name,Active Ingredient,Context,UNII ID,DBID,Ontology,DOID,From,To,Text,UMLS ID,DailyMed_SPL_ID,UMLS_CUI,Label


MERGE DIDN'T WORK

---
### Original IDS annotated dataset

In [9]:
optimal_dataset = pd.read_csv('../data/OptimALBaselineDataset.csv')
optimal_dataset['text'] = [x.strip('1 INDICATIONS AND USAGE') for x in optimal_dataset['context']]

In [10]:
len(optimal_dataset)

187

In [11]:
opt_subset = optimal_dataset[['text','Worker Answer']]
opt_subset.columns = ['text','label']

In [12]:
optimal_dataset.head()

Unnamed: 0,_unit_id,Worker Answer,context,drug_name,disease_name,text
0,2270472226,effect,(See INDICATIONS AND USAGE and WARNINGS.) Exp...,Nifedipine,CONGESTIVE HEART FAILURE,(See INDICATIONS AND USAGE and WARNINGS.) Exp...
1,2270469148,effect,*Early clinical studies and incidence rates fr...,Allopurinol,GOUT,*Early clinical studies and incidence rates fr...
2,2270472289,effect,*Early clinical studies and incidence rates fr...,Allopurinol,GOUT,*Early clinical studies and incidence rates fr...
3,2259533372,indication_treatment,1 INDICATIONS AND USAGE\n\n 1.1 Hypertension \...,Perindopril,hypertension,\n\n 1.1 Hypertension \nACEON is indicated for...
4,2259533358,indication_treatment,1 INDICATIONS AND USAGE\n\n 1.1 Hypertension \...,Valsartan,hypertension,\n\n 1.1 Hypertension \nDiovaní‰Œ¬ (valsartan)...


---
### Dataset with Context and labels merged from LabelIN

In [9]:
dataset_merged = pd.read_csv('../data/LabeledIn_with_context.csv')
dataset_merged['text'] = [x.strip('INDICATIONS AND USAGE') for x in dataset_merged['Text']]

In [10]:
len(dataset_merged)

1331

In [11]:
dataset_merged = dataset_merged[['text','Crowdsourced_answer']]
dataset_merged.columns = ['text','label']

In [12]:
dataset_merged['label'].unique()

array(['yes', 'no-char_risk', 'no-not-disease', 'no-contra',
       'no-unrelated', 'uncertain'], dtype=object)

In [14]:
dataset_merged.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
no-char_risk,117
no-contra,71
no-not-disease,104
no-unrelated,33
uncertain,8
yes,998


In [17]:
#dataset_merged[(dataset_merged['label']=='yes') | (dataset_merged['label']=='no-contra')]

---
## Text Classifier

In [15]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string

In [16]:
df = dataset_merged

In [17]:
df.head()

Unnamed: 0,text,label
0,Prednisolone Sodium Phosphate Oral Solution (1...,yes
1,Prednisolone Sodium Phosphate Oral Solution (1...,yes
2,Prednisolone Sodium Phosphate Oral Solution (1...,yes
3,Prednisolone Sodium Phosphate Oral Solution (1...,yes
4,Prednisolone Sodium Phosphate Oral Solution (1...,yes


In [27]:
# split the dataset into training and validation datasets 
train_x, test_x, train_y, test_y = model_selection.train_test_split(df['text'], df['label'])

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

In [28]:
#train_y

In [29]:
def train_model(classifier, feature_vector_train, label, feature_vector_test):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_test)
    return metrics.accuracy_score(predictions, test_y)

--- 
### Text transformation models

In [30]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(df['text'])
# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xtest_count =  count_vect.transform(test_x)

In [31]:
#for i in xtrain_count:
#    print(i)

In [32]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(df['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)

In [33]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xtest_count)
print ("RF, Count Vectors: {}".format(accuracy))

RF, Count Vectors: 0.7147147147147147


In [34]:
accuracy

0.7147147147147147

In [27]:
# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xtest_tfidf)
print ("RF, TF-IDF: {}".format(accuracy))

RF, TF-IDF: 0.7387387387387387
