# Code to demonstrate the 2-stage Classification

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection  import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [3]:
#train_data_f = "parsed_drug.csv"
train_data_f = "parsed_drug_four_cat.csv"
#test_data_f = "test_binary.csv"
test_data_f = "test_four_cat.csv"
df = pd.read_csv(train_data_f, low_memory=False)

In [4]:
#df.head()
#print(df.loc[df['ddi_label'] == 3])

#### Let's learn more about our dataset

In [5]:
print("Number of drug pairs with no affect on each other: ", df.loc[df['ddi_label'] == 0].shape)
print("Number of drug pairs that do affect each other: ", df.loc[df['ddi_label'] >= 1].shape)

Number of drug pairs with no affect on each other:  (20532, 7)
Number of drug pairs that do affect each other:  (3709, 7)


The dataset is very uneven. We might be overfitting

### Create training and development sets

In [6]:
def create_train_dev_test(df):
    random_index = np.random.permutation(df.index)
    df_shuffled = df.ix[random_index, ['drug1', 'drug2', 'sentence_text', 'ddi_label']]
    df_shuffled.reset_index(drop=True, inplace=True)
    rows, columns =  df_shuffled.shape
    train_size = round(rows*.6)
    dev_size   = round(rows*.4)
    df_train = df_shuffled.loc[:train_size]
    df_dev = df_shuffled.loc[train_size:dev_size+train_size].reset_index(drop=True)
    return df_train, df_dev

In [7]:
df_train, df_dev = create_train_dev_test(df);

### Create test set

In [8]:
df_test = pd.read_csv(test_data_f, low_memory=False)
df_test = df_test[['drug1', 'drug2', 'sentence_text', 'ddi_label']]
print("Number of rows in test set", df_test.shape)
print("Number of ddis that do not affect each other", df_test.loc[df_test.ddi_label == 0].shape)

Number of rows in test set (5265, 4)
Number of ddis that do not affect each other (4381, 4)


### Create feature sets

In [15]:
def create_tfidf_tr_dev_test(label, df_train, df_dev, df_test):
    '''
    Creates all featuresets for train, dev, and test.
    @param feature_func the function to apply to the sentences to extract features
    @param label some label to give to this feature method, aids documentation only but
        does not affect the code.
        
        returns arrays for the train, dev, and test featuresets
    '''
    vec = TfidfVectorizer(ngram_range=(1, 2), max_df=0.8, max_features=1000)
    arr_train_feature_sparse = vec.fit_transform(df_train['sentence_text'])
    arr_train_feature = arr_train_feature_sparse.toarray()
    arr_train_feature.shape
    
    arr_dev_feature_sparse = vec.transform(df_dev["sentence_text"])
    arr_dev_feature = arr_dev_feature_sparse.toarray()
    arr_dev_feature.shape
    
    arr_test_feature_sparse = vec.transform(df_test["sentence_text"])
    arr_test_feature = arr_test_feature_sparse.toarray()
    arr_test_feature.shape
    
    return arr_train_feature, arr_dev_feature, arr_test_feature

In [9]:
def create_tfidf_tr_dev_test_simple(label):
    '''
    Creates all featuresets for train, dev, and test.
    @param feature_func the function to apply to the sentences to extract features
    @param label some label to give to this feature method, aids documentation only but
        does not affect the code.
        
        returns arrays for the train, dev, and test featuresets
    '''
    vec = TfidfVectorizer(ngram_range=(1, 2), max_df=0.8, max_features=1000)
    arr_train_feature_sparse = vec.fit_transform(df_train['sentence_text'])
    arr_train_feature = arr_train_feature_sparse.toarray()
    arr_train_feature.shape
    
    arr_dev_feature_sparse = vec.transform(df_dev["sentence_text"])
    arr_dev_feature = arr_dev_feature_sparse.toarray()
    arr_dev_feature.shape
    
    arr_test_feature_sparse = vec.transform(df_test["sentence_text"])
    arr_test_feature = arr_test_feature_sparse.toarray()
    arr_test_feature.shape
    
    return arr_train_feature, arr_dev_feature, arr_test_feature

In [10]:
def create_tfidf_tr_dev(label, df_train, df_dev):
    '''
    Creates all featuresets for train and dev
    @param feature_func the function to apply to the sentences to extract features
    @param label some label to give to this feature method, aids documentation only but
        does not affect the code.
        
        returns arrays for the train and dev featuresets
    '''
    vec = TfidfVectorizer(ngram_range=(1, 2), max_df=0.8, max_features=1000)
    arr_train_feature_sparse = vec.fit_transform(df_train['sentence_text'])
    arr_train_feature = arr_train_feature_sparse.toarray()
    arr_train_feature.shape
    
    arr_dev_feature_sparse = vec.transform(df_dev["sentence_text"])
    arr_dev_feature = arr_dev_feature_sparse.toarray()
    arr_dev_feature.shape
    
    feature_names = vec.get_feature_names()
    
    return arr_train_feature, arr_dev_feature, feature_names

In [11]:
def create_tfidf_test(label, df_test, dev_feature_names):
    '''
    Creates featureset for test
    @param feature_func the function to apply to the sentences to extract features
    @param label some label to give to this feature method, aids documentation only but
        does not affect the code.
        
        returns array for the test featureset
    '''
    vec = TfidfVectorizer(ngram_range=(1, 2), max_df=0.8, max_features=1000,
                         vocabulary=dev_feature_names)
    arr_test_feature_sparse = vec.fit_transform(df_test["sentence_text"])
    arr_test_feature = arr_test_feature_sparse.toarray()
    arr_test_feature.shape
    
    return arr_test_feature

### Using Naive Bayes - single stage classification

In [12]:
print('Creating featuresets...')
tr, dev, test = create_tfidf_tr_dev_test_simple('tfidf')
print('Created featuresets')
print('Applying ML model...')
nb = MultinomialNB()
nb_model_dev = nb.fit(tr, df_train.ddi_label)
nb_pred_dev = nb_model_dev.predict(dev)
print("Applying ML model to test data...")
nb_pred_test = nb_model_dev.predict(test)
from sklearn.metrics import f1_score
# before I used the binary predictions - 0 or 1
print('Accuracy:')
print(accuracy_score(df_dev.ddi_label, nb_pred_dev))
print(f1_score(df_dev.ddi_label, nb_pred_dev, average=None, pos_label = 1))
print('neg (0)', 'pos(1)', '2','3','4')
print(f1_score(df_test.ddi_label, nb_pred_test, average=None, pos_label = 1))

Creating featuresets...
Created featuresets
Applying ML model...
Applying ML model to test data...
Accuracy:
0.727310231023
[ 0.83481639  0.27401575  0.13033175  0.31279621  0.07531381]
neg (0) pos(1) 2 3 4
[ 0.86119874  0.26785714  0.05442177  0.2344086   0.        ]


  'precision', 'predicted', average, warn_for)


### Run Models - 1st stage classification
Develop the dev model here.

In [13]:
#df_train.ddi_label

In [16]:
arr_train_feature, arr_dev_feature, arr_test_feature = create_tfidf_tr_dev_test('tdidf',
                                                            df_train, df_dev, df_test)
nb = MultinomialNB()

nb_model = nb.fit(arr_train_feature, df_train.ddi_label)

In [17]:
nb_predictions = nb_model.predict(arr_dev_feature)

In [18]:
accuracy_score(df_dev.ddi_label, nb_predictions)

0.72731023102310233

In [19]:
nb_predictions

array([3, 3, 4, ..., 0, 0, 4])

In [20]:
print(f1_score(df_dev.ddi_label, nb_predictions,average=None, pos_label = 1))

[ 0.83481639  0.27401575  0.13033175  0.31279621  0.07531381]


#### Model for test data
Use the dev model to predict the ddi_labels in the test set.

In [21]:
nb_predictions_test = nb_model.predict(arr_test_feature)

Save the DDI labels to an intermediary dataset.

In [22]:
interaction_data_test = list(zip(df_test['sentence_text'], nb_predictions_test))
interaction_data_test = [(sent,interaction) for (sent,interaction) in interaction_data_test
                            if interaction >= 1]

In [23]:
print(f1_score(df_test.ddi_label, nb_predictions_test, average=None, pos_label = 1))

[ 0.86119874  0.26785714  0.05442177  0.2344086   0.        ]


  'precision', 'predicted', average, warn_for)


#### Save the positive-interaction results to a dataset

In [24]:
with open('positive_interactions.csv', 'w') as fout:
    fout.write('sentence_text,Category\n')
    for doc, category in zip(df_dev['sentence_text'], nb_predictions):
        if category >= 1:
            fout.write("\"" + doc + '\",' + str(category) + '\n')
interaction_data = list(zip(df_dev['sentence_text'], nb_predictions))
interaction_data = [(sent,interaction) for (sent,interaction) in interaction_data 
                    if interaction >= 1]

#### Redo the pipeline to further classify the positive interaction into the 4 categories (1,2,3,4)

In [25]:
train_data_f_2nd = "positive_interactions.csv"
#df_2nd = pd.read_csv(train_data_f_2nd, low_memory=False, quotechar="\"")
df_2nd = pd.DataFrame(interaction_data, columns=['sentence_text', 'ddi_label'])
df_train_2nd, df_dev_2nd = create_train_dev_test(df_2nd);
arr_train_feature_2nd, arr_dev_feature_2nd, feature_names = create_tfidf_tr_dev('tdidf', 
                                                                df_train_2nd,
                                                                df_dev_2nd)
nb_2nd = MultinomialNB()

nb_model_2nd = nb_2nd.fit(arr_train_feature_2nd, df_train_2nd.ddi_label)
nb_predictions_2nd = nb_model_2nd.predict(arr_dev_feature_2nd)
print(accuracy_score(df_dev_2nd.ddi_label, nb_predictions_2nd))
print(len(df_train_2nd))
print(len(df_dev_2nd))
print(f1_score(df_dev_2nd.ddi_label, nb_predictions_2nd, average=None, pos_label = 1))

0.994805194805
1157
770
[ 0.98373984  0.99285714  0.99714286  1.        ]


#### Need to do 2-stage classification on test data too
I use the dev feature names and the dev model to predict the 2nd stage classification.

In [41]:
#train_data_f_2nd = "positive_interactions.csv"
#df_2nd = pd.read_csv(train_data_f_2nd, low_memory=False, quotechar="\"")
df_test_2nd = pd.DataFrame(interaction_data_test, columns=['sentence_text', 'ddi_label'])

#df_train_2nd, df_dev_2nd = create_train_dev_test(df_2nd);

arr_test_feature_2nd = create_tfidf_test('tdidf', df_test_2nd, feature_names)
#nb_2nd = MultinomialNB()

#nb_model_2nd = nb_2nd.fit(arr_train_feature_2nd, df_train_2nd.ddi_label)
# Using the fitted model from the dev training set:
nb_test_predictions_2nd = nb_model_2nd.predict(arr_test_feature_2nd)
print(accuracy_score(df_test_2nd.ddi_label, nb_test_predictions_2nd))
print(len(df_test_2nd))
print(len(nb_test_predictions_2nd))
from sklearn.metrics import f1_score
print(f1_score(df_test_2nd.ddi_label, 
               nb_test_predictions_2nd, average='macro', pos_label = 1))


0.998701298701
770
0
770
0.998364753276


The f-score is really high because there is very little variation in the training set.

#### Let's create the new dataframe with our predictions! Then I'll change the values of the interactions in two stages. Finally, I'll compute the f-score using the original test set and the new predictions.

In [27]:
prepare_test_for_2nd_pass = list(zip(df_test['sentence_text'], nb_predictions_test))
df_test_prepare_for_2nd = pd.DataFrame(prepare_test_for_2nd_pass, columns=['sentence_text',
                                                                    'ddi_label'])

In [28]:
df_test_after_2nd_pass = df_test_prepare_for_2nd

In [29]:
test_after_2nd_pass = list(zip(df_test_2nd['sentence_text'],nb_test_predictions_2nd))
for sentence, interaction in test_after_2nd_pass:
    df_test_after_2nd_pass.loc[(df_test_after_2nd_pass.sentence_text == sentence),
                              'ddi_label'] = interaction

In [30]:
print(f1_score(df_test.ddi_label, df_test_after_2nd_pass.ddi_label, average=None, pos_label = 1))

[ 0.86119874  0.26268657  0.05442177  0.23415682  0.        ]


  'precision', 'predicted', average, warn_for)


### Summary of results:
The results after the 2-stage classification -- values went down slightly:

1-stage: [ 0.86119874  0.26785714  0.05442177  0.2344086   0.        ]

2-stage: [ 0.86119874  0.26268657  0.05442177  0.23415682  0.        ]

### Calculating the f-score

In [32]:
from sklearn.metrics import f1_score
print(f1_score(df_dev.ddi_label, knn_pred_dev, average=None, pos_label = 1))
#print(f1_score(df_test.ddi_label, knn_pred_test, average=None, pos_label = 1))

[ 0.90536023  0.19588639]
