### What is this?
What are you doing in this notebook

In [201]:
import pandas as pd
import turicreate as tc
import numpy
from pycm import *
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

Using TensorFlow backend.
  EagerTensor = c_api.TFE_Py_InitEagerTensor(_EagerTensorBase)
  if d.decorator_argspec is not None), _inspect.getargspec(target))


### Dataset with annotated classes to train the model

In [16]:
input_ncbi = pd.read_csv('../datasets/NCBIfullset.tsv', sep = '\t', header = None)
input_ncbi.columns = ['pubmed_id','value','class','id_ontology']

In [28]:
table_all_diseases = input_ncbi[~input_ncbi['value'].isnull()]

#### Creating a dataset for diseases 
*Example*

In [9]:
table_all_diseases.head()

Unnamed: 0,pubmed_id,value,class,id_ontology
2,10192393,skin tumour,DiseaseClass,D012878
3,10192393,cancer,DiseaseClass,D009369
4,10192393,colon cancers,DiseaseClass,D003110
5,10192393,adenomatous polyposis coli,SpecificDisease,D011125
6,10192393,APC,SpecificDisease,D011125


In [13]:
input_ncbi.head()

Unnamed: 0,pubmed_id,value,class,id_ontology
0,10192393|t|A common human skin tumour is cause...,,,
1,10192393|a|WNT signalling orchestrates a numbe...,,,
2,10192393,skin tumour,DiseaseClass,D012878
3,10192393,cancer,DiseaseClass,D009369
4,10192393,colon cancers,DiseaseClass,D003110


#### Creating a dataset for titles and abstracts

In [40]:
raw = input_ncbi[~input_ncbi['pubmed_id'].isnull()]
vec_raw = raw[raw['value'].isnull()]['pubmed_id']
vec_split = []
for line in vec_raw:
    vec_split.append(line.split("|"))

In [43]:
corpus = pd.DataFrame(vec_split)
corpus.columns = ['pubmed_id','type','text']

In [48]:
df_titles = corpus[corpus['type'] == 't'][['pubmed_id','text']]
df_abstracts = corpus[corpus['type'] == 'a'][['pubmed_id','text']]

In [57]:
#pd.pivot_table(corpus, values=None, index='pubmed_id', columns='type')

#### Creating unique classes

If the same disease name is annotated to more than one ontology classes - then a new synthetic class is assigned which is the one which occurs more number of times (Consensus Approach)

In [170]:
def create_class_table(table_all_diseases):
    df_diseases = pd.pivot_table(table_all_diseases, values = 'pubmed_id', index='value', columns='class', aggfunc= 'count').reset_index()
    df_diseases['max_value'] = df_diseases[["CompositeMention", "DiseaseClass", "Modifier","SpecificDisease"]].max(axis=1)
    matching = []
    for index, row in df_diseases.iterrows():
        if (row['CompositeMention']==row['max_value']):
            matching.append('CompositeMention')
        elif (row['DiseaseClass']==row['max_value']):
            matching.append('DiseaseClass')
        elif (row['Modifier']==row['max_value']):
            matching.append('Modifer')
        else:
            matching.append('SpecificDisease')
    df_diseases['synthetic_class'] = matching
    return df_diseases

In [171]:
df_diseases = create_class_table(table_all_diseases)

In [238]:
grouped = df_diseases.groupby('synthetic_class').count()['value']

In [240]:
grouped/grouped.sum()

synthetic_class
CompositeMention    0.052434
DiseaseClass        0.267322
Modifer             0.103933
SpecificDisease     0.576311
Name: value, dtype: float64

Now that we have the dataframe, with classes let's construct features from the corpus

### Feature generation using topic modeling
#### Warning: Does not work in windows without Docker

In [243]:
#sf_diseases = tc.SFrame(data=df_diseases[['value','synthetic_class']])
sf_diseases = tc.SFrame(data=df_diseases[df_diseases['synthetic_class'] != 'CompositeMention'][['value','synthetic_class']])

In [244]:
disease_classes = [i for i in sf_diseases['synthetic_class'].unique()]

In [245]:
print('The diseases classes are: {}'.format(disease_classes))

The diseases classes are: ['Modifer', 'SpecificDisease', 'DiseaseClass']


In [247]:
#sf_diseases

#### Create Feature using topic modeling

In [248]:
def create_features(category_df, n_features):
    # Remove stopwords and convert to bag of words
    doc = tc.text_analytics.count_words(category_df['value'])
    doc = doc.dict_trim_by_keys(tc.text_analytics.stopwords(), True)
    
    # Learn topic model
    model = tc.topic_model.create(doc, verbose=False)
    # Agreaggate the unique words
    sf_topics = model.get_topics()
    #append the topics from abstract text
    
#    sf_topics = sf_topics.append(text_topics)
    sf_words = sf_topics.groupby(key_column_names='word', operations={'sum_scores': tc.aggregate.SUM('score')})
    
    # Sort the features scores and filter out all those which are key
    sf_words = sf_words.sort('sum_scores', ascending= False).filter_by(disease_classes, 'word', exclude=True)
    
    # Take a look of the features related with this key class
    features = [i for i in sf_words['word']][0:n_features] #changable
    return sf_words, features

In [249]:
#sf_diseases

In [250]:
sf_words, features = create_features(sf_diseases, 40)

### Make input matrix

In [251]:
def get_input_matrix(features, category_df, sf_words):
    tuples = []
    for word in features:
        feature_vector = [1 if (word in i) else 0 for i in category_df['value']]
        tuples.append((word, feature_vector))
        
    sf_features = tc.SFrame({key: value for (key, value) in tuples})
    #concatenating the features with the category matrix
    category_df = category_df.add_row_number()
    sf_features = sf_features.add_row_number()
    final_table = category_df.join(sf_features, on='id', how='left')
    for f in features:
        score = sf_words[sf_words['word'] == str(f)]['sum_scores'].astype(float)[0]
        final_table[str(f)] = [(1.0+score) * i for i in final_table[str(f)]]
        
    return final_table

In [252]:
input_matrix = get_input_matrix(features, sf_diseases, sf_words)

In [253]:
input_matrix = input_matrix.to_dataframe()

## Multicalss classification using keras

In [254]:
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

In [255]:
epochs=300
batch_size=2

In [256]:
# load dataset
#dataframe = pandas.read_csv("feature_matrix_text_topics.csv")
dataset = input_matrix.values

In [257]:
X = dataset[:,3:].astype(float) #features
Y = dataset[:,2] #target  

In [258]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [259]:
X.shape[1]

38

In [270]:
list(set(Y))

['SpecificDisease', 'DiseaseClass', 'Modifer']

In [260]:
#dummy_y

In [261]:
# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(20, input_dim=39, activation='relu')) #dimension is number of features
    model.add(Dense(3, activation='sigmoid')) #no of classes
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adamax', metrics=['accuracy'])
    return model

In [277]:
# define baseline model
def second_model():
    # create model
    model = Sequential()
    #dimension is number of features
    model.add(Dense(20, input_dim=X.shape[1], activation='relu'))
    model.add(Dense(20, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(20, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(len(list(set(Y))), activation='sigmoid')) #no of classes
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adamax', metrics=['accuracy'])
    return model

In [278]:
#estimator = KerasClassifier(build_fn=second_model, epochs=10, batch_size=5, verbose=0) #baseline

In [279]:
estimator = KerasClassifier(build_fn=second_model, epochs=epochs, batch_size=batch_size, verbose=0)

In [280]:
#kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

In [281]:
#results = cross_val_score(estimator, X, dummy_y, cv=kfold)
#print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [282]:
estimator.fit(X,dummy_y)

  if d.decorator_argspec is not None), _inspect.getargspec(target))


<keras.callbacks.History at 0x1a1c35f898>

In [283]:
y_predict = estimator.predict(X, batch_size=2, verbose=1, steps=None) 



In [284]:
cm = ConfusionMatrix(actual_vector=encoded_Y, predict_vector=y_predict) # Create CM From Data
cm.classes

[0, 1, 2]

In [285]:
cm.table
print(cm)

Predict          0        1        2        
Actual
0                219      0        352      
1                0        0        222      
2                38       0        1193     




Overall Statistics : 

95% CI                                                           (0.67762,0.71764)
Bennett_S                                                        0.54644
Chi-Squared                                                      None
Chi-Squared DF                                                   4
Conditional Entropy                                              0.39184
Cramer_V                                                         None
Cross Entropy                                                    None
Gwet_AC1                                                         0.62046
Joint Entropy                                                    1.69293
KL Divergence                                                    None
Kappa                                                           