In [None]:
import pandas as pd

### Reading the annotated file


In [None]:
# Read the tab-separated annotation file; header=None means the first line is
# treated as data, so the column names are attached explicitly afterwards.
input_ncbi = pd.read_csv('NCBIfullset.tsv', header=None, sep='\t')
input_ncbi.columns = [
    'pubmed_id',
    'value',
    'class',
    'id_ontology',
]

In [None]:
# Display the freshly loaded table for a visual check.
input_ncbi

In [None]:
# Keep only the rows that carry a value in the 'value' column.
has_value = input_ncbi['value'].notnull()
table_all_diseases = input_ncbi[has_value]
table_all_diseases

In [None]:
# Rows without a 'value' are not annotations: their first column holds a whole
# "pubmed_id|type|text" record, which is split into its three fields here.
vec_raw = input_ncbi[input_ncbi['value'].isnull()]['pubmed_id']
# maxsplit=2 keeps any '|' that occurs inside the text in the last field, so
# every record yields at most three parts.
vec_split = [line.split("|", 2) for line in vec_raw]

In [None]:
# Build one row per split record; the three parts are the PubMed id, the
# record type and the text (the next cell filters 'type' on 't' and 'a').
corpus = pd.DataFrame(vec_split)
corpus.columns = ['pubmed_id','type','text']

In [None]:
# Split the corpus by record type: 't' rows go to df_titles, 'a' rows to df_abstracts.
text_columns = ['pubmed_id', 'text']
df_titles = corpus.loc[corpus['type'] == 't', text_columns]
df_abstracts = corpus.loc[corpus['type'] == 'a', text_columns]

### Consensus Approach

If the same disease name is annotated with more than one ontology class, a new synthetic class is assigned to it: the class that occurs most often for that name.

In [None]:
# Count, for every disease name (rows), how many annotations fall into each
# class (columns).
table = table_all_diseases.pivot_table(
    values='pubmed_id',
    index='value',
    columns='class',
    aggfunc='count',
)
table

In [None]:
# Move the disease name out of the index into a regular column and add an
# empty placeholder column for the new synthetic class.
table = table.reset_index().assign(synthetic_class="")

In [None]:
# Working copy of the per-class annotation counts, taken from the pivot table.
ops = pd.DataFrame({
    class_name: table[class_name]
    for class_name in ("CompositeMention", "DiseaseClass", "Modifier", "SpecificDisease")
})

In [None]:
# Empty placeholder column plus, per row, the largest of the four class counts.
ops["new_class"] = ""
ops["max_max"] = ops.loc[
    :, ["CompositeMention", "DiseaseClass", "Modifier", "SpecificDisease"]
].max(axis=1)

In [None]:
# For every row pick the class whose count equals the row maximum. Ties are
# resolved by the order of the checks below (first match wins); a row that
# matches none of the first three falls through to 'SpecificDisease'.
lis = []
for index, row in ops.iterrows():
    if row['CompositeMention'] == row['max_max']:
        lis.append('CompositeMention')
    elif row['DiseaseClass'] == row['max_max']:
        lis.append('DiseaseClass')
    elif row['Modifier'] == row['max_max']:
        # Label must match the real class name exactly.
        lis.append('Modifier')
    else:
        lis.append('SpecificDisease')

In [None]:
#append the synthetic class
# `lis` holds one label per row of `ops`, in iteration order, so it is
# assigned positionally to the rows of `table`.
table["synthetic_class"] = lis

In [None]:
#export to csv
# NOTE(review): 'file name' looks like a placeholder path. A later cell loads
# 'diseases_all_consensus.csv' - confirm whether this export is meant to
# produce that file.
table.to_csv('file name')

### Feature generation using topic modeling
#### Warning: this section does not work on Windows without Docker

In [None]:
import turicreate as tc

In [None]:
#Load the data
# Loaded as a turicreate SFrame; later cells read its 'value' and
# 'synthetic_class' columns.
data = tc.SFrame('diseases_all_consensus.csv')

In [None]:
# Show the first row as a quick sanity check of the loaded columns.
data[0]

In [None]:
# Distinct synthetic class labels, materialised as a plain Python list.
disease_classes = list(data['synthetic_class'].unique())
disease_classes

#### Create Feature using topic modeling

In [None]:
# Remove stopwords and convert to bag of words
doc = tc.text_analytics.count_words(data['value'])
# The second argument (True) asks dict_trim_by_keys to drop the listed keys.
doc = doc.dict_trim_by_keys(tc.text_analytics.stop_words(), True)

In [None]:
# Learn topic model
# NOTE(review): `text_topics` is not defined in any earlier cell of this
# notebook. It is used here as if it had 'word' and 'score' columns - confirm
# where it is built before running this cell.
model = tc.topic_model.create(doc, initial_topics=text_topics['word'], verbose=False)
# Aggregate the unique words: stack the learned topics on top of text_topics
# and sum the scores per word.
sf_topics = model.get_topics()
sf_topics = sf_topics.append(text_topics)
sf_words = sf_topics.groupby(key_column_names='word', operations={'sum_scores': tc.aggregate.SUM('score')})

# Sort by summed score (highest first) and drop words that are class names
sf_words = sf_words.sort('sum_scores', ascending=False).filter_by(disease_classes, 'word', exclude=True)

In [None]:
# Print up to 50 rows and 2 columns of the ranked word table.
sf_words.print_rows(50,2)

In [None]:
def create_features(category_df, n_features):
    """Learn a topic model over the given texts and rank candidate feature words.

    Parameters
    ----------
    category_df
        The texts to model. Either the text column itself (the notebook passes
        ``data['value']``) or an SFrame that has a ``'value'`` column.
    n_features : int
        Maximum number of top-ranked words to return.

    Returns
    -------
    sf_words
        Table with columns ``'word'`` and ``'sum_scores'``, sorted by score in
        descending order, without the words listed in ``disease_classes``.
    features : list
        The first ``n_features`` words of ``sf_words``.

    Notes
    -----
    Reads the module-level names ``text_topics`` (used via its ``'word'``
    column and appended to the learned topics) and ``disease_classes``; both
    must exist before this function is called.
    """
    # Use the texts handed in by the caller instead of the global `data`.
    texts = category_df['value'] if isinstance(category_df, tc.SFrame) else category_df

    # Remove stopwords and convert to bag of words
    doc = tc.text_analytics.count_words(texts)
    doc = doc.dict_trim_by_keys(tc.text_analytics.stop_words(), True)

    # Learn topic model
    model = tc.topic_model.create(doc, initial_topics=text_topics['word'], verbose=False)

    # Aggregate the unique words: learned topics plus the topics from the
    # abstract text, with scores summed per word.
    sf_topics = model.get_topics()
    sf_topics = sf_topics.append(text_topics)
    sf_words = sf_topics.groupby(key_column_names='word', operations={'sum_scores': tc.aggregate.SUM('score')})

    # Sort by summed score (highest first) and drop words that are class names
    sf_words = sf_words.sort('sum_scores', ascending=False).filter_by(disease_classes, 'word', exclude=True)

    # Keep the top-ranked words as features
    features = list(sf_words['word'])[:n_features]
    return sf_words, features

In [None]:
# NOTE(review): len(sf_words) reads the sf_words built by the earlier
# exploratory cell, so this cell only runs after that one has been executed;
# the name is then rebound to the function's result.
sf_words, features = create_features(data['value'], len(sf_words))

### Make input matrix

In [None]:
def get_input_matrix(features, category_df, sf_words):
    """Build the weighted feature matrix for the rows of ``category_df``.

    Parameters
    ----------
    features : iterable of str
        Feature words; each becomes one column of the result.
    category_df
        Table with a ``'value'`` text column (the notebook passes ``data``).
        It must not already contain an ``'id'`` column, because a row-number
        column of that name is added for the join.
    sf_words
        Table with ``'word'`` and ``'sum_scores'`` columns giving the score
        of every feature word.

    Returns
    -------
    ``category_df`` with an ``'id'`` row-number column and one column per
    feature. A feature cell is ``1.0 + score`` when the word occurs in the
    row's text (substring test) and ``0`` otherwise.
    """
    # Use the texts of the table handed in by the caller, not the global `data`.
    texts = category_df['value']

    # One 0/1 indicator column per feature word.
    indicator_columns = {
        word: [1 if (word in text) else 0 for text in texts]
        for word in features
    }
    sf_features = tc.SFrame(indicator_columns)

    # Concatenate the features with the category table via a shared row number.
    category_df = category_df.add_row_number()
    sf_features = sf_features.add_row_number()
    final_table = category_df.join(sf_features, on='id', how='left')

    # Scale every indicator column by (1 + summed topic score of its word).
    for f in features:
        score = sf_words[sf_words['word'] == str(f)]['sum_scores'].astype(float)[0]
        final_table[str(f)] = [(1.0 + score) * i for i in final_table[str(f)]]

    return final_table

In [None]:
# Build the weighted feature table for the full dataset and display it.
input_matrix = get_input_matrix(features, data, sf_words)
input_matrix

In [None]:
# NOTE(review): the Keras section below reads 'feature_matrix_text_topics.csv',
# not this file name - confirm which file is the intended hand-over.
input_matrix.export_csv('feature_matrix_.csv')

## Keras model

In [None]:
import numpy
import pandas
from pycm import *
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [None]:
# fix random seed for reproducibility
# NOTE(review): this seeds NumPy's global RNG only; the Keras backend may keep
# its own random state - confirm if exact reproducibility is required.
seed = 7
numpy.random.seed(seed)

In [None]:
# Training hyper-parameters passed to KerasClassifier further down.
epochs=300
batch_size=2

In [None]:
# load dataset
dataframe = pandas.read_csv("feature_matrix_text_topics.csv")
dataset = dataframe.values
# Column 1 is the target; columns 2..79 (78 columns) are the features.
Y = dataset[:, 1]
X = dataset[:, 2:80].astype(float)

In [None]:
# encode class values as integers
encoder = LabelEncoder()
encoded_Y = encoder.fit(Y).transform(Y)
# one-hot encode the integer labels (one dummy column per class)
dummy_y = np_utils.to_categorical(encoded_Y)

In [None]:
# Display the one-hot encoded targets.
dummy_y

In [None]:
# define baseline model
def baseline_model(input_dim=39, n_classes=3):
    """Build and compile a one-hidden-layer classifier.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features (default 39, the value previously hard-coded).
    n_classes : int, optional
        Number of output classes (default 3, the value previously hard-coded).

    Returns
    -------
    A compiled ``Sequential`` model.
    """
    # create model
    model = Sequential()
    model.add(Dense(20, input_dim=input_dim, activation='relu'))
    # softmax yields a probability distribution over the classes, which is
    # what categorical cross-entropy on one-hot targets expects.
    model.add(Dense(n_classes, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adamax', metrics=['accuracy'])
    return model

In [None]:
# define the deeper model with dropout
def second_model(input_dim=78, n_classes=4):
    """Build and compile a three-hidden-layer classifier with dropout.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features (default 78, the value previously hard-coded).
    n_classes : int, optional
        Number of output classes (default 4, the value previously hard-coded).

    Returns
    -------
    A compiled ``Sequential`` model.
    """
    # create model
    model = Sequential()
    model.add(Dense(20, input_dim=input_dim, activation='relu'))
    model.add(Dense(20, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(20, activation='relu'))
    model.add(Dropout(0.5))
    # softmax yields a probability distribution over the classes, which is
    # what categorical cross-entropy on one-hot targets expects.
    model.add(Dense(n_classes, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adamax', metrics=['accuracy'])
    return model

In [None]:
#estimator = KerasClassifier(build_fn=second_model, epochs=10, batch_size=5, verbose=0) #baseline

In [None]:
# Wrap second_model as a scikit-learn style estimator, trained with the
# epochs / batch_size configured above and without progress output.
estimator = KerasClassifier(build_fn=second_model, epochs=epochs, batch_size=batch_size, verbose=0)

In [None]:
#kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

In [None]:
#results = cross_val_score(estimator, X, dummy_y, cv=kfold)
#print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
# Train on the full dataset (no hold-out split; the cross-validation cells
# above are commented out).
estimator.fit(X,dummy_y)

In [None]:
# Predict for the same inputs the model was fitted on, reusing the configured
# batch size instead of repeating its value.
y_predict = estimator.predict(X, batch_size=batch_size, verbose=1, steps=None)

In [None]:
# Compare the integer-encoded true labels with the predictions.
# NOTE(review): assumes y_predict uses the same integer coding as encoded_Y - confirm.
cm = ConfusionMatrix(actual_vector=encoded_Y, predict_vector=y_predict) # Create CM From Data
cm.classes

In [None]:
# NOTE(review): `cm.table` is not the last expression of the cell, so its
# value is not displayed; only the printed report below is shown.
cm.table
print(cm)