# Final Project
## Nicolas A Gort Freitas
## Ontological classification of genetic mutations



## Introduction

asdasdasdas

### Dependencies

In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline

from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords

pd.set_option('max_colwidth',75)

### Latent Dirichlet Allocation
Auxiliary functions

In [None]:
def count_matrix(words):
    vec = CountVectorizer(max_df=100, min_df=1, stop_words='english', max_features=50)
    X = vec.fit_transform([" ".join(w) for w in words])
    vocab = vec.get_feature_names()

    count_matrix = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
    
    return count_matrix

def topic_identifier(words_from_chapters, n_topics=10,top_topics=10, dictionary = False):
    vec = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    X = vec.fit_transform([" ".join(w) for w in words_from_chapters])
    vocab = vec.get_feature_names()

    count_matrix = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

    lda = LatentDirichletAllocation(n_components=n_topics)
    lda.fit(count_matrix)

    topic_words = {}

        
    for topic, comp in enumerate(lda.components_):   
        word_idx = np.argsort(comp)[::-1][:top_topics]

        # store the words most relevant to the topic
        topic_words[topic] = [vocab[i] for i in word_idx]

    
    if dictionary:
            return topic_words
        
    for topic, words in topic_words.items():
        print('Topic: %d' % topic)
        print('  %s' % ', '.join(words))
        
    

### Importing data 

In [4]:
training_variants = pd.read_csv('training_variants', skiprows = 1, names=['ID','Gene','Variation','Class'])#.set_index('ID')
training_text = pd.read_csv('training_text', sep = '\|\|', header = None, skiprows = 1, 
                            names = ['ID', 'Text'], engine = 'python', encoding = 'utf-8').set_index('ID')

training_joint = training_variants.set_index('ID').join(training_text, how='right')

y = training_variants.loc[:,['ID','Class']].set_index('ID')
training_variants = training_variants.loc[:,['ID','Gene','Variation']].set_index('ID')

In [5]:
X_train = training_variants.join(training_text, how='right')

# What is each class related to?

# Supervised topic modeling through Latent Dirichlet Allocation

### Preprocessing: Deleting rows without text

In [8]:
training_text = training_text[training_text.Text.isna() == False]
training_joint = training_joint[training_joint.Text.isna() == False]

In [7]:

#X_train.loc[:,['ID','Gene','Variation','']

### Attempt 1: Classes as documents

In [29]:
text_by_class = training_joint.groupby('Class').agg(lambda x : ' '.join(x).split(' ')).Text

In [69]:
topic_identifier(text_by_class, n_topics=9, top_topics=10)



Topic: 0
  vus, brca2, brct, odds, erbb2, vuss, smad2, mlh1, bard1, causality
Topic: 1
  alk, imatinib, gefitinib, brca2, tsc2, erbb2, vus, brct, smad3, d1
Topic: 2
  alk, brct, imatinib, gefitinib, brca2, ar, f3, ba, jak2, tsc2
Topic: 3
  tsc2, smad3, brct, smad2, brca2, sf3b1, idh1, tsc1, tgf, vhl
Topic: 4
  alk, imatinib, vus, brca2, gefitinib, brct, ar, d1, spop, smad2
Topic: 5
  imatinib, alk, gefitinib, brct, tsc2, nrf2, d1, ar, smad3, brca2
Topic: 6
  alk, imatinib, gefitinib, d1, nrf2, ba, f3, ar, jak2, erlotinib
Topic: 7
  imatinib, alk, gefitinib, ros1, d1, smo, erbb2, ret, fgfr3, gist
Topic: 8
  spop, p16ink4a, brct, tsc2, nf1, tsc1, smad3, p16, vhl, ar


Judging by these results, which include multiple repetitions across topics, we cannot use LDA to learn about the topic of each class.

### Attempt 2:  Genes as documents with one topic given each class

In [51]:
documents_per_class = ['']*9
for clss in range(1,10):
    documents_per_class[clss-1] = training_joint.query('@training_joint.Class == {}'.format(clss)).groupby('Gene').agg(lambda x : ' '.join(x).split(' ')).Text


In [80]:
for clss in range(1,10):
    print('Class {} '.format(clss),topic_identifier(documents_per_class[clss], n_topics=1, top_topics=10))



Topic: 0
  p53, fig, et, al, tsc2, binding, wild, cancer, mutants, tumor
Class 1  None




Topic: 0
  mutations, patients, mutation, et, al, figure, fig, kinase, egfr, tumor
Class 2  None




Topic: 0
  brca1, al, variants, et, cancer, alk, mtor, mutants, assay, domain
Class 3  None




Topic: 0
  pten, et, al, activity, fig, brca1, variants, cancer, binding, p53
Class 4  None




Topic: 0
  brca1, variants, functional, fig, vus, assays, domain, neutral, figure, binding
Class 5  None




Topic: 0
  brca1, variants, cells, fig, deleterious, binding, activity, dna, domain, kinase
Class 6  None




Topic: 0
  patients, fig, egfr, et, al, tumor, kit, domain, tumors, ras
Class 7  None
Topic: 0




  al, et, figure, fig, genes, idh1, mutation, akt, h3, samples
Class 8  None
Topic: 0
  mutations, mutant, sf3b1, figure, splicing, et, al, idh1, genes, 2hg
Class 9  None




Conclusions. There is not much 

Classes:
* 1: 