## Imports (Pandas, Numpy, Spacy, NLTK, Gensim) 

In [1]:
import pandas as pd
import numpy as np
import spacy
spacy.load('en')
from spacy.lang.en import English

import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
en_stops = stopwords.words('english')

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
import random
from gensim import corpora
import pickle
import gensim
import pyLDAvis.gensim
import ensembl_rest

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jacky\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Create a dataframe object from given dataset 

In [2]:
#df = pd.read_csv('all_genes.txt', sep='\t', comment='#', low_memory=False, header=0)
#df = pd.read_csv('all_genes.txt')
df1 = pd.read_csv('all_genes.txt', error_bad_lines=False, sep='\t', comment='#', low_memory=False, header=0)
df1.head()

Unnamed: 0,tax_id,Org_name,GeneID,CurrentID,Status,Symbol,Aliases,description,other_designations,map_location,chromosome,genomic_nucleotide_accession.version,start_position_on_the_genomic_accession,end_position_on_the_genomic_accession,orientation,exon_count,OMIM,Unnamed: 17
0,9606,Homo sapiens,10320,0,live,IKZF1,"CVID13, Hs.54452, IK1, IKAROS, LYF1, LyF-1, PP...",IKAROS family zinc finger 1,DNA-binding protein Ikaros|CLL-associated anti...,7p12.2,7,NC_000007.14,50303453.0,50405101.0,plus,15.0,603023,
1,9606,Homo sapiens,84159,0,live,ARID5B,"DESRT, MRF-2, MRF2",AT-rich interaction domain 5B,AT-rich interactive domain-containing protein ...,10q21.2,10,NC_000010.11,61901254.0,62096948.0,plus,12.0,608538,
2,9606,Homo sapiens,1053,0,live,CEBPE,"C/EBP-epsilon, CRP1",CCAAT enhancer binding protein epsilon,CCAAT/enhancer-binding protein epsilon|CCAAT/e...,14q11.2,14,NC_000014.9,23117306.0,23119611.0,minus,2.0,600749,
3,9606,Homo sapiens,1644,0,live,DDC,AADC,dopa decarboxylase,aromatic-L-amino-acid decarboxylase|dopa decar...,7p12.2-p12.1,7,NC_000007.14,50458436.0,50565460.0,minus,18.0,107930,
4,9606,Homo sapiens,7102,0,live,TSPAN7,"A15, CCG-B7, CD231, DXS1692E, MRX58, MXS1, TAL...",tetraspanin 7,tetraspanin-7|CD231 antigen|T-cell acute lymph...,Xp11.4,X,NC_000023.11,38561478.0,38688918.0,plus,8.0,300096,


## Filter Dataframe for relevant data for LDA

In [3]:
df2 = pd.read_csv('all_genes.txt',
                         sep='\t', comment='#', low_memory=False,
                 usecols = ["other_designations","Symbol"],
                         header = 0 )
df2.head()

Unnamed: 0,Symbol,other_designations
0,IKZF1,DNA-binding protein Ikaros|CLL-associated anti...
1,ARID5B,AT-rich interactive domain-containing protein ...
2,CEBPE,CCAAT/enhancer-binding protein epsilon|CCAAT/e...
3,DDC,aromatic-L-amino-acid decarboxylase|dopa decar...
4,TSPAN7,tetraspanin-7|CD231 antigen|T-cell acute lymph...


## LDA (find 5 topics)

In [5]:
parser = English()

# Divide data into atomic elements
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

# Lemmatize words 
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

custom_stop = ['sapiens', 'protein', 'homo', '9606', 'single', 'minus', 'plus', 'homeobox', 'human', 'binding'
              'strand', 'nc_001460.1']
custom_stop.extend(en_stops)

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in custom_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

print(df2)
text_data = []
for i in range(0, len(df2)-1):
    tokens = prepare_text_for_lda(i)
    if random.random() > .99:
        text_data.append(tokens)
            
# create a dictionary from the data
# convert it into a 'bag of words' corpus
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

# define number of topics
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, 
                                           passes=15)
ldamodel.save('model5.gensim')

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

           Symbol                                 other_designations
0           IKZF1  DNA-binding protein Ikaros|CLL-associated anti...
1          ARID5B  AT-rich interactive domain-containing protein ...
2           CEBPE  CCAAT/enhancer-binding protein epsilon|CCAAT/e...
3             DDC  aromatic-L-amino-acid decarboxylase|dopa decar...
4          TSPAN7  tetraspanin-7|CD231 antigen|T-cell acute lymph...
5            LALL                                                NaN
6           NALT1  MIR4674 host gene (non-protein coding)|MIR4674...
7           BLACE                                                NaN
8          CDKN2A  cyclin-dependent kinase inhibitor 2A|CDK4 inhi...
9           PDE4B  cAMP-specific 3',5'-cyclic phosphodiesterase 4...
10           IL15                                     interleukin-15
11           TP63  tumor protein 63|amplified in squamous cell ca...
12          GATA3  trans-acting T-cell-specific transcription fac...
13          NCOA3  nuclear recepto

TypeError: object of type 'int' has no len()

## LDA with 3 topics 

In [None]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=50, update_every = 3)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

## LDA with 10 topics

In [None]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=10, update_every = 3)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=6)
for topic in topics:
    print(topic)

## Draw the distribution using pyLDAvis  

In [None]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [None]:
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

In [None]:
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)