In [1]:
import os
from Bio import SeqIO
from Bio.Seq import Seq

# Every input file below is opened relative to this working directory.
os.chdir('/home/ubuntu/data/')

intr_file = 'hg19_intr_clean.fa'
depl_file = 'hg19_depl_clean.fa'

# Number of leading bases kept from each record. The value never changes,
# so it is set once here rather than on every loop iteration.
cutoff = 500

e = 0  # count of record pairs read; stays 0 if either file yields no records
intr_seqs = []
depl_seqs = []
# Read both FASTA files in lockstep. zip() stops at the shorter input, so any
# surplus records in the longer file are dropped without notice.
for e, (intr, depl) in enumerate(zip(SeqIO.parse(intr_file, 'fasta'), SeqIO.parse(depl_file, 'fasta')), start = 1):
    intr_seqs.append(str(intr.seq)[:cutoff])
    depl_seqs.append(str(depl.seq)[:cutoff])

    # Progress report every 10,000 record pairs.
    if e % 10000 == 0:
        print('Finished ' + str(e) + ' entries')

Finished 10000 entries
Finished 20000 entries
Finished 30000 entries
Finished 40000 entries
Finished 50000 entries
Finished 60000 entries
Finished 70000 entries


In [2]:
# Pool the two sequence sets into one list: introgressed first, depleted second.
sequences = list(intr_seqs)
sequences.extend(depl_seqs)
len(sequences)

146728

In [3]:
def getKmers(sequence, size):
    """Return every overlapping k-mer of ``sequence``, upper-cased.

    Parameters
    ----------
    sequence : str
        Sequence to split into k-mers.
    size : int
        Length of each k-mer; must be at least 1.

    Returns
    -------
    list of str
        The ``len(sequence) - size + 1`` substrings of length ``size`` taken
        at successive offsets 0, 1, 2, ... in upper case. The list is empty
        when ``sequence`` is shorter than ``size``.

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1 (such values would otherwise silently
        yield empty or wrongly sized fragments).
    """
    if size < 1:
        raise ValueError('size must be a positive integer, got ' + repr(size))
    # Upper-case the sequence once instead of once per k-mer.
    upper_seq = sequence.upper()
    return [upper_seq[start:start + size] for start in range(len(upper_seq) - size + 1)]

In [4]:
# Length of the k-mers ("words") that each sequence is split into.
kmer_size = 10

print('Building Neanderthal introgressed sequences')
intr_sentences = [getKmers(seq, kmer_size) for seq in intr_seqs]

print('Building Neanderthal depleted sequences')
depl_sentences = [getKmers(seq, kmer_size) for seq in depl_seqs]

print('Building merged Neanderthal introgressed and depleted sequences')
# `sequences` is intr_seqs followed by depl_seqs, so the merged corpus is just
# the two sentence lists concatenated; running getKmers over every sequence a
# second time would double the runtime and the memory used.
# The inner k-mer lists are shared with intr_sentences / depl_sentences.
sentences = intr_sentences + depl_sentences

Building Neanderthal introgressed sequences
Building Neanderthal depleted sequences
Building merged Neanderthal introgressed and depleted sequences


In [5]:
import warnings
# Install a catch-all "ignore" filter: every warning raised from here on is
# hidden. It is installed before gensim is imported.
warnings.simplefilter('ignore')

from gensim.models import Word2Vec

# Train one embedding on the merged corpus; k-mers seen fewer than
# two times are left out of the vocabulary, and four worker threads are used.
model = Word2Vec(sentences = sentences, min_count = 2, workers = 4)
print(model)

KeyboardInterrupt: 

In [None]:
# Embedding matrix with one row per vocabulary k-mer, in the iteration order of
# model.wv.vocab. Index the KeyedVectors (model.wv) rather than the model:
# Word2Vec.__getitem__ is deprecated in gensim 3.x and removed in gensim 4.
# NOTE: gensim >= 4 also renames `wv.vocab` to `wv.key_to_index`.
X = model.wv[model.wv.vocab]
X.shape

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Take the vocabulary once so row i of X and words[i] are guaranteed to refer
# to the same k-mer. Vectors are read through model.wv because
# Word2Vec.__getitem__ is deprecated in gensim 3.x and removed in gensim 4.
words = list(model.wv.vocab)
X = model.wv[words]
pca = PCA(n_components = 2)
result = pca.fit_transform(X)

# K-mers drawn large and green; every other k-mer is drawn small and red.
# NOTE(review): these are 5-mers, while the sentences in this notebook are
# built with getKmers(..., 10), so no vocabulary word can match them as
# written - confirm the intended k or update this set.
highlighted = {'AAAAA', 'CAAAA', 'CATTT', 'TTTTT'}

plt.figure(figsize = (20,18))
# No `cmap` here: matplotlib ignores it unless a `c` array is supplied.
plt.scatter(result[:, 0], result[:, 1], s = 10)
plt.title('Principal Components Analysis (PCA): All K-mers', fontsize = 20)
plt.xlabel("PC1", fontsize = 20)
plt.ylabel("PC2", fontsize = 20)
# Label every point with its k-mer.
for i, word in enumerate(words):
    if word in highlighted:
        plt.text(result[i, 0], result[i, 1], word, fontsize = 30, c = 'green')
    else:
        plt.text(result[i, 0], result[i, 1], word, fontsize = 10, c = 'red')
plt.show()

In [None]:
from sklearn.manifold import TSNE

# K-mers drawn large and green; every other k-mer is drawn small and red.
# NOTE(review): these are 5-mers, while the sentences in this notebook are
# built with getKmers(..., 10), so no vocabulary word can match them as
# written - confirm the intended k or update this set.
highlighted = {'AAAAA', 'CAAAA', 'CATTT', 'TTTTT'}

plt.figure(figsize=(20, 15))
# Reduce the embedding matrix X (built in an earlier cell) to 5 principal
# components first, then run tSNE on that reduced matrix.
X_reduced = PCA(n_components = 5).fit_transform(X)
tsne_model = TSNE(learning_rate = 500, n_components = 2, random_state = 123, perplexity = 30)
tsne = tsne_model.fit_transform(X_reduced)
# No `cmap` here: matplotlib ignores it unless a `c` array is supplied.
plt.scatter(tsne[:, 0], tsne[:, 1], s = 10)
plt.title('tSNE on PCA: All K-mers', fontsize = 20)
plt.xlabel("tSNE1", fontsize = 20)
plt.ylabel("tSNE2", fontsize = 20)
# Same iteration order as the one used to build X, so tsne[i] belongs to words[i].
words = list(model.wv.vocab)
for i, word in enumerate(words):
    if word in highlighted:
        plt.text(tsne[i, 0], tsne[i, 1], word, fontsize = 30, c = 'green')
    else:
        plt.text(tsne[i, 0], tsne[i, 1], word, fontsize = 10, c = 'red')
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from gensim.models import Word2Vec

# No earlier cell creates model_intr / model_depl, so on a fresh kernel this
# cell used to stop with a NameError. Train one model per sequence class here,
# with the same settings as the merged model.
model_intr = Word2Vec(intr_sentences, min_count = 2, workers = 4)
model_depl = Word2Vec(depl_sentences, min_count = 2, workers = 4)

# Take each vocabulary once so that row i of the matrix and words_*[i] refer to
# the same k-mer. Vectors are read through `.wv` because Word2Vec.__getitem__
# is deprecated in gensim 3.x and removed in gensim 4.
words_intr = list(model_intr.wv.vocab)
X_intr = model_intr.wv[words_intr]
pca_intr = PCA(n_components = 2)
result_intr = pca_intr.fit_transform(X_intr)

words_depl = list(model_depl.wv.vocab)
X_depl = model_depl.wv[words_depl]
pca_depl = PCA(n_components = 2)
result_depl = pca_depl.fit_transform(X_depl)

# NOTE(review): the two point clouds come from separately trained models and
# separately fitted PCAs, so their axes are not the same coordinate system;
# overlaying them allows a comparison of overall shape only.
plt.figure(figsize = (20,18))
# Point colours match the label colours below (red = introgressed,
# blue = depleted). `cmap` is omitted because matplotlib ignores it unless a
# `c` array is supplied.
plt.scatter(result_intr[:, 0], result_intr[:, 1], s = 10, c = 'red', label = 'Introgressed')
plt.scatter(result_depl[:, 0], result_depl[:, 1], s = 10, c = 'blue', label = 'Depleted')
# The figure shows both classes, so the title names both.
plt.title('Principal Components Analysis (PCA): Neanderthal introgressed vs. depleted K-mers', fontsize = 20)
plt.xlabel("PC1", fontsize = 20)
plt.ylabel("PC2", fontsize = 20)
plt.legend(fontsize = 20)
for i_intr, word_intr in enumerate(words_intr):
    plt.text(result_intr[i_intr, 0], result_intr[i_intr, 1], word_intr, fontsize = 5, c = 'red')
for i_depl, word_depl in enumerate(words_depl):
    plt.text(result_depl[i_depl, 0], result_depl[i_depl, 1], word_depl, fontsize = 5, c = 'blue')
plt.show()