# In [None]:
from random import choice
from string import ascii_uppercase
import math
import time
from swalign import swalign
from scipy.stats import beta
import matplotlib.pyplot as plt
import numpy as np
from scipy.integrate import simps
from numpy import trapz
import random
import operator
import itertools
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import os
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from sklearn.cluster import KMeans


# ****** functions ********

def ReadDataBase(_path, n):
    """Load FASTA sequences from the files found in a directory.

    Args:
        _path: Directory holding the database files. A trailing path
            separator is optional.
        n: Read limit; ``0`` (or any non-positive value) disables it. When
            positive, at most ``n`` files are opened and at most ``n``
            records are taken from any single file; reading stops
            altogether after a file that reaches the per-file cap.

    Returns:
        A list with the ``seq`` attribute of every record read, in reading
        order.
    """
    limited = n > 0
    seqList = []
    files_read = 0

    # Sorted so the same files are chosen on every run; os.listdir makes no
    # promise about ordering.
    for file_name in sorted(os.listdir(_path)):
        hit_record_cap = False
        # os.path.join works whether or not _path ends with a separator.
        records = SeqIO.parse(os.path.join(_path, file_name), "fasta")
        for count, seq_record in enumerate(records, start=1):
            seqList.append(seq_record.seq)
            if limited and count >= n:
                hit_record_cap = True
                break

        files_read += 1
        if limited and (hit_record_cap or files_read >= n):
            break

    return seqList

def CreateDictionary(n):
    """Map every length-``n`` word over the alphabet ``ACGT`` to an index.

    Words are numbered from 0 in the order ``itertools.product`` yields
    them (all combinations with repetition). The mapping is used to index
    feature vectors.
    """
    words = itertools.product("ACGT", repeat=n)
    return {''.join(letters): position for position, letters in enumerate(words)}

def FeatureVector(dictionary, sequence, n):
    """Count how often each length-``n`` word occurs in ``sequence``.

    Args:
        dictionary: Mapping from word to its slot in the result, e.g. the
            output of ``CreateDictionary(n)``.
        sequence: String to scan with a sliding window of width ``n``.
        n: Window width; the result has ``4**n`` slots.

    Returns:
        A list of ``4**n`` integer counts. Windows that are not keys of
        ``dictionary`` (for example ones containing a letter outside the
        alphabet) are skipped, and a sequence shorter than ``n`` yields
        all zeros.
    """
    arr = [0] * 4 ** n

    # Only full-width windows are counted; the range is empty when the
    # sequence is shorter than n.
    for start in range(len(sequence) - n + 1):
        # Look the word up in the mapping passed by the caller, not in a
        # module-level global.
        slot = dictionary.get(sequence[start:start + n])
        if slot is not None:
            arr[slot] += 1

    return arr

def readfile(filename):
    """Return the text of ``filename`` split on newline characters.

    The file is opened in text mode and closed before returning. A file
    ending in a newline produces a trailing empty string in the result.
    """
    with open(filename, 'r') as handle:
        return handle.read().split('\n')
    
    
def randomword(length):
    """Return a random string of ``length`` letters drawn from ``ACGT``.

    length: string length
    """
    letters = []
    for _ in range(length):
        letters.append(choice('ACGT'))
    return ''.join(letters)

def lakeString(size, lakeMinLen, lakeMaxLen):
    """Return a list of ``size`` random strings.

    size: how many strings the list holds
    lakeMinLen: minimum string length (inclusive)
    lakeMaxLen: maximum string length (inclusive)
    """
    samples = []
    for _ in range(size):
        # The generator is re-seeded before every draw.
        random.seed()
        # Pick this sample's length, then build a random word of that length.
        sample_length = random.randint(lakeMinLen, lakeMaxLen)
        samples.append(randomword(sample_length))
    return samples

# ******************************************************* main ****************************************************
# Load the three sequence collections; every call gets the same read limit.
DB_READ_LIMIT = 50

print("reading viruses...")
known_viruses = ReadDataBase("../database/virus/", DB_READ_LIMIT)

print("reading bacterias...")
known_bacterias = ReadDataBase("../database/bact/", DB_READ_LIMIT)

print("reading lake samples...")
lake = ReadDataBase("../database/lake/", DB_READ_LIMIT)

print("finished reading data")

# Matrix with one length-normalised word-count vector per sequence.
# Row order is lake samples, then viruses, then bacteria; the clustering
# code further down slices the rows by these group sizes, so the order
# must be kept.
n = 4
D = CreateDictionary(n)
matrix = []

for group_label, sequences in (
    ("lake: ", lake),
    ("vir: ", known_viruses),
    ("bact: ", known_bacterias),
):
    print(group_label, len(sequences))
    for w in sequences:
        arr = np.array(FeatureVector(D, str(w), n))
        # Scale the counts by sequence length. max(..., 1) keeps an empty
        # record as an all-zero row instead of a 0/0 NaN row.
        arr = np.divide(arr, max(len(w), 1))
        matrix.append(arr)
    

    
#### PCA
# Project the feature matrix (rows = samples, columns = features) onto its
# leading principal components and report the share of variance they keep.
pca_components = 3
pca = PCA(n_components=pca_components)

X = np.array(matrix)
Xhat = pca.fit_transform(X)

explained_share = sum(pca.explained_variance_ratio_)
print("Percentage of represented variance: ", explained_share)

# print(len(matrix))
# len_lake = len(lake)
# len_viruses = len(known_viruses)
# data = np.transpose(Xhat)
# data1 = data[:, :len_lake]
# data2 = data[:, len_lake:(len_lake+len_viruses)]
# data3 = data[:, (len_lake+len_viruses):]

# # data1 = data[:, :len_viruses]
# # data2 = data[:, len_viruses:]
# # data3 = data[:, len_viruses:]

# print(data1.shape)
# print(data2.shape)
# print(data3.shape)

# ####### Plot results
# if pca_components == 2:
    # plt.plot(data1[0], data1[1], 'go')
    # plt.plot(data2[0], data2[1], 'ro')
    # plt.plot(data3[0], data3[1], 'bo')
# elif pca_components == 3:
    # fig = plt.figure()
    # ax = fig.add_subplot(111, projection='3d')
    # ax.scatter(data1[0], data1[1], data1[2], c='g', depthshade=False)
    # ax.scatter(data2[0], data2[1], data2[2], c='r', depthshade=False)
    # ax.scatter(data3[0], data3[1], data3[2], c='b', depthshade=False)
    
# plt.show()


#### K MEANS CLUSTERING
# Cluster the known viruses and bacteria into two groups (the lake rows at
# the front of the PCA output are left out), plot the result in 3D and
# report how well the clusters line up with the two known groups.
data = np.array(Xhat)

len_lake = len(lake)
len_viruses = len(known_viruses)

data_in = data[len_lake:, :]
estimator = KMeans(n_clusters=2)
estimator.fit(data_in)
labels = estimator.labels_

print(labels)
data_in_trans = np.transpose(data_in)

# Rows of data_in are ordered viruses first, then bacteria.
total = len(data_in)
is_virus = np.arange(total) < len_viruses
a = int(np.count_nonzero(is_virus))  # number of virus points
b = total - a                        # number of bacteria points

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# Colour encodes the known group (yellow = virus, blue = bacterium) and the
# marker encodes the assigned cluster (circle = 0, triangle = 1). One
# scatter call per combination instead of one call per point.
for group_mask, colour in ((is_virus, 'y'), (~is_virus, 'b')):
    for cluster, marker in ((0, 'o'), (1, '^')):
        chosen = group_mask & (labels == cluster)
        if chosen.any():
            ax.scatter(data_in_trans[0, chosen], data_in_trans[1, chosen],
                       data_in_trans[2, chosen], c=colour, marker=marker,
                       depthshade=False)

print(len_lake, len_viruses)
print("a/b: ", a, b)

# Cluster ids are arbitrary, so count the misassigned points under both
# possible cluster-to-group matchings and keep the better one. Using the
# real group sizes keeps this valid when the groups differ in size.
virus_ones = int(np.sum(labels[:len_viruses]))
bact_ones = int(np.sum(labels[len_viruses:]))
err_if_one_is_virus = (a - virus_ones) + bact_ones
err_if_zero_is_virus = virus_ones + (b - bact_ones)
err = min(err_if_one_is_virus, err_if_zero_is_virus)

# Express the result as a percentage of the points actually clustered.
accuracy = 100.0 * (total - err) / total if total else 0.0
print("accuracy: ", accuracy, "%")
plt.show()