In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model

In [None]:
#kmer_size tags this run: it selects the kmer list file to load and names the
#processed data (and the model trained from it)
kmer_size = "4mer"

In [None]:
#load in the data - files are merged along the rep axis, and across the design axis
#ie sort -m HepG2_ScaleUpDesign1_minP_mRNA_Rep1.counts HepG2_ScaleUpDesign2_minP_mRNA_Rep2.counts > HepG2_minP_Rep2.counts

Data_path = "/u/home/m/mudiyang/scratch/Kmer_Regressor_Data"

#input file locations, all rooted at Data_path
sequences_path = "{}/Sequences/all.sequences".format(Data_path)
rna_path       = "{}/RNA/HepG2_minP_Rep1.counts".format(Data_path)
dna_path       = "{}/DNA/minP_DNA.counts".format(Data_path)
kmer_path      = "{}/{}.txt".format(Data_path, kmer_size)

#every table is tab-separated, has no header row, and uses its first column as the index
sequences = pd.read_csv(sequences_path, sep = '\t', index_col = 0, header = None)
sequences.columns = ['sequence']

rna = pd.read_csv(rna_path, sep = '\t', index_col = 0, header = None)
dna = pd.read_csv(dna_path, sep = '\t', index_col = 0, header = None)

In [None]:
#normalize data
#add a pseudocount to smooth the data. The smoothed tables get their own names so
#that re-running this cell does not keep stacking pseudocounts onto rna/dna.
rna_smoothed = rna + 1
dna_smoothed = dna + 1


#get sums of RNA and DNA counts (one total per column)
rna_sum = rna_smoothed.sum(axis = 0)

#I am including the strange multiply dna_sum by two step,
#but we can drop the "* 2" if we decide that does not make sense.
dna_sum = dna_smoothed.sum(axis = 0) * 2


#generate normalized counts: each smoothed count divided by its column total
rna_counts = rna_smoothed / rna_sum
dna_counts = dna_smoothed / dna_sum

#the smoothed copies are only needed inside this cell
del rna_smoothed, dna_smoothed

In [None]:
#score each tile as log2(normalized RNA counts / normalized DNA counts).
#These scores are the labels the regressor is trained on.
tile_scores = np.log2(rna_counts.div(dna_counts))


#store the scores in a 'label' column of the sequences dataframe
sequences['label'] = tile_scores

#release the intermediates that are no longer needed
del rna, dna
del rna_sum, dna_sum
del rna_counts, dna_counts
del tile_scores

In [None]:
#generate kmer encoding for each tile
#first, load up the kmers (comma-separated, no header) and flatten them into a 1-D array
kmers = pd.read_csv(kmer_path, header = None, index_col = None, sep = ',')
kmers = np.array(kmers).flatten()


#loop through each element of sequences['sequence'] and count instances of each kmer substring.
#Iterating over the column walks the rows in order regardless of what the index labels are;
#an integer key on the Series (sequences['sequence'][i]) would be a label lookup on an
#integer index and is a deprecated positional fallback otherwise.
#NOTE(review): str.count counts NON-overlapping occurrences (e.g. 'AAAAA'.count('AAAA') == 1).
#Left unchanged so the encoding stays identical - confirm this is the intended kmer count.
array = []
last_position = len(sequences) - 1

for i, sequence in enumerate(sequences['sequence']):
    #progress report every 10000 tiles and on the final tile
    if (i % 10000 == 0) or (i == last_position):
        print(i)
    array.append([sequence.count(kmer) for kmer in kmers])

#the raw sequences are not needed once they are encoded
sequences = sequences.drop(columns=['sequence'])
print("sequence column dropped")

#turn the resulting array into a numpy object and add it to the sequences dataframe
#(one list of kmer counts per row)
array = np.asarray(array)
print("encodings converted to numpy array")
sequences['encoding'] = array.tolist()
print("encodings added to sequences dataframe")

del array

In [None]:
#pickle the data needed to train the regressor with regressor_trainer
processed_dir = '%s/Processed_Data'%Data_path

#make sure the output directory exists so the write cannot fail (and lose the
#long-running encoding step) just because the folder is missing
os.makedirs(processed_dir, exist_ok=True)

sequences.to_pickle('%s/%s.pkl'%(processed_dir, kmer_size))
del sequences