# Plan
Want to generate jackknife model data. 

## Parameters/Inputs/Outputs
input:
    1. Input henry's csv file with bacteria
    2. Output file name
    3. % in training set
    
output:
    1. Dictionary (or file) with training dataset + validation dataset
    
## How to accomplish it
Separate the rows into isotype buckets.
Then make liberal use of random.sample()
    * if no items in bucket pick None
    * if one item in bucket 50% chance of picking
    * if exactly 80% cannot be chosen (because there are fewer than 5 options),
    pick floor(.8*num)
    * if 80% can be chosen just draw 80%


In [40]:
# user inputs
# Fraction of each pool that is sampled: floor(testPercent * pool size) rows are drawn.
testPercent = 0.8
# Seed passed to random.seed() and used as random_state for DataFrame.sample().
randomSeed = 11
# CSV file that is loaded into `df`.
inputFileName = 'henry_improved_curatedBacteria.csv'
# Suffix of the output CSVs; files are written as "positive_" + this name
# and "negative_" + this name.
outputFileName = 'lactococcus_jackknife_set.csv'

In [20]:
import pandas as pd
import numpy as np
import functools
import random
import math

# Seed Python's `random` module so the random.randint() coin flips used when
# sampling single-row buckets are repeatable from run to run.
random.seed(randomSeed)

def conjunction(*conditions):
    """Fold all given conditions together with element-wise logical AND.

    The conditions are combined left to right with ``np.logical_and``.
    A single condition is returned unchanged; calling with no conditions
    raises ``TypeError`` (from ``functools.reduce``).
    """
    def _both(accumulated, mask):
        return np.logical_and(accumulated, mask)

    return functools.reduce(_both, conditions)
def orLogical(*conditions):
    """Fold all given conditions together with element-wise logical OR.

    The conditions are combined left to right with ``np.logical_or``.
    A single condition is returned unchanged; calling with no conditions
    raises ``TypeError`` (from ``functools.reduce``).
    """
    def _either(accumulated, mask):
        return np.logical_or(accumulated, mask)

    return functools.reduce(_either, conditions)

# Notebook display: do not truncate the column list or the text inside a cell.
# ('display.max_rows' is left at its default; add it to the tuple below to
# show every row as well.)
for displayOption in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(displayOption, None)

# Load the comma-separated input table named by inputFileName.
# (An earlier version read 'bacteria_with_position_46_m7G.csv' instead.)
df = pd.read_csv(inputFileName, sep=',')

# Let's make our buckets

In [10]:
# Bucket the rows of df by isotype and sample each bucket separately.
# Rows whose Isotype value is not listed here are never sampled.
# NOTE(review): the original comment described these as bins, across all
# species, of isotypes positive for m7G at position 46 -- presumably a
# property of the input CSV; confirm.
isotypeNames = ["Ini",'Ala','Arg','Asn','Asp','Cys', 'Gln', 'Glu', 'Gly',
 'His', 'Ile', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser',
 'Thr', 'Trp', 'Tyr', 'Val']

# isotype name -> DataFrame of the rows sampled from that bucket
# (an empty DataFrame when nothing was drawn).
fullDataSet = {}

validationDataSet = [] # placeholder; rebuilt from df once testDataSet exists


for name in isotypeNames:
    # Select the bucket once and reuse it instead of re-filtering df.
    bucket = df[df.Isotype == name]
    poolToChooseFrom = bucket.shape[0]

    if poolToChooseFrom == 1:
        # A single-row bucket cannot be split by percentage: include the row
        # with a 50% chance instead.
        fullDataSet[name] = bucket if random.randint(0, 1) == 0 else pd.DataFrame()
    else:
        # Draw floor(testPercent * bucket size) rows without replacement.
        # An empty bucket yields n=0, i.e. an empty sample.
        numberOfSamples = math.floor(testPercent*poolToChooseFrom)
        fullDataSet[name] = bucket.sample(n=numberOfSamples, replace=False, random_state=randomSeed)

# Build the sampled set with ONE concat over the non-empty buckets.
# Growing a frame with pd.concat inside a loop copies all accumulated rows on
# every iteration, and concatenating empty frames can change column dtypes
# (recent pandas versions warn about it).
# NOTE(review): the columns are expected to include fullname, isotype,
# isodecoder, isoacceptor and unaligned_sequence (from the original comment);
# confirm against the input CSV.
nonEmptyBuckets = [frame for frame in fullDataSet.values() if not frame.empty]
if nonEmptyBuckets:
    testDataSet = pd.concat(nonEmptyBuckets, ignore_index=True, sort=False)
else:
    # Nothing was sampled: keep df's columns so the CSV still gets a header.
    testDataSet = df.iloc[0:0].copy()

# Positive_Set

In [11]:
# Write the sampled rows, without the index column, to "positive_<outputFileName>".
testDataSet.to_csv(f"positive_{outputFileName}", index=False)

# Negative_set

In [12]:
# Validation set = rows of df that do not appear in testDataSet.
# Stacking both frames and dropping every row that occurs more than once
# removes the sampled rows (they appear once in df and once in testDataSet).
# NOTE(review): keep=False also discards rows that are exact duplicates of
# each other inside df itself, even if they were never sampled.
validationDataSet = (
    pd.concat([df, testDataSet])
    .drop_duplicates(keep=False)
)
validationDataSet.to_csv(f"negative_{outputFileName}", index=False)

# =====
# =====
# =====
# No bucket method

In [41]:
########################################
########################################
# Raw (un-bucketed) split of the Lactococcus lactis rows:
# a fraction `testPercent` goes to testDataSet, the rest to validationDataSet
########################################
########################################
# NOTE: this rebinds df to the Lactococcus_lactis rows only, so any cell run
# after this one sees the filtered table rather than the full input.
lactoFilter = df['species_name'] == 'Lactococcus_lactis'
df = df[lactoFilter]


# Number of rows to put into the test dataset
poolToChooseFrom = df.shape[0]
numberOfSamples = math.floor(testPercent*poolToChooseFrom)

# Generate the test dataset by drawing that many rows, without replacement,
# from the input pool of tRNAs
testDataSet = df.sample(n=numberOfSamples, replace=False, random_state=randomSeed)

# Generate the validation dataset as (whole data - test data).
# sample() keeps the original row labels, so dropping those labels from df
# yields exactly the rows that were not sampled. Unlike
# concat + drop_duplicates(keep=False), this keeps rows that happen to be
# duplicated inside df. The earlier stray `pd.validationDataSet = ...`
# assignment, which set an attribute on the pandas module, is removed.
validationDataSet = df.drop(index=testDataSet.index)

In [38]:
# Save the sampled rows (no index column). This is the same path the bucketed
# split writes to, so it overwrites that file if both cells are run.
testDataSet.to_csv(f"positive_{outputFileName}", index=False)
# Display (rows, columns) of the sampled set as the cell output.
testDataSet.shape

In [39]:
# Save the held-out rows (no index column). This is the same path the bucketed
# split writes to, so it overwrites that file if both cells are run.
validationDataSet.to_csv(f"negative_{outputFileName}", index=False)
# Display (rows, columns) of the held-out set as the cell output.
validationDataSet.shape