In [4]:
import pyPING_objects as ping
import pyPING_checks as check
import pyPING_supporting as support
import os

In [5]:
# --- Run configuration -------------------------------------------------------
# Directory that is searched for the raw sequence files.
sample_location = '../RAW_SEQUENCES/'

# Thread count handed to sample.kir_extract (bowtie2 step).
bowtie_threads = 8

# The sequence group is the final path component of the sample directory;
# normalising first drops the trailing slash so basename() is not empty.
_normalized_sample_location = os.path.normpath(sample_location)
sequence_group = os.path.basename(_normalized_sample_location)

# Left empty here; the extraction cell replaces it with the value returned by
# support.build_results_directory.
results_directory = ''

# Read-count cutoff (not referenced by the cells visible in this notebook section).
min_read_cutoff = 10000

In [6]:
# bowtie2 is required by the extraction step below, so verify it is reachable
# before doing any other work.
check.bowtie2()

# NOTE(review): results_directory is rebound from its own previous value, so
# re-running this cell without re-running the configuration cell passes the
# already-built directory back in — confirm build_results_directory tolerates that.
results_directory = support.build_results_directory(results_directory, sequence_group)

# Discover the sequence files, pair them up, and build one sample object per pair.
sample_name_list, sample_path_list = support.sample_finder(sample_location)
sample_dictionary = support.match_finder(sample_name_list, sample_path_list)
sample_list = support.sample_builder(sample_dictionary, sequence_group, results_directory)

# The KIR reference location is the same for every sample; normalise it once.
kir_reference_directory = os.path.normpath('KIR_sequences/')

for sample in sample_list:
    sample.kir_extract(kir_reference_directory, bowtie_threads)

    # Show the sample summary followed by its total read count.
    print(sample)
    print(sample.counts.read_count_total)

bowtie2 has been found and can be accessed.

Automatically finding paired sequences.

It took 0.0003161430358886719s to match 3 pairs.



KIR reads found for: IND03978_S75_L001_R

Name: IND03978_S75_L001_R
Group: RAW_SEQUENCES
Results Directory: Results/RAW_SEQUENCES_results
File Location: ../RAW_SEQUENCES
KIR extracted: True

139405.0


KIR reads found for: IND03977_S74_L001_R

Name: IND03977_S74_L001_R
Group: RAW_SEQUENCES
Results Directory: Results/RAW_SEQUENCES_results
File Location: ../RAW_SEQUENCES/IND03977-43208468
KIR extracted: True

138294.0


KIR reads found for: IND03975_S72_L001_R

Name: IND03975_S72_L001_R
Group: RAW_SEQUENCES
Results Directory: Results/RAW_SEQUENCES_results
File Location: ../RAW_SEQUENCES/Even_Deeper/IND03975-43224285
KIR extracted: True

77252.0


In [118]:
import pandas as pd
import numpy as np
import gzip
import os
import time

In [30]:
## Probe list location and the ratio threshold used for the presence calls
probelist_path = os.path.normpath('Reference/Probes/Probelist.csv')
kff_threshold = 0.2

## First row of the CSV is the header
probelist_table = pd.read_csv(probelist_path, header=0)

## Sample names become the row labels of the per-sample tables
sample_table_names = [current_sample.name for current_sample in sample_list]

## Match-count table: one row per sample, one column per probe name,
## every cell starting at 0.0 (float, as produced by np.zeros)
match_table_shape = (len(sample_list), len(probelist_table))
sample_probe_match_table = pd.DataFrame(
    np.zeros(match_table_shape),
    index=sample_table_names,
    columns=probelist_table.loc[:, 'Name'],
)

In [123]:
def _count_probe_hits(fastq_gz_path, probes):
    """Count, for each probe, the lines of a gzipped file that contain it.

    The file is decompressed and scanned exactly once; every line (not only
    sequence lines) is tested against every probe, which is the same matching
    rule as a per-probe scan of the whole file.

    Parameters
    ----------
    fastq_gz_path : str
        Path to the gzip-compressed file; opened in binary mode.
    probes : list of (str, bytes)
        (probe name, encoded probe sequence) pairs.

    Returns
    -------
    dict
        Probe name -> number of lines containing that probe. Probes sharing a
        name contribute to the same total.
    """
    hits = {name: 0 for name, sequence in probes}
    with gzip.open(os.path.normpath(fastq_gz_path), 'rb') as handle:
        for line in handle:
            for name, sequence in probes:
                if sequence in line:
                    hits[name] += 1
    return hits


start = time.time()

# Encode every probe once up front. Reading the columns directly also avoids
# feeding index labels to .iloc, which is only correct for a default RangeIndex.
probe_queries = [
    (probe_name, probe_sequence.encode())
    for probe_name, probe_sequence in zip(probelist_table['Name'], probelist_table['Sequence'])
]

for sample in sample_list:
    print("Performing probe matching for: " + sample.name + ".")

    # Accumulate both mates in plain Python before touching the DataFrame;
    # per-hit .loc updates are slow.
    sample_hits = {probe_name: 0 for probe_name, probe_bytes in probe_queries}
    for fastq_path in (sample.kir_path_fastq_1, sample.kir_path_fastq_2):
        for probe_name, hit_count in _count_probe_hits(fastq_path, probe_queries).items():
            sample_hits[probe_name] += hit_count

    # Assign rather than increment so that re-running this cell does not
    # double the counts already stored in the table.
    for probe_name, hit_count in sample_hits.items():
        sample_probe_match_table.loc[sample.name, probe_name] = float(hit_count)

    print("\n")

print("Finished with probe matching.")
end = time.time()
print(end - start)

Performing probe matching for: IND03978_S75_L001_R.


Performing probe matching for: IND03977_S74_L001_R.


Performing probe matching for: IND03975_S72_L001_R.


Finished with probe matching.
8834.737545013428


In [150]:
## Locus tags; a probe column belongs to a locus when its name contains the tag
kff_loci_list = [
    '>2DL1_', '>2DL23_', '>2DL4_', '>2DL5', '>2DP1_',
    '>2DS1_', '>2DS2_', '>2DS3_', '>2DS4_', '>2DS5_',
    '>3DL1_', '>3DL2_', '>3DL3_', '>3DP1_', '>3DS1_',
]


def _locus_probe_totals(locus_tag):
    """Return, per sample, the summed match counts of every probe column whose name contains locus_tag."""
    locus_columns = [name for name in sample_probe_match_table.columns if locus_tag in name]
    return sample_probe_match_table.loc[:, locus_columns].sum(axis=1)


## Ratio table: one row per sample, one column per locus, initialised to zero
ratio_table_shape = (len(sample_list), len(kff_loci_list))
sample_probe_ratio_table = pd.DataFrame(
    np.zeros(ratio_table_shape),
    index=sample_table_names,
    columns=kff_loci_list)

## Every locus total is divided by the 3DL3 total of the same sample
sample_probe_3DL3_sum_table = _locus_probe_totals('>3DL3_')

for current_locus in kff_loci_list:
    current_locus_sum_table = _locus_probe_totals(current_locus)
    sample_probe_ratio_table.loc[:, current_locus] = current_locus_sum_table / sample_probe_3DL3_sum_table

## A locus is flagged when its ratio is strictly above the threshold;
## multiplying by 1 turns the True/False flags into 1/0
sample_probe_bool_table = (sample_probe_ratio_table > kff_threshold) * 1

## Finished product
sample_probe_bool_table

Unnamed: 0,>2DL1_,>2DL23_,>2DL4_,>2DL5,>2DP1_,>2DS1_,>2DS2_,>2DS3_,>2DS4_,>2DS5_,>3DL1_,>3DL2_,>3DL3_,>3DP1_,>3DS1_
IND03978_S75_L001_R,0,1,1,1,0,0,1,0,0,1,1,1,1,1,1
IND03977_S74_L001_R,1,1,1,1,1,0,1,1,0,0,1,1,1,1,1
IND03975_S72_L001_R,1,1,1,0,1,0,1,0,1,0,1,1,1,0,1
