In [1]:
%%html
<style>
/* Hide every element rendered with the output_stderr class. */
div.output_stderr {
    display: none;
}
</style>
<a id='top'></a>

# Operation of parameter-based functions
* Documentation for the `*.yml` files and run_parameters functions in `../src/mini_pipelines_toolbox.py`.

### source code link:
##### (private) source repository:  https://github.com/dlanier/minipipelines.git


### Spreadsheet Transformations:
1. [Subset Expression Based on Phenotype](#select_subtype)
2. [Intersection](#get_common_samples)
3. [Subset Genes](#select_genes)
4. [Union](#merge)
5. [Group Then Apply a Function](#cluster_stats)


### Basic Transformations:
1. [Transpose](#transpose)
2. [Numerical Transformation](#numerical_transformation)
3. [Numerical Details](#stats)


In [2]:
import os
import sys

import pandas as pd
import knpackage.toolbox as kn

import knspreadsheetstransformation.spreadsheets_transformation_pipelines as stp

# Input spreadsheets and run files, located relative to the notebook's working directory.
data_dir = '../data/spreadsheets'
run_data_dir = '../data/run_files'
# Resolved to an absolute path once, here, from the working directory in effect when this cell runs.
results_dir = os.path.join(os.path.abspath('../test'), 'run_dir/results')

if not os.path.isdir(results_dir):
    # Create first and report afterwards, so the message is printed only once the directory exists.
    # exist_ok tolerates the directory appearing between the check above and this call.
    os.makedirs(results_dir, exist_ok=True)
    print('Created witout overwrite:', results_dir)
    
def clear_results(directory=None):
    """Delete every regular file directly inside a results directory.

    Sub-directories, and anything inside them, are left untouched.

    Args:
        directory: path of the directory to clean. When omitted, the
            notebook-level ``results_dir`` is used.

    Raises:
        OSError: if the directory cannot be listed or a file cannot be removed.
    """
    target_dir = results_dir if directory is None else directory
    for file_name in os.listdir(target_dir):
        file_path = os.path.join(target_dir, file_name)
        # isfile() is False for directories, so nested folders survive the cleanup.
        if os.path.isfile(file_path):
            os.remove(file_path)

In [3]:
# os.listdir returns names in arbitrary order; sort so the displayed listing is reproducible.
sorted(os.listdir('../test/run_dir/'))

['results']

In [4]:
# os.listdir returns names in arbitrary order; sort so the displayed listing is reproducible.
# The key ignores case, so mixed-case names are ordered alphabetically rather than by character code.
sorted(os.listdir(data_dir), key=str.lower)

['gene_samples_1.tsv',
 'gene_samples_1_list.txt',
 'gene_samples_small.tsv',
 'gene_samples_small_labels.tsv',
 'spreadsheet_A_.G.tsv',
 'spreadsheet_B_.G.tsv',
 'spreadsheet_One.txt',
 'spreadsheet_Two.txt',
 'tcga_ucec_somatic_mutation_data.df',
 'transform_5_spreadsheet.tsv',
 'UCEC_phenotype.txt']

# Spreadsheet Transformations

<a id='select_subtype'></a>

## Subset Expression Based on Phenotype [[top]](#top)
### run_select_subtype_df(run_parameters)
    TEST_6_select_categorical.yml
    From a genes x samples spreadsheet and a samples x phenotypes spreadsheet, return both spreadsheets
    restricted to the samples that belong to one category of a phenotype, and write them to the specified output files.

    
#### Required Keys:
    method:                                   select_subtype_df

    spreadsheet_file_name:                    ../data/spreadsheets/transform_5_spreadsheet.tsv
    phenotype_file_name:                      ../data/spreadsheets/spreadsheet_Two.txt
    phenotype_id:                             "grade"
    select_category:                          "grade 3"

    results_directory:                        ./run_dir/results

In [6]:
# Remove files left in the results directory by earlier runs.
clear_results()

# Read the run file, then redirect its output to the notebook's results directory.
run_file = 'TEST_6_select_categorical.yml'
run_parameters = kn.get_run_parameters(run_data_dir, run_file)
run_parameters['results_directory'] = results_dir

print('Run Parameters:')
for key, value in run_parameters.items():
    print('%30s: %s' % (key, value))

stp.run_select_subtype_df(run_parameters)

# os.listdir returns names in arbitrary order; sort so the displayed listing is reproducible.
print('Result Files:')
sorted(os.listdir(results_dir))

Run Parameters:
                        method: select_subtype_df
           phenotype_file_name: ../data/spreadsheets/spreadsheet_Two.txt
             results_directory: /Users/mojo/git_clone/dlanier/minipipelines/test/run_dir/results
                 run_directory: ../data/run_files
         spreadsheet_file_name: ../data/spreadsheets/transform_5_spreadsheet.tsv
                  phenotype_id: grade
               select_category: grade 3
                      run_file: TEST_6_select_categorical.yml
Result Files:


['spreadsheet_Two_phenotype_category_Mon_11_Sep_2017_08_16_52.398237943.tsv',
 'transform_5_spreadsheet_phenotype_category_Mon_11_Sep_2017_08_16_52.397414922.tsv']

<a id='get_common_samples'></a>

## Intersection [[top]](#top)
### run_common_samples_df(run_parameters)
    TEST_2_common_samples.yml
    Find the sample names common to two spreadsheets and write the results to the specified output files.

#### Required Keys:
    method:                     common_samples_df

    spreadsheet_1_file_name:    ../data/spreadsheets/spreadsheet_One.txt
    spreadsheet_2_file_name:    ../data/spreadsheets/spreadsheet_Two.txt

    results_directory:          ./run_dir/results

In [None]:
# Remove files left in the results directory by earlier runs.
clear_results()

# Read the run file, then redirect its output to the notebook's results directory.
run_file = 'TEST_2_common_samples.yml'
run_parameters = kn.get_run_parameters(run_data_dir, run_file)
run_parameters['results_directory'] = results_dir

print('Run Parameters:')
for key, value in run_parameters.items():
    print('%30s: %s' % (key, value))

stp.run_common_samples_df(run_parameters)

# os.listdir returns names in arbitrary order; sort so the displayed listing is reproducible.
print('Result Files:')
sorted(os.listdir(results_dir))

<a id="select_genes"></a>

## Subset Genes [[top]](#top)
### run_select_genes(run_parameters)
    TEST_4_select_genes.yml
    Spreadsheet with only those genes selected from an input list.

#### Required Keys:
    method:                 select_genes_df

    spreadsheet_file_name:  ../data/spreadsheets/gene_samples_1.tsv
    gene_list_file_name:    ../data/spreadsheets/gene_samples_1_list.txt

    results_directory:      ./run_dir/results

In [None]:
# Remove files left in the results directory by earlier runs.
clear_results()

# Read the run file, then redirect its output to the notebook's results directory.
run_file = 'TEST_4_select_genes.yml'
run_parameters = kn.get_run_parameters(run_data_dir, run_file)
run_parameters['results_directory'] = results_dir

print('Run Parameters:')
for key, value in run_parameters.items():
    print('%30s: %s' % (key, value))

stp.run_select_genes(run_parameters)

# os.listdir returns names in arbitrary order; sort so the displayed listing is reproducible.
print('Result Files:')
sorted(os.listdir(results_dir))

<a id='merge'></a>

## Union [[top]](#top)
### run_merge_df(run_parameters)
    TEST_3_merge.yml
    Combine two spreadsheets into one containing all samples and phenotypes, and write it to the specified output file.

#### Required Keys:
    method:                     merge_df

    spreadsheet_1_file_name:    ../data/spreadsheets/spreadsheet_One.txt
    spreadsheet_2_file_name:    ../data/spreadsheets/spreadsheet_Two.txt

    results_directory:          ./run_dir/results


In [None]:
# Remove files left in the results directory by earlier runs.
clear_results()

# Read the run file, then redirect its output to the notebook's results directory.
run_file = 'TEST_3_merge.yml'
run_parameters = kn.get_run_parameters(run_data_dir, run_file)
run_parameters['results_directory'] = results_dir

print('Run Parameters:')
for key, value in run_parameters.items():
    print('%30s: %s' % (key, value))

stp.run_merge_df(run_parameters)

# os.listdir returns names in arbitrary order; sort so the displayed listing is reproducible.
print('Result Files:')
sorted(os.listdir(results_dir))

<a id='cluster_stats'></a>

## Group Then Apply a Function [[top]](#top)
### run_cluster_statistics_df(run_parameters)
    TEST_5_cluster_averages.yml
    Dataframe of per-category statistics (the centroid_statistic: std, median or mean) for a genes x samples dataframe with a samples classification dictionary.

#### Required Keys:
    method:                       cluster_statistics_df

    centroid_statistic:           'median' # ['std', 'median', 'mean']

    spreadsheet_file_name:     ../data/spreadsheets/gene_samples_small.tsv
    sample_labels_file_name:   ../data/spreadsheets/gene_samples_small_labels.tsv

    results_directory:            ./run_dir/results

In [None]:
# Remove files left in the results directory by earlier runs.
clear_results()

# Read the run file, then redirect its output to the notebook's results directory.
run_file = 'TEST_5_cluster_averages.yml'
run_parameters = kn.get_run_parameters(run_data_dir, run_file)
run_parameters['results_directory'] = results_dir

print('Run Parameters:')
for key, value in run_parameters.items():
    print('%30s: %s' % (key, value))

stp.run_cluster_statistics_df(run_parameters)

# os.listdir returns names in arbitrary order; sort so the displayed listing is reproducible.
print('Result Files:')
sorted(os.listdir(results_dir))

# Basic Transformations

<a id='transpose'></a>

## Transpose [[top]](#top)
### run_transpose(run_parameters)
    TEST_1_transpose.yml
    Transpose a single spreadsheet and write it to the specified output file.

#### Required Keys:
    method:                      transpose_df

    spreadsheet_name_full_path:  ../data/spreadsheets/spreadsheet_One.txt
    results_directory:           ./run_dir/results

In [None]:
# Remove files left in the results directory by earlier runs.
clear_results()

# Read the run file, then redirect its output to the notebook's results directory.
run_file = 'TEST_1_transpose.yml'
run_parameters = kn.get_run_parameters(run_data_dir, run_file)
run_parameters['results_directory'] = results_dir

print('Run Parameters:')
for key, value in run_parameters.items():
    print('%30s: %s' % (key, value))

stp.run_transpose(run_parameters)

# os.listdir returns names in arbitrary order; sort so the displayed listing is reproducible.
print('Result Files:')
sorted(os.listdir(results_dir))

<a id='numerical_transformation'></a>

## Numerical Transformation [[top]](#top)
### run_spreadsheet_numerical_transform(run_parameters)
    TEST_7_numerical_transform.yml

#### Required Keys:
    method:                      numeric_transform

    spreadsheet_name_full_path:  ../data/spreadsheets/spreadsheet_A_.G.tsv
    results_directory:           ./run_dir/results

    numeric_function:            abs # [abs, z_transform, log_transform, threshold]
    
#### Method Specific Keys:
    z_transform_axis:             1
    z_transform_ddof:             0

    log_transform_log_base:       e   # e for natural log or a positive number
    log_transform_log_offset:     0

    threshold_cut_off:            0.5
    threshold_substitution_value: 0
    threshold_scope:              SUB_BELOW # [SUB_BELOW, SUB_ABOVE]

In [None]:
# Remove files left in the results directory by earlier runs.
clear_results()

# Read the run file, then redirect its output to the notebook's results directory.
run_file = 'TEST_7_numerical_transform.yml'
run_parameters = kn.get_run_parameters(run_data_dir, run_file)
run_parameters['results_directory'] = results_dir

print('Run Parameters:')
for key, value in run_parameters.items():
    print('%30s: %s' % (key, value))

stp.run_spreadsheet_numerical_transform(run_parameters)

# os.listdir returns names in arbitrary order; sort so the displayed listing is reproducible.
print('Result Files:')
sorted(os.listdir(results_dir))

<a id='stats'></a>

## Numerical Details [[top]](#top)
### run_stats_df(run_parameters)
    TEST_8_stat_value.yml

#### Required Keys:
    method:                 stats_df

    spreadsheet_file_name:  ../data/spreadsheets/gene_samples_1.tsv

    stats_function:          sum     # ['min', 'max', 'mean', 'median', 'variation', 'std_deviation', 'sum']
    direction_reference:     columns # ['columns', 'rows', 'all']

    results_directory:      ./run_dir/results

In [None]:
# Remove files left in the results directory by earlier runs.
clear_results()

# Read the run file, then redirect its output to the notebook's results directory.
run_file = 'TEST_8_stat_value.yml'
run_parameters = kn.get_run_parameters(run_data_dir, run_file)
run_parameters['results_directory'] = results_dir

print('Run Parameters:')
for key, value in run_parameters.items():
    print('%30s: %s' % (key, value))

stp.run_stats_df(run_parameters)

# os.listdir returns names in arbitrary order; sort so the displayed listing is reproducible.
print('Result Files:')
sorted(os.listdir(results_dir))