In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload

%autoreload 2

In [None]:
# NOTE(review): the star import is presumably what brings `pd`, `np`, `ccal`
# and `path` into scope for the cells below — confirm against environment.py.
from environment import *

# Dataset-specific settings; exactly one of the two imports below should be
# active so that `setting` refers to the chosen dataset.
import setting_ccle as setting
# import setting_tcga as setting

# Built from the chosen settings; indexed below with '*_file_path' keys to get
# the input/output file locations.
path_dict = path(setting)

In [None]:
# Load the feature-wise and sample-wise 1D context matrices from their
# tab-delimited files, using the first column of each file as the index.
feature_1d_context_matrix = pd.read_table(path_dict['feature_context_matrix_file_path'], index_col=0)

sample_1d_context_matrix = pd.read_table(path_dict['sample_context_matrix_file_path'], index_col=0)

In [None]:
# Negative signal: values above 0 are clipped to 0 and the sign is flipped, so
# what remains is the magnitude of the negative context.
# (The sample context matrix is not added in; only the feature matrix is used.)
negative_signal_matrix = -feature_1d_context_matrix.clip(upper=0)
negative_signal_matrix.index = list(map('(-) {}'.format, negative_signal_matrix.index))

# Positive signal: values below 0 are clipped to 0.
positive_signal_matrix = feature_1d_context_matrix.clip(lower=0)
positive_signal_matrix.index = list(map('(+) {}'.format, positive_signal_matrix.index))

# Stack the '(-) ' rows on top of the '(+) ' rows and save the result as a
# tab-delimited file.
signal_matrix = pd.concat([negative_signal_matrix, positive_signal_matrix])

signal_matrix.to_csv(path_dict['signal_matrix_file_path'], sep='\t')

signal_matrix

In [None]:
# Optionally restrict both signal matrices to the features listed in the file
# at setting.FEATURES_FILE_PATH (skipped when that setting is None).
if setting.FEATURES_FILE_PATH is not None:

    # Select the first column explicitly instead of passing squeeze=True:
    # the `squeeze` keyword was deprecated and then removed from the pandas
    # readers, while `.iloc[:, 0]` works on every pandas version.
    features = pd.read_table(setting.FEATURES_FILE_PATH).iloc[:, 0].tolist()

    print('Read {} features.'.format(len(features)))

    # A set gives constant-time membership tests; the list made every lookup
    # linear in the number of features.
    feature_set = set(features)

    # Row labels are '(-) <feature>' / '(+) <feature>'; label[4:] drops that
    # 4-character prefix so the bare feature name is matched. A boolean mask
    # keeps each matching row exactly once, even if labels are duplicated.
    negative_signal_matrix = negative_signal_matrix.loc[
        [label[4:] in feature_set for label in negative_signal_matrix.index]
    ]

    print(negative_signal_matrix.shape)

    positive_signal_matrix = positive_signal_matrix.loc[
        [label[4:] in feature_set for label in positive_signal_matrix.index]
    ]

    print(positive_signal_matrix.shape)

In [None]:
def _top_features_per_column(matrix, n_top):
    """Return the union of the row labels holding each column's largest values.

    For every column of `matrix`, NA values are dropped, the remaining values
    are sorted in ascending order and the labels of the last `n_top` entries
    are collected. The result is a list of unique labels in arbitrary order.

    Iterating over the columns explicitly avoids DataFrame.apply, whose return
    shape for list-valued results depends on whether all lists have the same
    length.
    """

    top_features = set()

    for _, column in matrix.items():

        top_features.update(column.dropna().sort_values().iloc[-n_top:].index)

    return list(top_features)


# Choose the negative and positive features to keep, using one of three
# strategies picked by the settings.
if setting.SELECT_FEATURE_AUTOMATICALLY:

    # Keep the features whose summed signal across columns exceeds half of the
    # standard deviation of those sums.
    feature_negative_sum = negative_signal_matrix.sum(axis=1)

    selected_negative_features = feature_negative_sum.index[feature_negative_sum.std() / 2 < feature_negative_sum].tolist()

    feature_positive_sum = positive_signal_matrix.sum(axis=1)

    selected_positive_features = feature_positive_sum.index[feature_positive_sum.std() / 2 < feature_positive_sum].tolist()

    # Plot the sorted sums against their rank for both signs.
    ccal.plot_points(
        (
            'Negative Signal',
            'Positive Signal',
        ),
        (
            tuple(range(negative_signal_matrix.shape[0])),
            tuple(range(positive_signal_matrix.shape[0])),
        ),
        (
            feature_negative_sum.sort_values(),
            feature_positive_sum.sort_values(),
        ),
    )

elif setting.N_TOP_FEATURE is not None:

    # Keep every feature that is among the N_TOP_FEATURE largest values of at
    # least one column.
    selected_negative_features = _top_features_per_column(
        negative_signal_matrix,
        setting.N_TOP_FEATURE,
    )

    selected_positive_features = _top_features_per_column(
        positive_signal_matrix,
        setting.N_TOP_FEATURE,
    )

else:

    # No selection requested: keep all features.
    selected_negative_features = negative_signal_matrix.index.tolist()

    selected_positive_features = positive_signal_matrix.index.tolist()

print('Selected {} negative and {} positive features.'.format(
    len(selected_negative_features),
    len(selected_positive_features),
))

In [None]:
# Pick which sign(s) of the signal to keep, as requested by
# setting.SELECT_CONTEXT.
if setting.SELECT_CONTEXT == 'negative':

    selected_features = selected_negative_features

elif setting.SELECT_CONTEXT == 'positive':

    selected_features = selected_positive_features

elif setting.SELECT_CONTEXT == 'both':

    selected_features = selected_negative_features + selected_positive_features

else:

    # Fail loudly: without this branch an unexpected value would either raise
    # a NameError below or silently reuse `selected_features` left over from
    # an earlier run of this cell.
    raise ValueError(
        "setting.SELECT_CONTEXT must be 'negative', 'positive' or 'both'; got {!r}.".format(
            setting.SELECT_CONTEXT,
        )
    )

# Take the selected rows from the full (unfiltered) signal matrix.
selected_signal_matrix = signal_matrix.loc[selected_features]

In [None]:
from ccal import normalize_nd_array
from pandas import DataFrame


def normalize_signal_matrix(signal_matrix):
    """Return a normalized copy of `signal_matrix` with the same labels.

    The underlying array is passed to ccal's `normalize_nd_array` with the
    arguments `1` and `'0-1'` and with `raise_for_bad_value=False`; the result
    is wrapped in a new DataFrame that reuses the input's index and columns.

    NOTE(review): presumably `1` is the axis (normalize each row) and `'0-1'`
    selects 0-1 scaling — confirm against ccal.normalize_nd_array.
    """

    normalized_values = normalize_nd_array(
        signal_matrix.values,
        1,
        '0-1',
        raise_for_bad_value=False,
    )

    row_labels = signal_matrix.index

    column_labels = signal_matrix.columns

    return DataFrame(normalized_values, index=row_labels, columns=column_labels)

In [None]:
# NOTE(review): presumably drops the slices along axis 1 that have fewer than
# 2 unique non-NA values — confirm against ccal.drop_df_slice.
selected_signal_matrix = ccal.drop_df_slice(selected_signal_matrix, 1, min_n_not_na_unique_value=2)

# Normalize what is left.
selected_signal_matrix = normalize_signal_matrix(selected_signal_matrix)

# Save the normalized selection as a tab-delimited file.
selected_signal_matrix.to_csv(path_dict['selected_signal_matrix_file_path'], sep='\t')

selected_signal_matrix

In [None]:
# Draw the heat map only when the matrix has fewer than one million cells.
if selected_signal_matrix.size < 1e6:

    ccal.plot_heat_map(selected_signal_matrix, title='Selected Signal', xaxis_title='Sample', yaxis_title='Feature')
    
# Flatten the matrix into a single Series of all its values.
values = selected_signal_matrix.unstack()

print('{:,} values'.format(values.size))

# Discard missing values ...
not_na_values = values.dropna()

print('{:,} not-NA values'.format(not_na_values.size))

# ... and then the exact zeros.
not_na_or_0_values = not_na_values.loc[not_na_values.ne(0)]

print('{:,} not-NA-or-0 values'.format(not_na_or_0_values.size))

# Compare the distribution of the values with and without the zeros.
ccal.plot_distributions(
    ('Not-NA', 'Not-NA-or-0'),
    (not_na_values, not_na_or_0_values),
    plot_rug=False,
    title='Signal Matrix Value Distribution',
)

def _strip_signal_prefix(label, prefixes):
    """Return `label` without the first of `prefixes` it starts with, if any.

    Only an exact leading prefix is removed. str.lstrip is not suitable here
    because it strips any run of the given characters, which would also eat
    the beginning of names that start with '(', '-', '+', ')' or a space.
    """

    for prefix in prefixes:

        if label.startswith(prefix):

            return label[len(prefix):]

    return label


# For each axis, plot the summed selected signal of every element against its
# rank and highlight the elements listed in the corresponding *_TO_PEEK
# setting.
for element in (
    'feature',
    'sample',
):

    if element == 'feature':

        df = selected_signal_matrix

        to_peek = setting.FEATURES_TO_PEEK

        # Feature labels were built as '(-) <feature>' / '(+) <feature>'.
        label_prefixes = (
            '(-) ',
            '(+) ',
        )

    elif element == 'sample':

        df = selected_signal_matrix.T

        to_peek = setting.SAMPLES_TO_PEEK

        # Sample labels carry no prefix, so they are compared as they are.
        label_prefixes = ()

    # NOTE(review): this table is read but not used anywhere in the visible
    # code; it is kept in case later cells rely on it.
    skew_t_pdf_fit_parameter = pd.read_table(
        path_dict['{}_skew_t_pdf_fit_parameter_file_path'.format(element)],
        index_col=0,
    )

    # Sum across the other axis and sort ascending, so position equals rank.
    signal_summary = df.sum(axis=1).sort_values()

    if to_peek is None:

        ranks = []

    else:

        # Positions (ranks) of the elements whose name, without the sign
        # prefix, equals one of the names to peek.
        ranks = np.nonzero([
            any(peek == _strip_signal_prefix(index, label_prefixes) for peek in to_peek)
            for index in signal_summary.index
        ])[0]

    title = 'Selected Signal Sum ({}) '.format(element)

    ccal.plot_points(
        (
            'All',
            'To Peek',
        ),
        (
            tuple(range(signal_summary.size)),
            ranks,
        ),
        (
            signal_summary,
            # `ranks` holds positions, so index by position explicitly rather
            # than relying on Series.__getitem__ guessing between labels and
            # positions.
            signal_summary.iloc[ranks],
        ),
        texts=(
            signal_summary.index,
            signal_summary.index[ranks],
        ),
        modes=(
            'markers',
            'markers+text',
        ),
        title=title,
        xaxis_title='Rank',
        yaxis_title=title,
    )