In [4]:
import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr   
from TCGA_code import match_computation as m


In [None]:
'''
The software takes in two input gene expression files and analyzes how the gene samples relate to each other.

The goal is to check if an (input) gene profile shows similiarity with known cancer gene profiles. 

1. Load in data
2. Pre-Process data
3. Compute expression level analysis

'''

In [9]:
# 1. Load in data from input files. The files are expected to be in the following format: 
# columns: [symbol, value]
# row: genes

file_name = "example_input_sample.csv"
#file_name = "kidney_cancer_GEP.csv"  # real data set
profile_df = m.read_expr_profile(file_name)

file_name = "example_profile.csv"
#file_name = "breast_cancer_GEP.csv"  # real data set
TCGA_df = m.read_TCGA_sample(file_name)

print(TCGA_df.iloc[:,1])
TCGA_df = m.normalize_profile(TCGA_df, "z-score")
print(TCGA_df.iloc[:,1])

0      2.5
1      9.0
2     10.0
3      2.0
4      2.0
5      3.0
6      4.0
7      3.0
8      2.0
9      9.0
10    10.0
11     5.0
12    20.0
13     5.0
14     8.0
15    12.0
16     4.0
17     1.0
18    41.0
19    12.0
20     6.0
21    11.0
22     6.5
23     2.0
24     2.0
25     5.0
26     6.0
27     8.0
28    23.0
29     4.0
30     5.0
31     2.0
32     2.0
33     9.0
34     3.0
35     4.0
36     3.0
37     2.0
Name: value, dtype: float64


AttributeError: module 'TCGA_code.match_computation' has no attribute 'normalize_profile'

In [None]:
# 2. Process the data - Make them be the same length by either removing missing information or adding 
#    zero expression values. These functions check both input data files both ways.

profile, sample, missing_TCGA = m.check_TCGA(profile_df, TCGA_df, add_missing = True, output = False)
profile, sample, missing_reference = m.check_profile(profile, sample, add_missing = True, output = False)


In [None]:
# 3. Compute the correlation

distance = m.compute_distance(profile, sample)

print("The input expression levels show a correlation value of:", distance, "when zero expression levels are added.")
print("Genes that are missing in the reference profile are:\n", missing_TCGA)
print("Genes that are missing in the TCGA profile are:\n", missing_reference)