# Import libs

In [1]:
import pandas as pd
from pathlib import Path

# Define Paths

In [2]:
# Location of the example CSV, written relative to the current working
# directory and then resolved to an absolute path.
# NOTE(review): the spelling of the folder/file names ("funtions", "widnow")
# presumably mirrors the names on disk - confirm before "correcting" it.
relative_csv_path = Path("./window_funtions_data/widnow_funtions_example_data.csv")
data_path = relative_csv_path.resolve()

# Load data

In [3]:
# Read the example measurements from the CSV file into a DataFrame.
data = pd.read_csv(data_path)

# Preview three randomly chosen rows. The fixed seed keeps the displayed
# sample identical across "Restart Kernel -> Run All" executions.
data.sample(3, random_state=0)

Unnamed: 0,sample_code,serie,# Nucleus,tissue type,# foci in nucleus,ki_67_subclass
18,s3,0,2,ovarian,30,non_replicating
3,s1,1,1,lung,0,high
0,s1,0,1,lung,10,high


# Example use cases

## I) H-score (per tissue type)
The H-score is a reliable metric calculated within the target region as follows (each percentage is taken over all cells in the population):

$$ \text{H-score} = (1 \times \text{percentage of weak staining}) + (2 \times \text{percentage of moderate staining}) + (3 \times \text{percentage of strong staining}) $$

The resulting score ranges from 0 to 300.

### 1) calculate ki67 subclass percent

In [4]:
partition_by = "tissue type"
feature_column = "ki_67_subclass"


def _share_of(label):
    """Return a reducer giving the percentage of a group's entries equal to `label`."""
    return lambda x: (x == label).sum() / x.size * 100


# Output column name -> ki67 subclass label counted for that column.
label_by_column = {
    "percent_of_weak": "low",
    "percent_of_moderate": "mid",
    "percent_of_strong": "high",
}

# Window over the subclass column, partitioned by tissue type; `transform`
# writes each group's percentage back onto every row of that group.
subclass_per_group = data.groupby(partition_by)[feature_column]
data = data.assign(
    **{
        column: subclass_per_group.transform(_share_of(label))
        for column, label in label_by_column.items()
    }
)
data.head(10)

Unnamed: 0,sample_code,serie,# Nucleus,tissue type,# foci in nucleus,ki_67_subclass,percent_of_weak,percent_of_moderate,percent_of_strong
0,s1,0,1,lung,10,high,42.857143,0.0,28.571429
1,s1,0,2,lung,50,low,42.857143,0.0,28.571429
2,s1,0,3,lung,5,non_replicating,42.857143,0.0,28.571429
3,s1,1,1,lung,0,high,42.857143,0.0,28.571429
4,s1,1,2,lung,1,low,42.857143,0.0,28.571429
5,s1,1,3,lung,10,low,42.857143,0.0,28.571429
6,s1,1,4,lung,20,non_replicating,42.857143,0.0,28.571429
7,s2,0,1,ovarian,1,mid,26.666667,33.333333,6.666667
8,s2,0,2,ovarian,44,mid,26.666667,33.333333,6.666667
9,s2,0,5,ovarian,12,non_replicating,26.666667,33.333333,6.666667


### 2) calculate H-score 

In [5]:
# Weighted staining terms of the H-score: weak x1, moderate x2, strong x3.
weak_term = 1 * data["percent_of_weak"]
moderate_term = 2 * data["percent_of_moderate"]
strong_term = 3 * data["percent_of_strong"]

# The percentages are per tissue type, so every row of a tissue type
# receives the same H-score.
data["H-score (tissue type)"] = weak_term + moderate_term + strong_term
data.head(10)

Unnamed: 0,sample_code,serie,# Nucleus,tissue type,# foci in nucleus,ki_67_subclass,percent_of_weak,percent_of_moderate,percent_of_strong,H-score (tissue type)
0,s1,0,1,lung,10,high,42.857143,0.0,28.571429,128.571429
1,s1,0,2,lung,50,low,42.857143,0.0,28.571429,128.571429
2,s1,0,3,lung,5,non_replicating,42.857143,0.0,28.571429,128.571429
3,s1,1,1,lung,0,high,42.857143,0.0,28.571429,128.571429
4,s1,1,2,lung,1,low,42.857143,0.0,28.571429,128.571429
5,s1,1,3,lung,10,low,42.857143,0.0,28.571429,128.571429
6,s1,1,4,lung,20,non_replicating,42.857143,0.0,28.571429,128.571429
7,s2,0,1,ovarian,1,mid,26.666667,33.333333,6.666667,113.333333
8,s2,0,2,ovarian,44,mid,26.666667,33.333333,6.666667,113.333333
9,s2,0,5,ovarian,12,non_replicating,26.666667,33.333333,6.666667,113.333333


## II) TopN mean per tissue type

In [7]:
partition_by = "tissue type"
feature_column = "# foci in nucleus"
n = 3  # how many of the highest foci counts per group enter the mean

# For each tissue type, average the `n` largest foci counts and write that
# value back onto every row of the group. The window size comes from `n`
# (previously hard-coded as 3), so the result always matches the
# `top_{n}_foci_mean` column name when `n` is changed.
data[f"top_{n}_foci_mean"] = (
    data.groupby(partition_by)[feature_column]
    .transform(lambda x: x.nlargest(n).mean())
)
data.head(10)

Unnamed: 0,sample_code,serie,# Nucleus,tissue type,# foci in nucleus,ki_67_subclass,percent_of_weak,percent_of_moderate,percent_of_strong,H-score (tissue type),top_3_foci_mean
0,s1,0,1,lung,10,high,42.857143,0.0,28.571429,128.571429,26.666667
1,s1,0,2,lung,50,low,42.857143,0.0,28.571429,128.571429,26.666667
2,s1,0,3,lung,5,non_replicating,42.857143,0.0,28.571429,128.571429,26.666667
3,s1,1,1,lung,0,high,42.857143,0.0,28.571429,128.571429,26.666667
4,s1,1,2,lung,1,low,42.857143,0.0,28.571429,128.571429,26.666667
5,s1,1,3,lung,10,low,42.857143,0.0,28.571429,128.571429,26.666667
6,s1,1,4,lung,20,non_replicating,42.857143,0.0,28.571429,128.571429,26.666667
7,s2,0,1,ovarian,1,mid,26.666667,33.333333,6.666667,113.333333,63.333333
8,s2,0,2,ovarian,44,mid,26.666667,33.333333,6.666667,113.333333,63.333333
9,s2,0,5,ovarian,12,non_replicating,26.666667,33.333333,6.666667,113.333333,63.333333
