In [1]:
import sys
from typing import List, Tuple

import numpy as np
import pandas as pd

In [2]:
def load_data(data_fname: str) -> pd.DataFrame:
    """
    Read a tab-separated text file into a DataFrame.

    :param data_fname: path of the tab-delimited file to read
    :return: the parsed table, as produced by ``pd.read_csv`` with ``sep='\\t'``
    """
    return pd.read_csv(data_fname, sep='\t')

In [3]:
def create_sub_rest(data: pd.DataFrame) -> pd.DataFrame:
    """
    Add a ``SubRest`` column for cells that belong to none of the named sub-populations.

    ``SubRest`` is ``Total`` minus every other column whose name starts with
    ``"Sub"`` (population A, population B, etc.).

    The input frame is not modified; a copy carrying the new column is returned.
    An already existing ``SubRest`` column is ignored and recomputed, so calling
    the function on its own output (e.g. when a notebook cell is re-run) gives
    the same values instead of subtracting the previous remainder a second time.

    :param data: table with a ``Total`` column and one ``Sub*`` column per population
    :return: copy of ``data`` with the ``SubRest`` column added (or replaced)
    :raises KeyError: if ``data`` has no ``Total`` column
    """
    # 'SubRest' itself matches the "Sub" prefix; it must not be subtracted from Total.
    sub_names = [
        column for column in data.columns
        if column.startswith("Sub") and column != 'SubRest'
    ]

    result = data.copy()
    rest = result['Total'].copy()
    for name in sub_names:
        rest = rest - result[name]
    result['SubRest'] = rest

    return result

In [4]:
def calculation_percentage(data: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
    """
    Calculate each population's share of ``Total`` in percent and record the new column names.

    For every column whose name starts with ``"Sub"`` a column named
    ``"<name>, %"`` is produced, holding ``column / Total * 100``.

    The input frame is not modified; a copy carrying the new columns is returned.
    Columns that already end in ``", %"`` are treated as earlier results and are
    not used as inputs, so calling the function on its own output recomputes the
    same columns instead of creating ``"<name>, %, %"``.

    :param data: table with a ``Total`` column and the ``Sub*`` population columns
    :return: ``(table_with_percentages, percentage_column_names)``; the names are
        listed in the order of the source columns
    :raises KeyError: if a ``Sub*`` column exists but ``data`` has no ``Total`` column
    """
    percent_suffix = ", %"
    # Percentage columns also start with "Sub"; keep only the raw count columns.
    population_columns = [
        column for column in data.columns
        if column.startswith("Sub") and not column.endswith(percent_suffix)
    ]

    result = data.copy()
    sub_names_with_percentage = []
    for column in population_columns:
        new_column = column + percent_suffix
        result[new_column] = result[column] / result['Total'] * 100
        sub_names_with_percentage.append(new_column)

    return result, sub_names_with_percentage

In [5]:
def coeff_var(x: pd.Series) -> float:
    """
    Coefficient of variation: sample standard deviation divided by the mean.

    The standard deviation uses ``ddof=1`` (n - 1 in the denominator). The
    result is a plain ratio, not a percentage.

    :param x: numeric values of one group
    :return: a single number, ``std(x, ddof=1) / mean(x)``
    """
    sample_std = np.std(x, ddof=1)
    mean_value = np.mean(x)

    return sample_std / mean_value

In [6]:
def calc_mean_and_coeff_var(data: pd.DataFrame, population_names: List[str]) -> pd.DataFrame:
    """
    Summarise the population percentages per sample.

    Rows are grouped by the ``Sample`` column; for each column listed in
    ``population_names`` the group mean and the coefficient of variation
    (``coeff_var``) are computed.

    :param data: table containing ``Sample`` and every column in ``population_names``
    :param population_names: names of the columns to aggregate
    :return: table indexed by ``Sample`` with one ``mean`` and one ``coeff_var``
        column per population
    """
    selected_columns = ['Sample'] + list(population_names)
    grouped_by_sample = data[selected_columns].groupby("Sample")

    return grouped_by_sample.agg(['mean', coeff_var])

In [7]:
def save_data(data: pd.DataFrame, file_name: str):
    """
    Write a table to a tab-separated text file.

    :param data: table to write (its index is written as well)
    :param file_name: destination path
    """
    data.to_csv(path_or_buf=file_name, sep='\t')
    
    

In [8]:
# Load the raw measurements and derive the per-population columns.
# Each stage is handed its own copy, so `data` stays exactly as loaded and the
# three names refer to three distinct tables, whether or not the helpers modify
# the frame they receive.
data_fname = "data.txt"
data = load_data(data_fname)
data_with_sub_rest = create_sub_rest(data.copy())
data_with_percentage, columns_names_with_percentage = calculation_percentage(data_with_sub_rest.copy())


In [9]:
# Per-sample mean and coefficient of variation of every percentage column.
result = calc_mean_and_coeff_var(
    data=data_with_percentage,
    population_names=columns_names_with_percentage,
)

In [11]:
# Persist the summary table as a tab-separated file.
save_data(data=result, file_name='results.txt')

In [12]:
# Display the per-sample summary: mean and coefficient of variation of each population percentage.
result

Unnamed: 0_level_0,"SubA, %","SubA, %","SubB, %","SubB, %","SubRest, %","SubRest, %"
Unnamed: 0_level_1,mean,coeff_var,mean,coeff_var,mean,coeff_var
Sample,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,11.789853,0.004608,0.301639,0.010319,87.908508,0.000597
2,14.800857,0.010406,1.021262,0.017799,84.177881,0.001626
3,30.129813,0.058809,0.4754,0.041448,69.394787,0.025402
5,41.893618,0.011468,4.240751,0.065922,53.86563,0.014076
7,17.300223,0.024744,0.718245,0.039373,81.981532,0.005484
8,32.56718,0.181855,13.00782,0.103842,54.424999,0.133493
9,7.07625,0.004074,0.3476,0.06192,92.57615,0.000284
10,10.882799,0.03044,1.098746,0.019707,88.018455,0.003718
13,19.991183,0.007296,0.53957,0.047047,79.469246,0.002108
14,25.037465,0.007691,4.605756,0.043019,70.35678,0.005412
