In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
# The middle element of each os.walk tuple (the sub-directory names) is not needed here.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Combine the directory and file name into a single path before printing.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<span style="font-size:30px">**Data quality control and statistical analysis pipeline (Python)**</span>

# Data processing pipeline

This notebook contains a **simple yet complete data processing pipeline** that runs quality control on names and statistical analysis on biological measurements. It is written in **pure Python**. 

## Functions Overview

The pipeline consists of **three functions**:

### QC_name
Runs quality control on the names of individuals to which a given biological treatment was applied. 
A name is considered valid if it meets **all** of the following criteria:
- is of type string
- has a minimum length of 4 characters
- does not contain any space character
- contains at least one letter

### measure_analysis
Runs statistical analysis on biological measurements, **regardless of the validity of the names**. 
A value is considered valid if:
- it is of type 'int' or 'float'
- it is strictly positive

Valid values are counted, summed, and used to calculate the mean. **Invalid values are excluded from the statistics but counted separately**.

### analysis_pipeline
Orchestrates the first two functions and returns a dictionary containing:
- the results of the name QC
- the statistical analysis of the measurements

<span style="font-size:28px">**1) Name quality control**</span>

In [2]:
# No external libraries needed

def QC_name(sample):
    """
    Checks every name of the sample against the naming rules.
    A name passes when it is a string, has at least 4 characters, contains no
    space character and contains at least one letter.
    -----------
    Parameters:
    sample: a list of tuples.
    Each tuple is structured as follows: (name, condition, value)
    -----------
    Returns a dictionary with three keys:
    "valid" (count of valid names), "invalid" (count of invalid names)
    and "valid_names" (the valid names, in the order they were encountered).
    """

    accepted = []
    rejected = 0

    for candidate, _condition, _value in sample:
        # The type check comes first so that the string-only checks never run on non-strings.
        passes = (
            isinstance(candidate, str)
            and len(candidate) >= 4
            and " " not in candidate
            and any(char.isalpha() for char in candidate)
        )
        if passes:
            accepted.append(candidate)
        else:
            rejected += 1

    return {"valid": len(accepted),
            "invalid": rejected,
            "valid_names": accepted}

<span style="font-size:28px">**2) Measurement analysis**</span>

In [3]:
def measure_analysis(sample):
    """
    Runs group aggregation with error management. Statistics calculated per condition:
    - count of valid values
    - sum of valid values
    - mean (set to None if there is no valid value)
    A value is valid when it is an int or a float (booleans excluded) and is
    strictly positive. Invalid values are left out of the statistics and
    counted separately.
    -----------
    Parameters:

    sample: a list of tuples.
    Each tuple is structured as follows: (name, condition, value)
    A condition is a biological treatment applied to an individual.
    -----------
    Returns a dictionary indexed by condition. Each entry is a dictionary with
    the keys "count", "sum", "invalid_values" and "mean".
    """

    dict_analysis = {}

    for _, condition, value in sample:
        # Create the entry on first sight of a condition, then use one single
        # validation path for both new and already-known conditions.
        stats = dict_analysis.setdefault(
            condition, {"count": 0, "sum": 0, "invalid_values": 0}
        )

        # bool is a subclass of int, so True would otherwise be counted as the
        # measurement 1; it is rejected explicitly. None and strings fail the
        # isinstance check, and NaN fails the "> 0" comparison.
        is_number = isinstance(value, (int, float)) and not isinstance(value, bool)

        if is_number and value > 0:
            stats["count"] += 1
            stats["sum"] += value
        else:
            stats["invalid_values"] += 1

    for stats in dict_analysis.values():
        # Guard against division by zero when a condition has no valid value.
        stats["mean"] = stats["sum"] / stats["count"] if stats["count"] else None

    return dict_analysis

<span style="font-size:28px">**3) Pipeline orchestration**</span>

In [4]:
def analysis_pipeline(sample):
    """
    Entry point of the pipeline: applies the name quality control and the
    measurement statistical analysis to the same sample.
    -----------
    Parameters:
    sample: a list of tuples.
    Each tuple is structured as follows: (name, condition, value)
    -----------
    Returns a dictionary with two keys: "QC_names" holds the result of QC_name
    and "Measurements_analysis" holds the result of measure_analysis.
    """
    # The name QC runs first, then the measurement analysis, each on the full sample.
    name_report = QC_name(sample)
    measurement_report = measure_analysis(sample)

    return {"QC_names": name_report,
            "Measurements_analysis": measurement_report}

<span style="font-size:28px">**4) Test case**</span>

The following example illustrates how the pipeline can be applied to a dataset.

In [5]:
# Demo dataset: one (name, condition, value) tuple per individual.
# It deliberately mixes valid rows with invalid names and invalid values.
output = analysis_pipeline(
    [
        ("Lola", "ctrl", 3),
        ("Johnny", "ctrl", 3.8),
        ("3882K", "treatment", 5.6),
        ("Helen", "ctrl", 4.5),
        ("JDIZ2", "treatment", 6.2),
        ("Mike Scavo", "ctrl", 3.4),     # invalid name: contains a space
        (4, "treatment", 5.9),           # invalid name: not a string
        ("Emily", "ctrl", None),         # invalid value: None
        ("4682", "treatment", 4.0),      # invalid name: no letter
        ("Jimmy", "treatment", "NA"),    # invalid value: not a number
        ("Beth", "ctrl", -2.7),          # invalid value: not strictly positive
        ("Michael", "ctrl", 4.6),
        ("Rachel", "treatment", 6.1),
    ]
)

In [6]:
# Show the complete pipeline result: the name QC summary and the per-condition statistics.
print(output)

{'QC_names': {'valid': 10, 'invalid': 3, 'valid_names': ['Lola', 'Johnny', '3882K', 'Helen', 'JDIZ2', 'Emily', 'Jimmy', 'Beth', 'Michael', 'Rachel']}, 'Measurements_analysis': {'ctrl': {'count': 5, 'sum': 19.3, 'invalid_values': 2, 'mean': 3.8600000000000003}, 'treatment': {'count': 5, 'sum': 27.800000000000004, 'invalid_values': 1, 'mean': 5.5600000000000005}}}


<span style="font-size:28px">**5) Conclusion**</span>

This notebook demonstrates how a small, well-structured Python pipeline can be used to clean, validate, and analyze raw data without relying on external libraries. 

Possible extensions include:
- stricter validation rules
- additional statistical metrics
- integration with pandas or numpy for larger datasets
