In [10]:
import pandas as pd

def test_column_names_cleaned():
    # Load the dataset
    df = pd.read_csv('../data/raw/HGG_DB.csv')
    
    # Clean column names
    df.columns = df.columns.str.strip() \
                            .str.lower() \
                            .str.replace(' ', '_') \
                            .str.replace('[^a-z0-9_]', '', regex=True)
    
    # Run assertions for all columns
    for col in df.columns:
        assert ' ' not in col and col.islower(), f"Column '{col}' not cleaned properly."

    # Print this only if all assertions pass
    print("All column names cleaned successfully!")

# Run the check as soon as this cell executes: a failed assertion inside the
# function raises AssertionError here; otherwise its success message is printed.
test_column_names_cleaned()



def test_missing_values_filled():
    df = pd.read_csv('../data/raw/HGG_DB.csv')
    df.fillna('None', inplace=True)
    assert df.isnull().sum().sum() == 0, "Not all missing values were replaced."


All column names cleaned successfully!


In [12]:
import pandas as pd
import numpy as np

def test_fill_missing_values_with_none():
    """
    Test to ensure all missing values in the dataset are replaced with 'None'.
    """
    # Load the dataset
    df = pd.read_csv('../data/raw/HGG_DB.csv')
    
    # Simulate the actual replacement process from your code
    df_filled = df.fillna('None')  # Replace missing values with 'None'

    # Check if there are still any missing values
    remaining_missing = df_filled.isnull().sum().sum()  # Count total missing values
    
    # Assert: No missing values should remain
    assert remaining_missing == 0, f"There are still {remaining_missing} missing values in the dataset."
    
    # Check that all previously missing values are filled with 'None'
    missing_replaced_correctly = (df_filled == 'None').sum().sum() > 0
    assert missing_replaced_correctly, "Missing values were not replaced with 'None' as expected."
    
    # Print success message and a sample of the dataset
    print("All missing values successfully replaced with 'None'.")
    print("\nSample of data after filling missing values:")
    print(df_filled.head())

# Run the check as soon as this cell executes: a failed assertion inside the
# function raises AssertionError here; otherwise it prints a confirmation
# followed by the first rows of the filled frame.
test_fill_missing_values_with_none()


All missing values successfully replaced with 'None'.

Sample of data after filling missing values:
       Sample DIPG/NBS-HGG Location Tumor Grade <3 yrs  ACVR1  Histone3  \
0  SJHGG059_A         DIPG  midline          IV     No  G328V      None   
1  SJHGG064_A         DIPG  midline          IV     No  G328W      None   
2  SJHGG071_A         DIPG  midline        None     No  G328E      None   
3  SJHGG005_A         DIPG  midline          IV     No  G328V      None   
4  SJHGG106_A         DIPG  midline          IV     No  R258G  H3.1K27M   

     ATRX  BCOR BCORL1  ... NTRK1/2/3  EGFR  PIK3CA       PIK3R1  BRAF   NF1  \
0    None  None   None  ...      None  None   E545K  T576_R577>R  None  None   
1    None  None   None  ...      None  None  H1047R         None  None  None   
2  T1610R  None   None  ...      None  None   E545K         None  None  None   
3    None  None   None  ...      None  None    None         None  None  None   
4    None  None   None  ...      None  None  H104