In [2]:
#Cell 1: Import libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from scipy import stats


In [None]:
#Cell 2: Load CTG dataset 
files = os.listdir()
candidates = [f for f in files if f.lower().startswith("ctg") and f.lower().endswith((".xls", ".xlsx", ".csv"))]

if not candidates:
    raise FileNotFoundError("No CTG dataset found. Place CTG.xls / CTG.xlsx / CTG.csv in this folder.")

file_name = candidates[0]
print(" Found dataset file:", file_name)

if file_name.lower().endswith(".csv"):
    df = pd.read_csv(file_name)
    print(" Loaded CSV file.")
else:
    xls = pd.ExcelFile(file_name)
    print("Available sheets:", xls.sheet_names)
    sheet_name = "Raw Data" if "Raw Data" in xls.sheet_names else (
        "Data" if "Data" in xls.sheet_names else xls.sheet_names[0]
    )
    df = pd.read_excel(file_name, sheet_name=sheet_name)
    print(" Loaded sheet:", sheet_name)

print("Shape:", df.shape)
print("Columns:", df.columns[:10].tolist())
df.head()


 Found dataset file: ctg_clean.xlsx
Available sheets: ['ctg_clean']
 Loaded sheet: ctg_clean
Shape: (2128, 1)
Columns: ['FileName,Date,SegFile,b,e,LBE,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,DS,DP,DR,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP']


Unnamed: 0,"FileName,Date,SegFile,b,e,LBE,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,DS,DP,DR,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP"
0,"Variab10.txt,1996-12-01,CTG0001.txt,240.0,357...."
1,"Fmcs_1.txt,1996-05-03,CTG0002.txt,5.0,632.0,13..."
2,"Fmcs_1.txt,1996-05-03,CTG0003.txt,177.0,779.0,..."
3,"Fmcs_1.txt,1996-05-03,CTG0004.txt,411.0,1192.0..."
4,"Fmcs_1.txt,1996-05-03,CTG0005.txt,533.0,1147.0..."


In [4]:
# Cell 3: Tidy the dataset
def tidy_sheet(df: pd.DataFrame) -> pd.DataFrame:
    cleaned = df.copy()
    cleaned.columns = [str(col).strip() for col in cleaned.columns]
    cleaned = cleaned.dropna(axis=0, how="all").dropna(axis=1, how="all")
    cleaned = cleaned.loc[:, ~cleaned.columns.str.contains("^Unnamed", case=False)]
    cleaned = cleaned.loc[:, ~cleaned.columns.duplicated()]
    return cleaned

sheet2 = tidy_sheet(df)
print("Shape after tidying:", sheet2.shape)
print("Columns:", sheet2.columns.tolist()[:20])
sheet2.head()


Shape after tidying: (2128, 1)
Columns: ['FileName,Date,SegFile,b,e,LBE,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,DS,DP,DR,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP']


Unnamed: 0,"FileName,Date,SegFile,b,e,LBE,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,DS,DP,DR,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP"
0,"Variab10.txt,1996-12-01,CTG0001.txt,240.0,357...."
1,"Fmcs_1.txt,1996-05-03,CTG0002.txt,5.0,632.0,13..."
2,"Fmcs_1.txt,1996-05-03,CTG0003.txt,177.0,779.0,..."
3,"Fmcs_1.txt,1996-05-03,CTG0004.txt,411.0,1192.0..."
4,"Fmcs_1.txt,1996-05-03,CTG0005.txt,533.0,1147.0..."


In [5]:
# Cell 4: Missing Values Check & Imputation Demo 
missing_summary = sheet2.isna().sum().sort_values(ascending=False)
print("=== Missing Value Summary ===")
print(missing_summary[missing_summary > 0] if missing_summary.sum() > 0 else "✅ No missing values detected.")

# Demo: introduce NaNs into ASTV (or first numeric column), then impute
demo_feature = 'ASTV' if 'ASTV' in sheet2.columns else sheet2.select_dtypes(float).columns[0]
demo_df = sheet2[[demo_feature]].copy()
rng = np.random.default_rng(42)
mask_indices = rng.choice(demo_df.index, size=5, replace=False)
demo_df.loc[mask_indices, demo_feature] = np.nan

before_impute = demo_df.loc[mask_indices]
demo_df[demo_feature] = demo_df[demo_feature].fillna(demo_df[demo_feature].median())
after_impute = demo_df.loc[mask_indices]

pd.DataFrame({
    'Index': mask_indices,
    'Before': before_impute.squeeze().values,
    'After': after_impute.squeeze().values
})


=== Missing Value Summary ===
✅ No missing values detected.


IndexError: index 0 is out of bounds for axis 0 with size 0

In [6]:
# Cell 5: Outlier Detection (Boxplot + Z-Scores)
available_features = {'physiological': sheet2.select_dtypes(float).columns.tolist()}
outlier_feature = 'DP' if 'DP' in sheet2.columns else available_features['physiological'][0]

fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x=sheet2[outlier_feature], ax=ax)
ax.set_title(f'Outlier Check: {outlier_feature}')
plt.tight_layout()
plt.show()

dp_scores = stats.zscore(sheet2[outlier_feature].dropna())
outlier_mask = np.abs(dp_scores) > 3
outlier_indices = sheet2[outlier_feature].dropna().index[outlier_mask]

print("Outlier samples:")
display(sheet2.loc[outlier_indices, [outlier_feature]].head())
print("Value counts:")
print(sheet2[outlier_feature].value_counts().head())


IndexError: list index out of range

In [7]:
# Cell 6: Duplicate Rows Check
duplicate_rows = sheet2.duplicated().sum()
print(f"Duplicate rows found: {duplicate_rows}")


Duplicate rows found: 0


In [8]:
#Cell 7: Drop Label Leakage Columns (A-SUSP, CLASS, etc.)
label_leak_cols = ['CLASS', 'A', 'B', 'C', 'D', 'E', 'AD', 'DE', 'LD', 'FS', 'SUSP']
columns_to_drop = [col for col in label_leak_cols if col in sheet2.columns]

print("Dropping label leakage columns:", columns_to_drop)
clean_df = sheet2.drop(columns=columns_to_drop, errors='ignore')
print("Shape after dropping leakage columns:", clean_df.shape)
clean_df.head()


Dropping label leakage columns: []
Shape after dropping leakage columns: (2128, 1)


Unnamed: 0,"FileName,Date,SegFile,b,e,LBE,LB,AC,FM,UC,ASTV,MSTV,ALTV,MLTV,DL,DS,DP,DR,Width,Min,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,NSP"
0,"Variab10.txt,1996-12-01,CTG0001.txt,240.0,357...."
1,"Fmcs_1.txt,1996-05-03,CTG0002.txt,5.0,632.0,13..."
2,"Fmcs_1.txt,1996-05-03,CTG0003.txt,177.0,779.0,..."
3,"Fmcs_1.txt,1996-05-03,CTG0004.txt,411.0,1192.0..."
4,"Fmcs_1.txt,1996-05-03,CTG0005.txt,533.0,1147.0..."


In [9]:
# Cell 8: Save Cleaned and Scaled Datasets
# Save unscaled
clean_df.to_csv("ctg_clean.csv", index=False)
print(" Saved ctg_clean.csv")

# Scale numeric features
num_cols = clean_df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [c for c in num_cols if c not in ["NSP","CLASS","label"]]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(clean_df[feature_cols])

df_scaled = pd.DataFrame(X_scaled, columns=feature_cols, index=clean_df.index)
for col in clean_df.columns:
    if col not in feature_cols:
        df_scaled[col] = clean_df[col]

df_scaled.to_csv("ctg_clean_scaled.csv", index=False)
print("Saved ctg_clean_scaled.csv")
df_scaled.head()


 Saved ctg_clean.csv


ValueError: at least one array or dtype is required

In [10]:
# Cell 9: Final Cleaning Summary with Charts
import matplotlib.pyplot as plt
import seaborn as sns

print("=== Cleaning Summary ===")
print("Rows, Cols:", clean_df.shape)

# 1️ Missing values check
missing_after = clean_df.isna().sum().sum()
print("Remaining missing values:", missing_after)

# 2️ Class distribution
target_col = "NSP" if "NSP" in clean_df.columns else ("CLASS" if "CLASS" in clean_df.columns else None)
if target_col:
    counts = clean_df[target_col].value_counts().sort_index()
    props = counts / counts.sum()

    print("\nClass distribution:")
    print(counts)
    print("\nClass proportions (%):")
    print((props * 100).round(2))

    # === Prepare tidy data for seaborn ===
    counts_df = counts.reset_index()
    counts_df.columns = ["Class", "Count"]

    props_df = props.reset_index()
    props_df.columns = ["Class", "Proportion"]

    # === Plot class distribution ===
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))

    # Counts barplot
    sns.barplot(data=counts_df, x="Class", y="Count", hue="Class", ax=ax[0], palette="viridis", legend=False)
    ax[0].set_title("Class Distribution (Counts)")
    for i, v in enumerate(counts_df["Count"]):
        ax[0].text(i, v + 5, str(v), ha='center')

    # Proportions barplot
    sns.barplot(data=props_df, x="Class", y="Proportion", hue="Class", ax=ax[1], palette="viridis", legend=False)
    ax[1].set_title("Class Distribution (Proportions)")
    ax[1].set_ylim(0, 1)
    for i, v in enumerate(props_df["Proportion"]):
        ax[1].text(i, v + 0.01, f"{v:.1%}", ha='center')

    plt.tight_layout()
    plt.show()

else:
    print(" Target column not found.")


=== Cleaning Summary ===
Rows, Cols: (2128, 1)
Remaining missing values: 0
 Target column not found.


In [11]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [12]:
# Convert cleaned dataset to Excel
clean_df.to_excel("ctg_clean.xlsx", index=False)
print(" Cleaned dataset saved as ctg_clean.xlsx")


 Cleaned dataset saved as ctg_clean.xlsx


In [13]:
#scaled version

df_scaled.to_excel("ctg_clean_scaled.xlsx", index=False)
print(" Scaled dataset saved as ctg_clean_scaled.xlsx")


NameError: name 'df_scaled' is not defined