### Data cleaning and preparation  
* Written by _Fang_ on 2025-10-02  


In [48]:
# import the necessary packages
# To run this notebook, the following packages must be installed:
# pandas, numpy, seaborn, scikit-learn, xlrd (if missing: pip install xlrd).
import pandas as pd
import numpy as np
import seaborn as sb

In [49]:
# Load the raw 'Data' sheet of the CTG workbook, skipping its first row.
# Note: clone the repository locally first; reading directly from a
# cloud-synced folder may raise an error.
try:
    ori_data = pd.read_excel('../data/CTG.xls', sheet_name='Data', skiprows=1)
except FileNotFoundError:
    print(" 'CTG.xls' not found, check local path.")
    # Re-raise so execution stops here with the real cause. Swallowing the
    # error would leave `ori_data` undefined (or stale from an earlier run)
    # and the following cells would fail with a misleading NameError.
    raise

In [50]:
# Preview the first rows of the raw sheet to inspect the available columns.
ori_data.head()

Unnamed: 0,b,e,AC,FM,UC,DL,DS,DP,DR,Unnamed: 9,...,E,AD,DE,LD,FS,SUSP,Unnamed: 42,CLASS,Unnamed: 44,NSP
0,240.0,357.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,,9.0,,2.0
1,5.0,632.0,4.0,0.0,4.0,2.0,0.0,0.0,0.0,,...,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,,6.0,,1.0
2,177.0,779.0,2.0,0.0,5.0,2.0,0.0,0.0,0.0,,...,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,,6.0,,1.0
3,411.0,1192.0,2.0,0.0,6.0,2.0,0.0,0.0,0.0,,...,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,,6.0,,1.0
4,533.0,1147.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,,2.0,,1.0


In [51]:
# Columns kept for modelling: the CTG measurements plus the two label
# columns 'CLASS' and 'NSP'. Passed to `usecols` so only these are read.
columns_to_read = [
    'LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'ASTV', 'MSTV', 'ALTV', 'MLTV',
    'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median',
    'Variance', 'Tendency', 'CLASS', 'NSP'
]

try:
    useful_data = pd.read_excel(
        '../data/CTG.xls',
        sheet_name='Data',
        skiprows=1,
        usecols=columns_to_read,
    )
except FileNotFoundError:
    print(" 'CTG.xls' not found, check local path.")
    # Re-raise: continuing without `useful_data` (or with a stale copy from a
    # previous run) would make every later cell fail or silently use old data.
    raise

In [52]:
# Preview the first rows of the column-filtered data.
useful_data.head()

Unnamed: 0,AC,FM,UC,DL,DS,DP,LB,ASTV,MSTV,ALTV,...,Max,Nmax,Nzeros,Mode,Mean,Median,Variance,Tendency,CLASS,NSP
0,0.0,0.0,0.0,0.0,0.0,0.0,120.0,73.0,0.5,43.0,...,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,9.0,2.0
1,4.0,0.0,4.0,2.0,0.0,0.0,132.0,17.0,2.1,0.0,...,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,6.0,1.0
2,2.0,0.0,5.0,2.0,0.0,0.0,133.0,16.0,2.1,0.0,...,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,6.0,1.0
3,2.0,0.0,6.0,2.0,0.0,0.0,134.0,16.0,2.4,0.0,...,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,6.0,1.0
4,4.0,0.0,5.0,0.0,0.0,0.0,132.0,16.0,2.4,0.0,...,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,2.0,1.0


In [53]:
# Discard every row that contains at least one missing value.
useful_data = useful_data.dropna(axis=0, how='any')

# Show a preview of the result followed by its (rows, columns) shape.
print("Data after removing rows with missing values:")
print(useful_data.head())
print(useful_data.shape)

Data after removing rows with missing values:
    AC   FM   UC   DL   DS   DP     LB  ASTV  MSTV  ALTV  ...    Max  Nmax  \
0  0.0  0.0  0.0  0.0  0.0  0.0  120.0  73.0   0.5  43.0  ...  126.0   2.0   
1  4.0  0.0  4.0  2.0  0.0  0.0  132.0  17.0   2.1   0.0  ...  198.0   6.0   
2  2.0  0.0  5.0  2.0  0.0  0.0  133.0  16.0   2.1   0.0  ...  198.0   5.0   
3  2.0  0.0  6.0  2.0  0.0  0.0  134.0  16.0   2.4   0.0  ...  170.0  11.0   
4  4.0  0.0  5.0  0.0  0.0  0.0  132.0  16.0   2.4   0.0  ...  170.0   9.0   

   Nzeros   Mode   Mean  Median  Variance  Tendency  CLASS  NSP  
0     0.0  120.0  137.0   121.0      73.0       1.0    9.0  2.0  
1     1.0  141.0  136.0   140.0      12.0       0.0    6.0  1.0  
2     1.0  141.0  135.0   138.0      13.0       0.0    6.0  1.0  
3     0.0  137.0  134.0   137.0      13.0       1.0    6.0  1.0  
4     0.0  137.0  136.0   138.0      11.0       1.0    2.0  1.0  

[5 rows x 23 columns]
(2126, 23)


In [54]:
# Print the (rows, columns) shape of the data after the missing-value removal.
print(useful_data.shape)

(2126, 23)


In [55]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing (70% remain for training).
# The fixed random_state makes the shuffle, and so the split, repeatable.
data_train, data_test = train_test_split(
    useful_data,
    test_size=0.3,
    random_state=42,
)

# Preview the first rows of each partition.
print('---training set---\n')
print(data_train.head())

print('---testing set---\n')
print(data_test.head())

---training set---

       AC   FM   UC   DL   DS   DP     LB  ASTV  MSTV  ALTV  ...    Max  Nmax  \
1718  8.0  0.0  2.0  1.0  0.0  0.0  131.0  54.0   1.5   0.0  ...  169.0   7.0   
857   0.0  0.0  4.0  0.0  0.0  0.0  142.0  44.0   0.8   1.0  ...  158.0   1.0   
1075  2.0  0.0  6.0  5.0  0.0  0.0  137.0  27.0   1.6   0.0  ...  167.0   2.0   
371   0.0  4.0  1.0  1.0  0.0  0.0  138.0  55.0   0.7   0.0  ...  151.0   4.0   
222   1.0  6.0  0.0  0.0  0.0  0.0  129.0  47.0   0.9   0.0  ...  156.0   6.0   

      Nzeros   Mode   Mean  Median  Variance  Tendency  CLASS  NSP  
1718     0.0  151.0  142.0   147.0      22.0       1.0    2.0  1.0  
857      0.0  145.0  146.0   147.0       2.0       0.0    1.0  1.0  
1075     0.0  144.0  136.0   141.0      23.0       1.0    6.0  1.0  
371      0.0  144.0  140.0   143.0       5.0       1.0    1.0  1.0  
222      0.0  133.0  133.0   134.0       4.0       1.0    3.0  1.0  

[5 rows x 23 columns]
---testing set---

       AC    FM   UC   DL   DS   DP  

In [56]:
# Write the training partition to CSV; the row index is not written.
train_csv_path = '../data/manipulated_data_set/train_data.csv'
data_train.to_csv(path_or_buf=train_csv_path, index=False)
print(f"Training data exported to {train_csv_path}")

# Write the testing partition to CSV in the same way.
test_csv_path = '../data/manipulated_data_set/test_data.csv'
data_test.to_csv(path_or_buf=test_csv_path, index=False)
print(f"Testing data exported to {test_csv_path}")

Training data exported to ../data/manipulated_data_set/train_data.csv
Testing data exported to ../data/manipulated_data_set/test_data.csv


In [57]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise. 'Tendency', 'CLASS' and 'NSP' are not in this list
# and therefore keep their original values.
numeric_columns = [
    'LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP',
    'ASTV', 'MSTV', 'ALTV', 'MLTV',
    'Width', 'Min', 'Max', 'Nmax', 'Nzeros',
    'Mode', 'Mean', 'Median', 'Variance',
]

# Standardise the listed columns of `useful_data`, overwriting them.
# NOTE: the scaler is fit on every row of `useful_data`, i.e. on rows that
# belong to both the training and the testing partition. `data_train` and
# `data_test` were created before this cell and are not modified here.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(useful_data[numeric_columns])
useful_data[numeric_columns] = scaled_values

print("Data after scaling:")
print(useful_data.head())

Data after scaling:
         AC        FM        UC        DL        DS       DP        LB  \
0 -0.764740 -0.195096 -1.285798 -0.628375 -0.057476 -0.27153 -1.352220   
1  0.358852 -0.195096  0.119475  0.172060 -0.057476 -0.27153 -0.132526   
2 -0.202944 -0.195096  0.470793  0.172060 -0.057476 -0.27153 -0.030884   
3 -0.202944 -0.195096  0.822111  0.172060 -0.057476 -0.27153  0.070757   
4  0.358852 -0.195096  0.470793 -0.628375 -0.057476 -0.27153 -0.132526   

       ASTV      MSTV      ALTV  ...       Max      Nmax    Nzeros      Mode  \
0  1.513190 -0.943095  1.802542  ... -2.119592 -0.701397 -0.458444 -1.065614   
1 -1.744751  0.868841 -0.535361  ...  1.893794  0.655137  0.958201  0.216638   
2 -1.802928  0.868841 -0.535361  ...  1.893794  0.316003  0.958201  0.216638   
3 -1.802928  1.208579 -0.535361  ...  0.333033  2.350804 -0.458444 -0.027600   
4 -1.802928  1.208579 -0.535361  ...  0.333033  1.672537 -0.458444 -0.027600   

       Mean    Median  Variance  Tendency  CLASS  NSP 

In [58]:
# Export the *scaled* training and testing sets to CSV.
#
# `data_train` / `data_test` were split off before any scaling and were never
# scaled themselves, so writing them directly would only duplicate the
# unscaled CSV files. The partitions are therefore standardised here.
#
# The scaler is fit on the training rows only and then applied to both
# partitions, so no statistics from the test rows leak into the training data.
# `StandardScaler` and `numeric_columns` come from the scaling cell above.
split_scaler = StandardScaler().fit(data_train[numeric_columns])

# Work on copies so the unscaled partitions stay available and re-running
# this cell always starts from the same input.
data_train_scaled = data_train.copy()
data_train_scaled[numeric_columns] = split_scaler.transform(data_train[numeric_columns])

data_test_scaled = data_test.copy()
data_test_scaled[numeric_columns] = split_scaler.transform(data_test[numeric_columns])

# export scaled training set to CSV
train_csv_path = '../data/manipulated_data_set/scale_train_data.csv'
data_train_scaled.to_csv(train_csv_path, index=False)
print(f"Training data exported to {train_csv_path}")

# Export scaled testing set to CSV
test_csv_path = '../data/manipulated_data_set/scale_test_data.csv'
data_test_scaled.to_csv(test_csv_path, index=False)
print(f"Testing data exported to {test_csv_path}")

Training data exported to ../data/manipulated_data_set/scale_train_data.csv
Testing data exported to ../data/manipulated_data_set/scale_test_data.csv


In [3]:
import sys
import os

# Treat the parent of the current working directory as the project root and
# make it importable so the local `src` package can be found.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)

if project_root not in sys.path:
    sys.path.append(project_root)

from src.data_cleaning import clean_ctg_data

# Build the workbook path from project_root instead of using the bare
# relative path 'data/CTG.xls': the working directory is the notebook folder
# (the other cells read '../data/CTG.xls'), so a path relative to it would
# point at a non-existent '<notebook_dir>/data/CTG.xls'.
file_path = os.path.join(project_root, 'data', 'CTG.xls')
df_cleaned, cleaning_report = clean_ctg_data(file_path)

# Print the cleaning report line by line.
for line in cleaning_report:
    print(line)

# Only show a preview when a cleaned frame was actually returned.
if df_cleaned is not None:
    print("---- Cleaned DataFrame Head ----")
    display(df_cleaned.head())

ModuleNotFoundError: No module named 'pandas'