# CTG Data Exploration Notebook Structure

## 1. Imports and Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

## 2. Load the Dataset

In [None]:
# Read the "Raw Data" sheet of the CTG workbook into a DataFrame.
# The path is relative to the notebook's working directory.
raw_data_path = "../CTG.xls"
df = pd.read_excel(io=raw_data_path, sheet_name="Raw Data")

# Print the (rows, columns) shape, then render the first rows as the cell output.
print(df.shape)
df.head(5)

## 3. Basic Interpretation

In [None]:
# Column dtypes and non-null counts (info() prints directly to stdout).
df.info()

# Summary statistics. A notebook cell only renders its *last* expression, so a
# bare `df.describe()` in the middle of the cell would be silently discarded;
# display() (injected into the namespace by IPython/Jupyter) renders it explicitly.
display(df.describe())

# The ten columns with the most missing values, worst first.
df.isnull().sum().sort_values(ascending=False).head(10)

Checks for:
- Missing Data
- Outliers
- Wrong data types

## 4. Remove Empty Rows & Metadata

In [None]:
# Columns removed by this cell.
# NOTE(review): presumably file metadata and alternative label encodings - confirm
# against the dataset description.
columns_to_drop = [
    "FileName", "Date", "SegFile", "LBE", "A", "B", "C", "D", "E",
    "AD", "DE", "LD", "FS", "SUSP", "CLASS",
]

df = (
    df.dropna(how="all")                        # discard rows where every cell is missing
    .drop(columns=columns_to_drop)
    .drop(index=[2128, 2129], errors="ignore")  # drop these two row labels if they are still present
)

# Shift the NSP labels down by one (the class-distribution plot labels them 0, 1, 2).
df["NSP"] = df["NSP"] - 1
df.head()

## 5. Class Distribution

In [None]:
# Bar chart of how many rows fall into each NSP class.
fig, ax = plt.subplots()
sns.countplot(data=df, x="NSP", ax=ax)
ax.set_title("Class Distribution of NSP")
ax.set_xlabel("Class (0=Normal, 1=Suspect, 2=Pathologic)")
ax.set_ylabel("Count")
plt.show()

The plot shows how imbalanced the three NSP classes are.

## 6. Feature Distributions

In [None]:
# One 20-bin histogram per column, drawn on a shared 14x12 inch figure.
hist_axes = df.hist(bins=20, figsize=(14, 12))

# df.hist() leaves its figure as the current one, so the super-title lands on it.
plt.gcf().suptitle("Feature Distributions")
plt.show()

Use these histograms to identify skewed features such as **ASTV**, **ALTV**, etc.

## 7. Correlation Heatmap

In [None]:
# Pairwise correlation between every remaining column (pandas' default method).
correlation_matrix = df.corr()

# Large canvas so that the per-cell annotations stay legible.
plt.figure(figsize=(30, 25))
sns.heatmap(
    correlation_matrix,
    annot=True,              # write the coefficient inside each cell
    cmap='coolwarm',
    fmt='.2f',
    cbar=True,
    annot_kws={'size': 13},
)
plt.title('Correlation Matrix', fontsize=14)
plt.xticks(fontsize=10, rotation=90)
plt.yticks(fontsize=10)
plt.tight_layout()
plt.show()

Shows us which features correlate strongly with **NSP**

## 8. Feature Scaling (StandardScaler)

In [None]:
# Every column except the NSP label is a feature to be standardized.
feature_columns = [col for col in df.columns if col != "NSP"]

scaler = StandardScaler()
scaled = scaler.fit_transform(df[feature_columns])

# fit_transform returns a plain array, so scaled_df gets a fresh 0..n-1 index.
scaled_df = pd.DataFrame(scaled, columns=feature_columns)

# Attach the labels *positionally*. Rows were dropped from df earlier without
# resetting its index, so assigning the Series directly would align on index
# labels and could pair features with the wrong label or introduce NaN.
scaled_df["NSP"] = df["NSP"].to_numpy()
scaled_df.head()

## 9. Save Cleaned Data

In [None]:
# Write the scaled features plus the NSP label to CSV, omitting the index column.
output_path = "CTG_cleaned.csv"
scaled_df.to_csv(output_path, index=False)
print("Cleaned and scaled dataset saved!")

## 10. Summary
- The dataset shows clear imbalance across the three classes (Normal > Suspect > Pathologic).

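- Several variability features (ASTV, ALTV) are skewed and are candidates for a log transform; note that the preprocessing above applies only standardization, not a log transform.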

- All features were standardized to zero mean and unit variance for model consistency.