In [32]:
# Feature-engineering cell: load the de-duplicated dataset, drop bookkeeping
# columns, derive Shock_Index, ordinal-encode text columns, standardise numeric
# columns, and write the result to CSV.
#
# Imports are placed first so the cell works after "Restart Kernel -> Run All";
# pd.read_csv must not run before pandas has been imported.
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Load your CSV / Excel file
# NOTE(review): absolute, machine-specific path; consider a configurable data
# directory so the notebook runs on other machines.
df_kova_version = pd.read_csv(r"C:\Users\skova\OneDrive\Desktop\Dataset\DuplicatesRemovedInvalidRemovedData.csv")  # or use pd.read_excel("file.xlsx")
print("Data loaded successfully!")

print("============================================================")
print("STEP 3: FEATURE ENGINEERING & COLUMN CLEANING")
print("(NO ONE-HOT ENCODING)")
print("============================================================\n")

# 1️⃣ Drop unnecessary columns
columns_to_drop = [
    'mistriage', 'invalid_flag', 'outlier_flag',
    'borderline_flag', 'review_remove', 'keep', 'invalid_reason'
]

# Only drop the columns that are actually present, so a missing flag column
# does not raise a KeyError.
columns_to_drop = [col for col in columns_to_drop if col in df_kova_version.columns]
# Reassign instead of inplace=True: same result, no hidden in-place mutation.
df_kova_version = df_kova_version.drop(columns=columns_to_drop)

print(f"Dropped columns: {columns_to_drop}")

# 2️⃣ Create derived feature: Shock Index (HR divided by SBP)
if 'HR' in df_kova_version.columns and 'SBP' in df_kova_version.columns:
    # A zero SBP would produce +/-inf, which StandardScaler rejects further
    # down. Mask zeros to NaN so those rows get a missing Shock_Index instead.
    sbp_nonzero = df_kova_version['SBP'].where(df_kova_version['SBP'] != 0)
    df_kova_version['Shock_Index'] = df_kova_version['HR'] / sbp_nonzero
    print("Derived feature 'Shock_Index' created ✅")

# 3️⃣ Encode categorical columns as numbers (Ordinal Encoding)
categorical_cols = df_kova_version.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns encoded: {categorical_cols}")

encoder = OrdinalEncoder()
# Skip the transform when there is nothing to encode.
if categorical_cols:
    df_kova_version[categorical_cols] = encoder.fit_transform(df_kova_version[categorical_cols])

print("Categorical variables encoded using Ordinal Encoding ✅")

# 4️⃣ Scale numeric features
# NOTE(review): this selection runs after encoding, so the ordinal codes
# produced above (float64) are standardised too, as is any numeric label/target
# column present in the frame. Confirm that is intended before modelling.
numeric_cols = df_kova_version.select_dtypes(include=['int64', 'float64']).columns.tolist()
scaler = StandardScaler()
# StandardScaler refuses an input with zero feature columns, so guard the call.
if numeric_cols:
    df_kova_version[numeric_cols] = scaler.fit_transform(df_kova_version[numeric_cols])

print("Numeric features scaled using StandardScaler ✅")

print(f"\nFinal DataFrame shape: {df_kova_version.shape}")

# 5️⃣ Save to CSV (index=False keeps the row index out of the file)
df_kova_version.to_csv("Processed_Kova_Data_NoHotEncoding.csv", index=False)
print("✓ Dataset saved as 'Processed_Kova_Data_NoHotEncoding.csv'")


Data loaded successfully!
STEP 3: FEATURE ENGINEERING & COLUMN CLEANING
(NO ONE-HOT ENCODING)

Dropped columns: ['mistriage', 'invalid_flag', 'outlier_flag', 'borderline_flag', 'review_remove', 'keep', 'invalid_reason']
Derived feature 'Shock_Index' created ✅
Categorical columns encoded: ['Chief_complain', 'Diagnosis in ED']
Categorical variables encoded using Ordinal Encoding ✅
Numeric features scaled using StandardScaler ✅

Final DataFrame shape: (1267, 24)
✓ Dataset saved as 'Processed_Kova_Data_NoHotEncoding.csv'


In [33]:
# Write the processed frame from the previous cell to a second CSV file.
output_file = "Processed_Kova_Data.csv"

# index=False: do not emit the DataFrame's row index as an extra column.
df_kova_version.to_csv(path_or_buf=output_file, index=False)

print(f"✓ Dataset successfully saved as '{output_file}'")


✓ Dataset successfully saved as 'Processed_Kova_Data.csv'
