In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from IPython.display import display

In [None]:
# Encoding categorical variables - IT24101206 (Karunarathne N.W.)
#
# Ordinal-encode the `Level` column into a new float column `level_encoded`
# using the order Low < Medium < High (codes 0.0, 1.0, 2.0). Rows whose Level
# is missing keep NaN in `level_encoded`.
# NOTE(review): `df` is not created in any cell visible here -- confirm it is
# loaded earlier so the notebook survives Restart Kernel -> Run All.

# Normalise whitespace and casing in a local Series (no temporary column on df).
level_text = df['Level'].astype(str).str.strip().str.title()

# A row is missing when the original value is null (NaN, None, pd.NA, NaT),
# when its text is a literal "nan" in any casing, or when it is blank.
# Relying only on the text 'Nan' would let 'None' / '<Na>' / '' reach the
# encoder, which rejects them as unknown categories.
is_missing = df['Level'].isna() | level_text.isin(['Nan', ''])
level_clean = level_text.mask(is_missing)

# Define categories (list order fixes the integer codes)
categories = [['Low', 'Medium', 'High']]
encoder = OrdinalEncoder(categories=categories)

# Encode only the rows that have a label; everything else stays NaN.
# Any remaining label outside `categories` still raises ValueError during fit,
# so unexpected spellings are surfaced instead of being silently encoded.
mask = ~is_missing
df['level_encoded'] = np.nan
df.loc[mask, 'level_encoded'] = encoder.fit_transform(
    level_clean.loc[mask].to_frame(name='Level_clean')
).ravel()

# Show results
print(f"Shape: {df.shape}")
display(df[['Level', 'level_encoded']].head(10))
display(df[['Level', 'level_encoded']].tail(10))
display(df.head())

In [None]:
# Compare the original Level labels (left) with their ordinal codes (right).
level_order = ["Low", "Medium", "High"]

# Plot whitespace/case-normalised labels, the same normalisation applied when
# `level_encoded` is built, so both panels count the same rows. With the raw
# column, a value such as " low" would not match `order` and would be dropped.
# Missing values become the text "Nan", which is not in `order` and is skipped.
level_labels = (
    df["Level"].astype(str).str.strip().str.title().to_frame(name="Level")
)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left: categorical distribution. `hue` mirrors `x` with the legend disabled
# because passing `palette` without `hue` is deprecated in recent seaborn;
# `hue_order` keeps colours tied to Low/Medium/High instead of to the order in
# which labels happen to appear in the data.
sns.countplot(
    data=level_labels,
    x="Level",
    hue="Level",
    order=level_order,
    hue_order=level_order,
    palette="Set2",
    legend=False,
    ax=axes[0],
)
axes[0].set_title("Distribution of Cancer Severity Levels")
axes[0].set_xlabel("Level")
axes[0].set_ylabel("Count")

# Right: encoded numeric distribution. `discrete=True` gives one unit-wide bar
# centred on each integer code; `bins=3` would instead cut the observed
# min-max range into thirds, leaving bars off-centre and producing misleading
# bins whenever one of the three codes is absent.
sns.histplot(
    df["level_encoded"].dropna(),
    discrete=True,
    kde=False,
    color="skyblue",
    ax=axes[1],
)
axes[1].set_xticks(range(len(level_order)))
axes[1].set_title("Encoded Level Distribution")
axes[1].set_xlabel("level_encoded (0=Low, 1=Medium, 2=High)")
axes[1].set_ylabel("Count")

plt.tight_layout()
plt.show()