In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from IPython.display import display

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Work on a separate copy so the encoding below never touches `df`.
# NOTE(review): `df` is not created in the cells visible here — it must already exist in the kernel.
df_rf = df.copy()

# Replace every text feature with integer codes; the target "Level" keeps its original values
text_columns = [
    name
    for name in df_rf.select_dtypes(include=['object']).columns
    if name != "Level"
]
for name in text_columns:
    df_rf[name] = LabelEncoder().fit_transform(df_rf[name])

# Target is the "Level" column, features are everything else
y = df_rf["Level"]
X = df_rf.drop(columns=["Level"])

# Hold out 20% of the rows, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit a 100-tree forest on the training part only
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Pair each feature with its importance and rank from most to least important
importances = rf.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": importances}
).sort_values(by="Importance", ascending=False)

# ---------------- Display Results ----------------
print(" Top Features Selected:")

# 1. The ten highest-ranked features as a table with a fresh positional index
top_ten = importance_df.head(10)
importance_table = top_ten.reset_index(drop=True)
display(importance_table)

# 2. The same ten features as a bar chart.
#    Passing hue="Feature" with legend=False colours each bar from the palette
#    without drawing a legend.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=top_ten,
    x="Importance",
    y="Feature",
    hue="Feature",
    palette="viridis",
    dodge=False,
    legend=False,
    ax=ax,
)
ax.set_title("Top 10 Features by Random Forest Importance")
plt.show()

# ---------------- EDA on Top Features ----------------
# Names of the five highest-ranked features; each one gets its own figure below
top_features = importance_df["Feature"].head(5).tolist()

# One boxplot per feature, showing its distribution within each "Level" group
for feature in top_features:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(
        data=df_rf,
        x="Level",
        y=feature,
        hue="Level",
        palette="Set2",
        dodge=False,
        legend=False,
        ax=ax,
    )
    ax.set_title(f"Boxplot of {feature} by Level")
    plt.show()


In [None]:
# ---------------- Preprocessing ----------------
# Split the raw frame into the feature matrix (X) and the "Level" target (y).
X = df.drop(columns="Level")
y = df["Level"]

# Group the feature columns by kind so each group can get its own transformer.
# Selecting by the generic "number" and "object"/"category" families (instead of
# only int64, float64 and object) keeps columns such as int32, float32 or pandas
# categoricals from being left out of both lists.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(include=["object", "category"]).columns


In [None]:
# Pipelines
# Numeric columns: fill missing values with the column median, then standardise
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent entry, then
# one-hot encode (handle_unknown="ignore" avoids an error on unseen categories)
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:
# ColumnTransformer
# Route each column group to its own pipeline: "num" first, then "cat"
column_groups = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_groups)

In [None]:
# ---------------- Feature Selection ----------------
# Preprocess the columns, then keep the 8 with the highest f_classif score against y.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("feature_selection", SelectKBest(score_func=f_classif, k=8))  # select top 8 features
])

# Fit pipeline
X_new = pipeline.fit_transform(X, y)

# ---------------- Get Feature Names ----------------
# Output columns follow the transformer order: numeric columns first (names
# unchanged), then the one-hot columns produced by the encoder.
feature_names = list(numeric_features)
if len(categorical_features) > 0:
    # Only query the encoder when it had columns to fit on: with no categorical
    # columns the "cat" transformer is not fitted and asking it for names fails.
    feature_names += list(
        pipeline.named_steps["preprocessor"]
                .named_transformers_["cat"]
                .named_steps["encoder"]
                .get_feature_names_out(categorical_features)
    )

# Boolean mask over the preprocessed columns: True where SelectKBest kept the column
selected_mask = pipeline.named_steps["feature_selection"].get_support()
assert len(feature_names) == len(selected_mask), (
    "feature names are out of step with the preprocessed columns"
)

# Select top k features
selected_features = [
    name for name, keep in zip(feature_names, selected_mask) if keep
]

print("Selected Features:", selected_features)

In [None]:
# ---------------- Correlation Heatmap ----------------
# Run the already-fitted preprocessor over every row of X
X_transformed = pipeline.named_steps["preprocessor"].transform(X)

# A sparse result is converted to a dense array before building the frame
if hasattr(X_transformed, "toarray"):
    dense_values = X_transformed.toarray()
else:
    dense_values = X_transformed

# Label the preprocessed columns with the names recovered during feature selection
X_transformed_df = pd.DataFrame(dense_values, columns=feature_names)

# Narrow the frame down to the columns SelectKBest kept
X_selected_df = X_transformed_df.loc[:, selected_features]

In [None]:
# Plot correlation heatmap
# Pairwise correlations between the selected features, annotated to two decimals
correlations = X_selected_df.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlations, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap of Selected Features")
plt.show()