In [2]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load dataset
df = pd.read_csv("adult_dataset_balanced.csv")

# Check dataset balance
print("Class Distribution:\n", df["ASD_Class"].value_counts())

# Define Features (X) and Target (y)
X = df.drop(columns=["ASD_Class"])
y = df["ASD_Class"]

# Save feature order for app.py
joblib.dump(X.columns.tolist(), "model_features.pkl")
print("✅ Feature names saved successfully!")

# Apply SMOTE to ensure balance
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, "scaler.pkl")  # Save scaler

# Train Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_scaled, y_train)
joblib.dump(dt_model, "decision_tree.pkl")

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
joblib.dump(rf_model, "random_forest.pkl")

# Train XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_scaled, y_train)
joblib.dump(xgb_model, "xgboost.pkl")

print("✅ Models retrained and saved successfully!")

Class Distribution:
 ASD_Class
1    429
0    429
Name: count, dtype: int64
✅ Feature names saved successfully!
✅ Models retrained and saved successfully!


Parameters: { "use_label_encoder" } are not used.

