In [10]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

# ================= Step 1: Load Data =================
csv_path = r"C:\Users\basze\desktop\student-dropout-prediction\data\raw\School_dropout_Dataset.csv"
df = pd.read_csv(csv_path)

# ================= Step 2: Clean Target =================
df = df.dropna(subset=["Dropped_out"])
df = df[df["Dropped_out"].isin([0, 1])]
if df.shape[0] == 0:
    raise ValueError("❌ No valid rows left in dataset after cleaning. Check your CSV file.")
y = df["Dropped_out"]

# ================= Step 3: Convert 'Year_of_Study' from range string to numeric =================
def convert_range_to_numeric(value):
    if isinstance(value, str) and '-' in value:
        try:
            # Take the lower bound of the range as float
            return float(value.split('-')[0].strip())
        except Exception:
            return float('nan')
    else:
        try:
            return float(value)
        except Exception:
            return float('nan')

df['Year_of_Study'] = df['Year_of_Study'].apply(convert_range_to_numeric)
df = df.dropna(subset=['Year_of_Study'])

# ================= Step 4: Encode Remaining Categorical Columns =================
categorical_cols = ['Child_Gender', 'Performance', 'Social_activity']
le_dict = {}

for col in categorical_cols:
    le = LabelEncoder()
    df.loc[:, col] = le.fit_transform(df[col])
    le_dict[col] = le

# ================= Step 5: Prepare Features =================
X = df.drop("Dropped_out", axis=1)

# Ensure features are float type for scaling
X = X.astype(float)

# ================= Step 6: Split Data =================
test_size = 0.2 if len(df) > 10 else 0.1
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42, stratify=y
)

# Check for NaNs after split
if X_train.isnull().sum().sum() > 0 or X_test.isnull().sum().sum() > 0:
    raise ValueError("❌ NaNs detected in features after splitting!")

# ================= Step 7: Scale Features =================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ================= Step 8: Ensure directories =================
os.makedirs("data/processed", exist_ok=True)
os.makedirs("models", exist_ok=True)

# ================= Step 9: Save processed data =================
pd.DataFrame(X_train_scaled, columns=X_train.columns).to_csv("data/processed/X_train.csv", index=False)
pd.DataFrame(X_test_scaled, columns=X_test.columns).to_csv("data/processed/X_test.csv", index=False)
pd.DataFrame(y_train, columns=["Dropped_out"]).to_csv("data/processed/y_train.csv", index=False)
pd.DataFrame(y_test, columns=["Dropped_out"]).to_csv("data/processed/y_test.csv", index=False)

# ================= Step 10: Save scaler and encoders =================
joblib.dump(scaler, "models/scaler.pkl")
for col, le in le_dict.items():
    joblib.dump(le, f"models/le_{col}.pkl")

print("✅ Preprocessing complete! Data saved in data/processed/, scaler and encoders saved in models/")


ValueError: Found input variables with inconsistent numbers of samples: [0, 301]