In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from collections import Counter
from imblearn.over_sampling import SMOTE

# 1. Load dataset
df = pd.read_csv("data.csv")

# 2. Clean column names
df.columns = df.columns.str.strip()

# 3. Separate features and target
X = df.drop(columns=['num'])
y = df['num'].astype(int)  # ensure numeric target

# 4. Handle missing/non-numeric values
X = X.replace('?', pd.NA)               # replace '?' with NaN
X = X.apply(pd.to_numeric)              # convert to numeric
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# 5. Scale features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# 6. Check class distribution
print("Class distribution before SMOTE:", Counter(y))

# 7. Apply SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_scaled, y)  # <-- fixed here
print("Class distribution after SMOTE:", Counter(y_res))

# 8. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Class distribution before SMOTE: Counter({0: 188, 1: 106})
Class distribution after SMOTE: Counter({0: 188, 1: 188})
Train shape: (300, 13)
Test shape: (76, 13)
