Import libraries

In [84]:
import numpy as np
import pandas as pd
import re

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (make_scorer, accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report)


Extract the data

In [85]:
df = pd.read_csv("../data/raw/survey.csv")

df.shape

(1259, 27)

In [86]:
target_col = "treatment"  # <-- your binary target (Yes/No)
gender_col = "Gender"     # <-- gender column name

# Make target numeric
df[target_col] = df[target_col].map({"Yes": 1, "No": 0}).astype(int)

X = df.drop(columns=[target_col])
y = df[target_col]

**Feature engineering**

separate target and features + drop unused features

In [87]:
X = df.drop(['Timestamp','Country','state','comments','treatment'], axis=1)
y = df['treatment']

divide features into categorical and numerical

In [88]:
object_features = X.select_dtypes(include=["object"])
numeric_features = X.select_dtypes(include=["int64", "float64"])

create a regex to clean the gender feature

In [89]:
def gender_simplify(gen):

    s = str(gen).strip().lower()
    s = re.sub(r"[\W_]+", " ", s).strip()

    if s in {"m", "male", "man","make", "mal","malr","msle","masc","mail", "boy"}:
        return "Male"
    if s in {"f", "female", "woman","femake", "femail","femme", "girl"}:
        return "Female"
    if s in {"nb", "n", "nonbinary", "non binary"}:
        return "Other"

    # Detect more complex patterns
    if re.search(r"\b(female|woman|girl|femme)\b", s):
        return "Female"
    if re.search(r"\b(male|man|boy|masc)\b", s):
        return "Male"

    # Otherwise, everything else -> Other
    return "Other"


def simplify_gender_df(X):
    # If it's already a DataFrame, take the FIRST (and only) column
    if isinstance(X, pd.DataFrame):
        col = X.columns[1]                # <-- was [1]; must be [0]
        series = X[col]
    else:
        # Coerce to ndarray and normalize shapes
        X = np.asarray(X, dtype=object)
        if X.ndim == 1:
            series = pd.Series(X)
        elif X.ndim == 2 and X.shape[1] == 1:
            series = pd.Series(X[:, 0])
        else:
            raise ValueError(f"Expected a single column, got shape {X.shape}")

    # Apply your regex mapper and return a ONE-COLUMN DataFrame
    out = series.apply(gender_simplify)
    return out.to_frame(name=getattr(series, "name", "gender"))

apply regex to gender

In [90]:
numeric_features = X.select_dtypes(include=["number", "bool"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()
other_categoricals = [c for c in categorical_features if c != 'Gender']

Define preprocessing pipelines and models to evaluate

In [91]:
gender_pipeline = Pipeline([
    ("impute",   SimpleImputer(strategy="most_frequent")),
    ("simplify", FunctionTransformer(simplify_gender_df, feature_names_out="one-to-one",validate=False)),
    ("ohe",      OneHotEncoder(handle_unknown="ignore")),
])

cat_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe",    OneHotEncoder(handle_unknown="ignore")),
])

num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
])


preprocess = ColumnTransformer([
    ("num",    num_pipeline, numeric_features),
    ("cat",    cat_pipeline, other_categoricals),
    ("gender", gender_pipeline, ['Gender']),
])

# ----------------------------
# Models
# ----------------------------
log_reg = LogisticRegression(max_iter=2000)
tree_clf = DecisionTreeClassifier(random_state=42)
xgb_clf = XGBClassifier(
    n_estimators=500,   
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

models = {
    "Logistic Regression": log_reg,
    "Decision Tree": tree_clf,
    "XGBoost": xgb_clf
}

In [92]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {
    "accuracy":  make_scorer(accuracy_score),
    "precision": make_scorer(precision_score, zero_division=0),
    "recall":    make_scorer(recall_score,    zero_division=0),
    "f1":        make_scorer(f1_score,        zero_division=0)
}

for name, est in models.items():
    pipe = Pipeline([("preprocess", preprocess), ("model", est)])
    res = cross_validate(pipe, X, y, cv=cv, scoring=scoring)
    print(f"\n=== {name} (5-fold CV) ===")
    print(f"Accuracy : {res['test_accuracy'].mean():.3f} ± {res['test_accuracy'].std():.3f}")
    print(f"Precision: {res['test_precision'].mean():.3f} ± {res['test_precision'].std():.3f}")
    print(f"Recall   : {res['test_recall'].mean():.3f} ± {res['test_recall'].std():.3f}")
    print(f"F1       : {res['test_f1'].mean():.3f} ± {res['test_f1'].std():.3f}")


ABNORMAL: 

You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
ABNORMAL: 

You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



=== Logistic Regression (5-fold CV) ===
Accuracy : 0.732 ± 0.047
Precision: 0.753 ± 0.034
Recall   : 0.697 ± 0.091
F1       : 0.722 ± 0.064

=== Decision Tree (5-fold CV) ===
Accuracy : 0.658 ± 0.045
Precision: 0.662 ± 0.041
Recall   : 0.661 ± 0.059
F1       : 0.661 ± 0.049

=== XGBoost (5-fold CV) ===
Accuracy : 0.728 ± 0.026
Precision: 0.728 ± 0.025
Recall   : 0.739 ± 0.037
F1       : 0.733 ± 0.027


In [94]:
final_model = models["XGBoost"]
final_pipe  = Pipeline([("preprocess", preprocess), ("model", final_model)])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

final_pipe.fit(X_train, y_train)
y_pred = final_pipe.predict(X_test)

print("\n=== Test Set Metrics (XGB) ===")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, zero_division=0))
print("Recall   :", recall_score(y_test, y_pred, zero_division=0))
print("F1       :", f1_score(y_test, y_pred, zero_division=0))

# Confusion matrix + per-class report
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, zero_division=0))


=== Test Set Metrics (XGB) ===
Accuracy : 0.6865079365079365
Precision: 0.7168141592920354
Recall   : 0.6328125
F1       : 0.6721991701244814

Confusion matrix:
 [[92 32]
 [47 81]]

Classification report:
               precision    recall  f1-score   support

           0       0.66      0.74      0.70       124
           1       0.72      0.63      0.67       128

    accuracy                           0.69       252
   macro avg       0.69      0.69      0.69       252
weighted avg       0.69      0.69      0.69       252

