# Keep only the first 105031 rows of the dataset for model training

In [49]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# Read the complete raw workbook into a DataFrame.
df = pd.read_excel("Mental Health DataSet.xlsx")

# Write the leading 105031 rows back out as the working dataset; the
# DataFrame index is not written as a column.
df.head(105031).to_excel("cleaned_dataset.xlsx", index=False)

# Remove NaN values and encode the categorical features: convert Gender, Country, Occupation, and Days_Indoors into numeric representations

In [50]:
df = pd.read_excel("cleaned_dataset.xlsx")

# Remove rows with any NaN values
df = df.dropna()

# Encode each categorical column with its OWN LabelEncoder and keep the fitted
# encoders, so every category -> code mapping can later be inverted or applied
# to new data. (Refitting one shared encoder would overwrite the mapping of
# every column except the last one.)
encoders = {}
for column in ('Gender', 'Country', 'Occupation', 'Days_Indoors'):
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    encoders[column] = le
# NOTE: after the loop `le` is the encoder fitted on 'Days_Indoors'.

# The NaN rows were already dropped above and label encoding yields integer
# codes, so the frame can be saved directly. This overwrites the input file.
df.to_excel("cleaned_dataset.xlsx", index=False)

# Now split the data into training and test sets to prepare for model implementation.

In [51]:
from sklearn.model_selection import train_test_split


# Separate the target column from the predictor columns.
y = df['treatment']
X = df.drop(columns='treatment')

# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Display the results
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape:  {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape:  {y_test.shape}",
    sep="\n",
)

X_train shape: (82297, 15)
X_test shape:  (20575, 15)
y_train shape: (82297,)
y_test shape:  (20575,)


# Decision Tree code

In [52]:
import numpy as np
from sklearn.metrics import confusion_matrix

class DecisionTreeClassifierScratch:
    """
    A simple binary-split Decision Tree classifier implemented from scratch.

    Every internal node tests ``x[feature] <= threshold``: samples that pass go
    to the left child, all others to the right child. Candidate thresholds are
    the unique values observed for each feature.

    Parameters
    ----------
    max_depth : int or None
        Maximum depth of the tree; ``None`` means no depth limit.
    min_samples_split : int
        A node with fewer samples than this becomes a leaf. The same value is
        also used as the minimum number of samples each child of a split must
        receive.
    criterion : str
        ``'gini'`` selects Gini impurity; any other value selects entropy.
    """

    class Node:
        """Tree node: a leaf when ``value`` is set, otherwise a threshold test."""

        def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
            self.feature = feature      # column index tested at this node
            self.threshold = threshold  # samples with x[feature] <= threshold go left
            self.left = left
            self.right = right
            self.value = value          # predicted label (leaves only)

    def __init__(self, max_depth=None, min_samples_split=2, criterion='entropy'):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.tree = None

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / counts.sum()
        # np.unique only reports labels that actually occur, so every
        # probability is strictly positive and log2 needs no smoothing term.
        # A smoothing epsilon would bias the result (e.g. a pure node would
        # get a slightly negative entropy).
        return -np.sum(probs * np.log2(probs))

    def _gini(self, y):
        """Return the Gini impurity of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / counts.sum()
        return 1 - np.sum(probs**2)

    def _impurity(self, y):
        """Evaluate the impurity measure selected by ``self.criterion`` on ``y``."""
        if self.criterion == 'gini':
            return self._gini(y)
        return self._entropy(y)

    def _information_gain(self, y, y_left, y_right, parent_loss=None):
        """
        Return the impurity reduction obtained by splitting ``y`` into
        ``y_left`` and ``y_right``.

        ``parent_loss`` may carry the already-computed impurity of ``y`` so
        that callers evaluating many candidate splits of the same node do not
        recompute it; when omitted it is computed here.
        """
        if parent_loss is None:
            parent_loss = self._impurity(y)
        n = len(y)
        n_left, n_right = len(y_left), len(y_right)
        child_loss = (n_left / n) * self._impurity(y_left) + (n_right / n) * self._impurity(y_right)
        return parent_loss - child_loss

    def _best_split(self, X, y):
        """
        Search every feature and every unique feature value for the split with
        the largest strictly positive gain.

        Returns ``(feature_index, threshold)``, or ``(None, None)`` when no
        admissible split improves on the parent impurity.
        """
        best_gain = 0
        best_feat, best_thresh = None, None
        n_samples, n_features = X.shape
        # The parent impurity is the same for every candidate split of this node.
        parent_loss = self._impurity(y)

        for feature_idx in range(n_features):
            column = X[:, feature_idx]
            for t in np.unique(column):
                left_mask = column <= t
                n_left = np.count_nonzero(left_mask)
                n_right = n_samples - n_left
                # Reject splits that leave either child with too few samples.
                if n_left < self.min_samples_split or n_right < self.min_samples_split:
                    continue
                gain = self._information_gain(y, y[left_mask], y[~left_mask], parent_loss=parent_loss)
                if gain > best_gain:
                    best_gain, best_feat, best_thresh = gain, feature_idx, t

        return best_feat, best_thresh

    def _most_common_label(self, y):
        """Return the most frequent label in ``y`` (smallest label wins ties)."""
        classes, counts = np.unique(y, return_counts=True)
        return classes[np.argmax(counts)]

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the tree for the samples ``(X, y)`` and return its root node."""
        # Stopping conditions: depth limit reached, node is pure, or too few samples.
        if (self.max_depth is not None and depth >= self.max_depth) \
           or len(np.unique(y)) == 1 \
           or len(y) < self.min_samples_split:
            return DecisionTreeClassifierScratch.Node(value=self._most_common_label(y))

        feat, thresh = self._best_split(X, y)
        if feat is None:
            # No split yields a positive gain: fall back to a majority-vote leaf.
            return DecisionTreeClassifierScratch.Node(value=self._most_common_label(y))

        left_mask = X[:, feat] <= thresh
        left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right = self._build_tree(X[~left_mask], y[~left_mask], depth + 1)
        return DecisionTreeClassifierScratch.Node(feature=feat, threshold=thresh, left=left, right=right)

    def fit(self, X, y):
        """
        Build the tree using training data.
        X: array-like of shape (n_samples, n_features)
        y: array-like of shape (n_samples,)

        Returns the fitted estimator itself.
        Raises ValueError when the training set is empty.
        """
        X, y = np.array(X), np.array(y)
        if y.size == 0:
            raise ValueError("Cannot fit a decision tree on an empty training set.")
        self.tree = self._build_tree(X, y)
        return self

    def _traverse_tree(self, x, node):
        """Follow the threshold tests from ``node`` down to a leaf and return its label."""
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def predict(self, X):
        """
        Predict class labels for samples in X.
        X: array-like of shape (n_samples, n_features)
        """
        X = np.array(X)
        return np.array([self._traverse_tree(x, self.tree) for x in X])



# Metric calculations

In [53]:
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

# --- Fit the from-scratch decision tree on the training split ---
tree = DecisionTreeClassifierScratch(
    max_depth=5,
    min_samples_split=10,
    criterion='entropy',
)
tree.fit(X_train.values, y_train.values)

# --- Predict the held-out split and tabulate actual vs. predicted labels ---
y_pred = tree.predict(X_test.values)
cm = confusion_matrix(y_test, y_pred)
# Flattened cells of the confusion matrix, in row-major order.
tn, fp, fn, tp = cm.ravel()

# --- Score the predictions with each metric in turn ---
accuracy, precision, recall, f1 = (
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

# --- Report: labelled confusion matrix followed by the four scores ---
cm_table = pd.DataFrame(
    cm,
    index=['Actual 0 (No Treatment)', 'Actual 1 (Treatment)'],
    columns=['Predicted 0', 'Predicted 1'],
)
print("Confusion Matrix:")
print(cm_table)
print(
    f"\nAccuracy:  {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"F1-Score:  {f1:.4f}",
    sep="\n",
)


Confusion Matrix:
                         Predicted 0  Predicted 1
Actual 0 (No Treatment)         5850         2896
Actual 1 (Treatment)            2457         9372

Accuracy:  0.7398
Precision: 0.7639
Recall:    0.7923
F1-Score:  0.7779


# Random Forest

In [54]:
from collections import Counter

class DecisionTreeClassifierRF(DecisionTreeClassifierScratch):
    """
    Decision tree for use inside a random forest.

    Identical to the base tree except that each split search only considers a
    random subset of ``max_features`` feature columns, drawn from a private
    ``RandomState`` seeded with ``random_state``.
    """

    def __init__(self, max_depth=None, min_samples_split=2, criterion='entropy',
                 max_features=None, random_state=None):
        super().__init__(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            criterion=criterion,
        )
        self.max_features = max_features
        self.random_state = random_state
        self.rng = np.random.RandomState(random_state)

    def _best_split(self, X, y):
        """
        Return ``(feature_index, threshold)`` of the best split among a random
        feature subset, or ``(None, None)`` if no admissible split has
        positive gain.
        """
        n_samples, n_features = X.shape

        # Sample columns without replacement only when a usable limit is set;
        # a falsy limit or one covering every column means "search them all".
        use_subset = bool(self.max_features) and self.max_features < n_features
        if use_subset:
            candidates = self.rng.choice(n_features, self.max_features, replace=False)
        else:
            candidates = np.arange(n_features)

        min_child = self.min_samples_split
        top_gain, top_feature, top_threshold = 0, None, None
        for col_idx in candidates:
            column = X[:, col_idx]
            for cut in np.unique(column):
                goes_left = column <= cut
                goes_right = ~goes_left
                # Only evaluate splits whose two children are both large enough.
                if goes_left.sum() >= min_child and goes_right.sum() >= min_child:
                    gain = self._information_gain(y, y[goes_left], y[goes_right])
                    if gain > top_gain:
                        top_gain, top_feature, top_threshold = gain, col_idx, cut
        return top_feature, top_threshold

class RandomForestClassifierScratch:
    """
    Random forest classifier: an ensemble of ``DecisionTreeClassifierRF`` trees,
    each trained on a bootstrap sample, combined by majority vote.
    """

    def __init__(self, n_estimators=10, max_depth=None, min_samples_split=2,
                 criterion='entropy', max_features='sqrt', random_state=None):
        # Hyper-parameters forwarded to every tree.
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.max_features = max_features
        # Ensemble-level settings and state.
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.rng = np.random.RandomState(random_state)
        self.trees = []

    def _get_max_features(self, n_features):
        """
        Translate ``self.max_features`` into a feature count.

        An int is returned unchanged; ``'sqrt'`` / ``'log2'`` give the
        truncated square root / base-2 logarithm of ``n_features``; anything
        else means "all features".
        """
        limit = self.max_features
        if isinstance(limit, int):
            return limit
        if limit == 'sqrt':
            return int(np.sqrt(n_features))
        return int(np.log2(n_features)) if limit == 'log2' else n_features

    def fit(self, X, y):
        """Train ``n_estimators`` trees on bootstrap samples of ``(X, y)`` and return ``self``."""
        X = np.array(X)
        y = np.array(y)
        n_samples, n_features = X.shape
        self.trees = []

        for offset in range(self.n_estimators):
            # Bootstrap: draw n_samples row indices with replacement.
            picks = self.rng.choice(n_samples, n_samples, replace=True)

            # Each tree gets its own seed derived from the forest seed (if any).
            if self.random_state is None:
                tree_seed = None
            else:
                tree_seed = self.random_state + offset

            member = DecisionTreeClassifierRF(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                criterion=self.criterion,
                max_features=self._get_max_features(n_features),
                random_state=tree_seed,
            )
            member.fit(X[picks], y[picks])
            self.trees.append(member)
        return self

    def predict(self, X):
        """Return, for every row of ``X``, the label predicted by most trees."""
        X = np.array(X)
        # One row of predictions per tree; transposing yields one row per sample.
        votes = np.array([member.predict(X) for member in self.trees])
        return np.array([Counter(ballot).most_common(1)[0][0] for ballot in votes.T])

# Example usage: build a 20-tree forest, fit it on the training split
# (fit returns the forest itself) and predict the test split.
rf = RandomForestClassifierScratch(
    n_estimators=20,
    max_depth=5,
    min_samples_split=10,
    criterion='entropy',
    max_features='sqrt',
    random_state=42,
).fit(X_train.values, y_train.values)
y_pred_rf = rf.predict(X_test.values)

In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# --- Fit the from-scratch random forest on the training split ---
rf = RandomForestClassifierScratch(
    n_estimators=20, max_depth=5, min_samples_split=5,
    criterion='entropy', max_features='sqrt', random_state=42,
)
rf.fit(X_train.values, y_train.values)

# --- Predict the held-out split and tabulate actual vs. predicted labels ---
y_pred_rf = rf.predict(X_test.values)
cm_rf = confusion_matrix(y_test, y_pred_rf)
# Flattened cells of the confusion matrix, in row-major order.
tn, fp, fn, tp = cm_rf.ravel()

# --- Score the predictions with each metric in turn ---
accuracy_rf, precision_rf, recall_rf, f1_rf = (
    score(y_test, y_pred_rf)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

# --- Report: labelled confusion matrix followed by the four scores ---
cm_rf_table = pd.DataFrame(
    cm_rf,
    index=['Actual 0 (No Treatment)', 'Actual 1 (Treatment)'],
    columns=['Predicted 0', 'Predicted 1'],
)
print("Random Forest Confusion Matrix:")
print(cm_rf_table)
print(
    f"\nAccuracy:    {accuracy_rf:.4f}",
    f"Precision:   {precision_rf:.4f}",
    f"Recall:      {recall_rf:.4f}",
    f"F1-Score:    {f1_rf:.4f}",
    sep="\n",
)


Random Forest Confusion Matrix:
                         Predicted 0  Predicted 1
Actual 0 (No Treatment)         5404         3342
Actual 1 (Treatment)            2149         9680

Accuracy:    0.7331
Precision:   0.7434
Recall:      0.8183
F1-Score:    0.7790


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# --- Decision stump as a weak learner ---
class DecisionStump:
    """
    One-level threshold classifier over labels in {-1, +1}.

    With ``polarity == 1`` samples whose feature value is below the threshold
    are labelled -1; with ``polarity == -1`` samples above it are labelled -1.
    All remaining samples are labelled +1.
    """

    def __init__(self):
        self.feature = None
        self.threshold = None
        self.polarity = 1

    @staticmethod
    def _label(column, threshold, polarity):
        """Apply one (threshold, polarity) rule to a feature column, giving -1.0 / +1.0 labels."""
        if polarity == 1:
            negative = column < threshold
        else:
            negative = column > threshold
        return np.where(negative, -1.0, 1.0)

    def fit(self, X, y, sample_weights):
        """
        Exhaustively pick the feature, threshold and polarity with the lowest
        weighted misclassification error, and return ``self``.

        ``y`` is compared directly against the -1/+1 predictions;
        ``sample_weights`` holds one weight per sample.
        """
        X, y, w = np.array(X), np.array(y), np.array(sample_weights)
        n_samples, n_features = X.shape
        lowest_error = float('inf')

        for col_idx in range(n_features):
            column = X[:, col_idx]
            for cut in np.unique(column):
                for direction in (1, -1):
                    guesses = self._label(column, cut, direction)
                    # Total weight of the samples this rule gets wrong.
                    weighted_error = np.sum(w[guesses != y])
                    if weighted_error < lowest_error:
                        lowest_error = weighted_error
                        self.feature = col_idx
                        self.threshold = cut
                        self.polarity = direction
        return self

    def predict(self, X):
        """Label every row of ``X`` as -1.0 or +1.0 using the fitted rule."""
        X = np.array(X)
        return self._label(X[:, self.feature], self.threshold, self.polarity)

# --- AdaBoost from scratch ---
class AdaBoostScratch:
    """
    AdaBoost binary classifier built from ``DecisionStump`` weak learners.

    Labels equal to 1 form the positive class; every other label is treated as
    the negative class. Predictions are returned as 0/1.

    Parameters
    ----------
    n_estimators : int
        Number of boosting rounds, i.e. stumps in the ensemble.
    """

    def __init__(self, n_estimators=50):
        self.n_estimators = n_estimators
        self.learners = []
        self.alphas = []

    def fit(self, X, y):
        """
        Train the ensemble on ``(X, y)`` and return ``self``.

        Any learners from a previous call are discarded first, so refitting
        does not stack a new ensemble on top of the old one.
        """
        X, y = np.array(X), np.array(y)
        # convert y to {-1, +1}
        y_signed = np.where(y == 1, 1, -1)
        n_samples = len(y_signed)
        # initialize sample weights uniformly
        w = np.ones(n_samples) / n_samples

        # Start from an empty ensemble on every call.
        self.learners = []
        self.alphas = []

        for _ in range(self.n_estimators):
            stump = DecisionStump().fit(X, y_signed, w)
            preds = stump.predict(X)

            # Weighted error, clipped away from 0 and 1 so the log below stays finite.
            err = np.clip(np.sum(w[preds != y_signed]), 1e-10, 1 - 1e-10)
            alpha = 0.5 * np.log((1 - err) / err)

            # Up-weight misclassified samples, down-weight correct ones, renormalise.
            w *= np.exp(-alpha * y_signed * preds)
            w /= w.sum()

            self.learners.append(stump)
            self.alphas.append(alpha)
        return self

    def predict(self, X):
        """Return 0/1 labels from the sign of the alpha-weighted sum of stump votes."""
        X = np.array(X)
        # weighted sum of stump predictions
        learner_preds = np.array([alpha * learner.predict(X)
                                   for learner, alpha in zip(self.learners, self.alphas)])
        y_signed_pred = np.sign(np.sum(learner_preds, axis=0))
        # map back to {0,1}; a zero sum (tie) maps to 0
        return np.where(y_signed_pred == 1, 1, 0)

# Fit a 20-round AdaBoost ensemble. The pandas objects are passed directly;
# fit() and predict() convert their inputs with np.array().
ab = AdaBoostScratch(n_estimators=20)
ab.fit(X_train, y_train)
y_pred_ab = ab.predict(X_test)

# Evaluate: raw confusion matrix followed by the four scores.
cm = confusion_matrix(y_test, y_pred_ab)
print("AdaBoost Confusion Matrix:", cm, sep="\n")
print(
    f"Accuracy:  {accuracy_score(y_test, y_pred_ab):.4f}",
    f"Precision: {precision_score(y_test, y_pred_ab):.4f}",
    f"Recall:    {recall_score(y_test, y_pred_ab):.4f}",
    f"F1-Score:  {f1_score(y_test, y_pred_ab):.4f}",
    sep="\n",
)


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'