# Datapreprocessing

### simpleImputer

In [53]:
class SimpleImputer:
    def __init__(self, strategy='mean', missing_values=np.nan):
        self.strategy = strategy
        self.missing_values = missing_values

    def fit(self, X):
        if self.strategy == 'mean':
            self.fill_value = np.nanmean(X, axis=0)
        elif self.strategy == 'median':
            self.fill_value = np.nanmedian(X, axis=0)
        elif self.strategy == 'most_frequent':
            self.fill_value = np.empty(X.shape[1])
            for i in range(X.shape[1]):
                column = X[:, i]
                values, counts = np.unique(column[~np.isnan(column)], return_counts=True)
                self.fill_value[i] = values[np.argmax(counts)]
        else:
            raise ValueError('Invalid strategy: {}'.format(self.strategy))

    def transform(self, X):
        if self.strategy == 'most_frequent':
            X = np.copy(X)
            for i in range(X.shape[1]):
                column = X[:, i]
                mask = np.isnan(column)
                if mask.any():
                    column[mask] = self.fill_value[i]
        else:
            X = np.where(np.isnan(X), self.fill_value, X)

        return X

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

### train_test_split

In [54]:
def train_test_split(X, y, test_size=0.2, random_state=42):
    if random_state:
        np.random.seed(random_state)
    shuffle_indices = np.random.permutation(len(X))
    test_size = int(test_size * len(X))
    test_indices = shuffle_indices[:test_size]
    train_indices = shuffle_indices[test_size:]
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    return X_train, X_test, y_train, y_test


In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Defining the node class

In [56]:
class DecisionTreeNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

### Defining the DecisionTreeClaasifier class

In [57]:
class DecisionTreeClassifier:
    def __init__(self, max_depth=1, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape

        if n_samples < self.min_samples_split or depth == self.max_depth:
            return self._leaf_value(y)

        best_feature_index, best_threshold = self._best_split(X, y)

        if best_feature_index is None:
            return self._leaf_value(y)

        left_idxs = X[:, best_feature_index] <= best_threshold
        right_idxs = X[:, best_feature_index] > best_threshold

        left_tree = self._build_tree(X[left_idxs], y[left_idxs], depth + 1)
        right_tree = self._build_tree(X[right_idxs], y[right_idxs], depth + 1)

        return DecisionTreeNode(best_feature_index, best_threshold, left_tree, right_tree)

    def _best_split(self, X, y):
        best_gini = np.inf
        best_feature_index = None
        best_threshold = None

        for feature_index in range(X.shape[1]):
            feature = X[:, feature_index]
            thresholds = np.unique(feature)

            for threshold in thresholds:
                left_idxs = feature <= threshold
                right_idxs = feature > threshold

                if len(y[left_idxs]) == 0 or len(y[right_idxs]) == 0:
                    continue

                gini = self._split_gini(y, y[left_idxs], y[right_idxs])

                if gini < best_gini:
                    best_gini = gini
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold

    def _split_gini(self, parent, left, right):
        left_weight = len(left) / len(parent)
        right_weight = len(right) / len(parent)
        return left_weight * self._gini(left) + right_weight * self._gini(right)

    def _gini(self, y):
        class_counts = np.bincount(y)
        class_probs = class_counts / len(y)
        return 1 - sum(class_probs ** 2)

    def _leaf_value(self, y):
        value = np.bincount(y).argmax()
        return DecisionTreeNode(value=value)

    def predict(self, X):
        predictions = np.zeros(X.shape[0])
        for i, x in enumerate(X):
            node = self.tree
            while node.left:
                if x[node.feature_index] <= node.threshold:
                    node = node.left
                else:
                    node = node.right
                    
            predictions[i] = node.value
        return predictions


### Accuracy_score

In [58]:
def accuracy_score(y_true, y_pred):
    n_correct = sum(y_true[i] == y_pred[i] for i in range(len(y_true)))
    n_total = len(y_true)
    return n_correct / n_total

# pipeline

In [59]:
# Define pipeline for KNN classification
class Pipeline:
    def __init__(self, steps):
        self.steps = steps
        
    def fit(self, X_train, y_train):
        for step in self.steps:
            step[1].fit(X_train, y_train)
    
    def predict(self, X_test):
        X_data = X_test
        for step in self.steps:
            X_data = step[1].predict(X_data)
        return X_data


In [60]:
import numpy as np
import pandas as pd

# read the data
df = pd.read_csv("data.csv")

#replacing male with 1 and female with 0
df.replace({'sex':{'male':1,'female':0}}, inplace=True)

# split into features and target
X = df.drop("target", axis=1)
y = df["target"]
X = np.array(X)
y = np.array(y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create the preprocessing pipeline
preprocessor = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
])

# create the decision tree classifier
clf = DecisionTreeClassifier()

# create the pipeline with preprocessing and classification
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", clf)
])

# Fit pipeline to training data
pipe.fit(X_train, y_train)

# Define hyperparameter ranges
max_depths = [1,2,3,4,5,6,7,8,9,10]
min_samples_splits = [1,2,3,4,5,6,7,8,9,10]

# Split data into training and validation sets
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize variables to store best hyperparameters and best validation score
best_params = None
best_score = -np.inf

# Loop over all possible combinations of hyperparameters
for max_depth in max_depths:
    for min_samples_split in min_samples_splits:
        # Fit decision tree model on training data using current hyperparameters
        tree = DecisionTreeClassifier(max_depth, min_samples_split)
        tree.fit(X_train, y_train)
               
        # Evaluate model on validation set
        y_pred = tree.predict(X_val)
        score = accuracy_score(y_val, y_pred)
               
        # Record best hyperparameters and best validation score
        if score > best_score:
            best_params = {'max_depth': max_depth, 'min_samples_split': min_samples_split}
            best_score = score

# Fit decision tree model on entire dataset using best hyperparameters
best_tree = DecisionTreeClassifier(**best_params)
best_tree.fit(X, y)

# Evaluate model on separate test set
y_pred_train = best_tree.predict(X_train)
train_score = accuracy_score(y_train, y_pred_train)
print(f'Train set accuracy: {train_score}')

y_pred_test = best_tree.predict(X_test)
test_score = accuracy_score(y_test, y_pred_test)
print(f'Test set accuracy: {test_score}')

print("Best max_depth: ", best_params['max_depth'])
print("Best min_samples_split: ", best_params['min_samples_split'])

Train set accuracy: 0.831275720164609
Test set accuracy: 0.9
Best max_depth:  3
Best min_samples_split:  1
