## CIS 335 Assignment 2

Jensen Holm <br>
Feb. 26th 2023

In [28]:
import pandas as pd
from dataclasses import dataclass
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

### Cleaning

In [29]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

train = train.drop("Unnamed: 32", axis=1)
test = test.drop("Unnamed: 32", axis=1)

train["diagnosis"] = train["diagnosis"].dropna()
test["diagnosis"]= train["diagnosis"].dropna()

X_train = train[[col for col in train.columns if col != "diagnosis"]]
Y_train = train["diagnosis"]

X_test = test[[col for col in train.columns if col != "diagnosis"]]
Y_test = test["diagnosis"]

## Model Expiramentation

In [65]:

@dataclass
class ClassifierTree:
    X_train: pd.DataFrame
    Y_train: pd.DataFrame
    X_test: pd.DataFrame
    Y_test: pd.DataFrame

    @property
    def x_train(self) -> pd.DataFrame:
        return self.X_train

    @property
    def y_train(self) -> pd.DataFrame:
        return self.Y_train
    
    @property
    def x_test(self) -> pd.DataFrame:
        return self.X_test
    
    @property
    def y_test(self) -> pd.DataFrame:
        return self.Y_test

    @staticmethod
    def norms(norm_type, depth=5, splitter="best"):
        n = {
            "z-score": make_pipeline(
                StandardScaler(),
                DecisionTreeClassifier(max_depth=depth, splitter=splitter),
            ),
            "minmax": make_pipeline(
                MinMaxScaler(),
                DecisionTreeClassifier(max_depth=depth, splitter=splitter)
            )
        }
        try:
            return n[norm_type]
        except KeyError:
            raise KeyError(f"no such normalization type of '{norm_type}'")


    def fit(self, norm_type=None, depth=5, splitter="best"):
        if norm_type:
            dt = self.norms(norm_type=norm_type, depth=depth, splitter=splitter)
        else:
            dt = DecisionTreeClassifier(max_depth=depth, splitter=splitter)

        dt.fit(self.x_train, self.y_train)
        pred = dt.predict(self.x_test)
        score = accuracy_score(
            y_pred=pred,
            y_true=self.y_test,
        )

        return f"depth = {depth}, splitter = {splitter}, normalizaion = {norm_type}, Accuracy: {score*100:.3f}"


### Expiramentation

In [66]:
tree = ClassifierTree(
    X_train=X_train,
    Y_train=Y_train,
    X_test=X_test,
    Y_test=Y_test,
)

In [76]:
# no pre processing
no_5 = tree.fit(depth=5)
no_6 = tree.fit(depth=5, splitter="random")
print(no_5)
print(no_6)

# z score normalization
z_5 = tree.fit(depth=50, norm_type="z-score")
z_5_r = tree.fit(norm_type="z-score", depth=20)
print(z_5)
print(z_5_r)

# minmax normalization
m_5 = tree.fit(depth=20, norm_type="minmax")
print(m_5)

depth = 5, splitter = best, normalizaion = None, Accuracy: 39.167
depth = 5, splitter = random, normalizaion = None, Accuracy: 41.667
depth = 50, splitter = best, normalizaion = z-score, Accuracy: 40.000
depth = 20, splitter = best, normalizaion = z-score, Accuracy: 39.167
depth = 20, splitter = best, normalizaion = minmax, Accuracy: 41.667
