In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB 
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')



df = pd.read_csv('./data/original_csv.csv')

# fill missing values with mean column values
df.fillna(df.mean(), inplace=True)

#replace value in class column
df['class'] = df['class'].replace(['class1', 'class2'], [0, 1])
#df.head()

X = df.drop('class', axis=1)
y = df['class']
#print(X.head())

#normalize data
X = (X - X.mean()) / X.std()
y = y

#replace NaN with 0
X = X.fillna(0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.head())

      Single Epithelial Cell Size  Bare Nuclei  Bland Chromatin  \
82                      -0.549168            0        -0.179534   
51                      -0.549168            0        -0.179534   
220                     -0.549168            0        -0.179534   
559                     -0.549168            0        -0.589645   
544                     -0.549168            0        -0.589645   

     Clump Thickness  Marginal Adhesion   Mitoses  Normal Nucleoli  \
82          0.206788          -0.632794 -0.343666        -0.611387   
51          0.206788           0.417854 -0.343666         0.371049   
220        -1.213798          -0.282578 -0.343666        -0.611387   
559         0.206788          -0.632794 -0.343666        -0.611387   
544        -0.858651          -0.282578 -0.343666        -0.611387   

     Uniformity of Cell Shape  Uniformity of Cell Size  
82                  -0.742767                -0.371782  
51                  -0.069800                -0.044070  
220  

In [17]:
#Initialize classifier
clf0 = KNeighborsClassifier(n_neighbors=5, 
                            weights='uniform', 
                            algorithm='auto', 
                            leaf_size=30, 
                            p=2, 
                            metric='minkowski', 
                            metric_params=None, 
                            n_jobs=None)
clf1 = LogisticRegression(random_state=0, 
                            solver='lbfgs', 
                            multi_class='multinomial', 
                            max_iter=1000)
clf2 = DecisionTreeClassifier(random_state=0, 
                                criterion='gini', 
                                splitter='best', 
                                max_depth=None, 
                                min_samples_split=2, 
                                min_samples_leaf=1, 
                                min_weight_fraction_leaf=0.0, 
                                max_features=None, 
                                max_leaf_nodes=None, 
                                min_impurity_decrease=0.0,
                                class_weight=None)
clf3 = RandomForestClassifier(n_estimators=100,
                                criterion='gini', 
                                max_depth=None, 
                                min_samples_split=2, 
                                min_samples_leaf=1, 
                                min_weight_fraction_leaf=0.0, 
                                max_features='auto', 
                                max_leaf_nodes=None, 
                                bootstrap=True, 
                                oob_score=False, 
                                n_jobs=None, 
                                random_state=None, 
                                verbose=0, 
                                warm_start=False, 
                                class_weight=None)
clf4 = SVC(gamma='auto',
            kernel='rbf', 
            C=1.0, 
            degree=3, 
            coef0=0.0, 
            shrinking=True, 
            probability=False,
            tol=0.001,
            cache_size=200,
            class_weight=None,
            verbose=False,
            max_iter=-1,
            decision_function_shape='ovr',
            break_ties=False,
            random_state=None)
clf5 = GaussianNB()
clf6 = MLPClassifier(hidden_layer_sizes=(100, ),
                    activation='relu',
                    solver='adam',
                    alpha=0.0001,
                    batch_size='auto',
                    learning_rate='constant',
                    learning_rate_init=0.001,
                    power_t=0.5,
                    max_iter=200,
                    shuffle=True,
                    random_state=None,
                    tol=0.0001,
                    verbose=False,
                    warm_start=False,
                    momentum=0.9,
                    nesterovs_momentum=True,
                    early_stopping=False,
                    validation_fraction=0.1,
                    beta_1=0.9,
                    beta_2=0.999,
                    epsilon=1e-08,
                    n_iter_no_change=10)
clf7 = GaussianNB()

models = [clf0, clf1, clf2, clf3, clf4, clf5, clf6, clf7]

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(model)
    print("accuracy score: ", accuracy)
    print("f1 score: ", f1)
    

KNeighborsClassifier()
accuracy score:  0.9714285714285714
f1 score:  0.9715871547508921
LogisticRegression(max_iter=1000, multi_class='multinomial', random_state=0)
accuracy score:  0.9571428571428572
f1 score:  0.9568790584415585
DecisionTreeClassifier(random_state=0)
accuracy score:  0.9285714285714286
f1 score:  0.9285714285714286
RandomForestClassifier(max_features='auto')
accuracy score:  0.9642857142857143
f1 score:  0.9643874643874644
SVC(gamma='auto')
accuracy score:  0.9642857142857143
f1 score:  0.9643874643874644
GaussianNB()
accuracy score:  0.9642857142857143
f1 score:  0.9643874643874644
MLPClassifier()
accuracy score:  0.9714285714285714
f1 score:  0.9714285714285714
GaussianNB()
accuracy score:  0.9642857142857143
f1 score:  0.9643874643874644
