In [124]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [125]:
# Statistics for Analysis

# Confusion Matrix
def confusionMatrix(pred, test):
    mat = sklearn.metrics.confusion_matrix(pred, test)
    sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
    plt.xlabel('true label')
    plt.ylabel('predicted label');

# Accuracy
def accuracy(pred, test):
    print('Accuracy: %.2f' %sklearn.metrics.accuracy_score(test, pred))
    
# Mean of test set
def mean(test):
    print('Mean: %.2f' %np.mean(test))
    
# Mean Root Squared
def mrs(pred, test):
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(test, pred)))

# Accuracy + Confusion Matrix + Mean + MRS
def combined(pred, test):
    accuracy(pred,test)
    confusionMatrix(pred, test)
    mrs(pred, test) 

In [126]:
# Load the dataset. 
df = pd.read_csv("./Data/heart.csv", na_values=['NA', '?'])
df = df.reindex(np.random.permutation(df.index))

# Data Wrangling 
df.target.replace({1: 0, 0: 1})

result = []

for x in df.columns:
    if x != 'target':
        result.append(x)
        
X = df[result].values
y = df['target'].values

In [131]:
# Kfold training + results 
class crossValidation:
    
    def __init__(self, algorithm, X, y):
        
        self.algorithm = algorithm
        self.X = X 
        self.y = y
        
    # Splits and trains the dataset
    def train(self, k):
        self.k = k
        
        #Scale data - normalize data
        sc = StandardScaler()
        # Decides number of splits according to specified number by user
        kf = KFold(self.k)
        output = []
        
        for train_index, validate_index in kf.split(self.X,self.y):
            # Training data
            Xt = self.X[train_index]
            sc.fit(Xt)
            X_train_std = sc.transform(Xt)
            X_test_std = sc.transform(self.X[validate_index])
            # Fit model to data
            self.algorithm.fit(X_train_std, y[train_index])
            # Get validation labels
            y_test = (self.y[validate_index])
            # Get predicted labels
            y_pred = (self.algorithm.predict(X_test_std))
            # Creating a list of predictions and validation labels for later use
            output.append(y_pred)
            output.append(y_test)
            
        return output
    
    #Performs statistics on the model
    def stats(self, k):
        pred = self.train(k)
        # See 'combined' function
        return combined(pred[0], pred[1])
    
    #Evalates the effect of different fold sizes and gives the mean of all folds.
    def evaluateK(self, k):
        result = []
        # Runs from 2 (minimum available amount) to a specified number (fold sizes) by user
        for i in range(2, k):
            #Calls train method
            pred = self.train(i)
            result.append(accuracy_score(pred[0], pred[1]))
        
        return result

In [132]:
class handler:
    def __init__(self, data_frame, file_name, ml_type, target):
        self.data_frame = data_frame
        self.ml_type = ml_type
        self.target = target
        self.file_name = file_name
    def data_wrangler(self):
        if self.ml_type = 'classification':
            result = []

            for x in self.data_frame.columns:
                if x != self.target:
                    result.append(x)
                
            self.X = self.data_frame[result].values
            self.y = self.data_frame[self.target].values
            del self.data_frame
            del self.ml_type
        elif self.my_type = 'regression':
            del self.data_frame
            del self.ml_type
    def analysis(self):
        
        output = []
        model_list = [DecisionTreeClassifier(criterion = 'entropy'), Perceptron(max_iter=40,tol=0.001,eta0=1),
                     KNeighborsClassifier(), RandomForestClassifier(n_estimators=10,criterion="entropy")]
        for model in model_list:
            
            name = str(model).split('(')
            name_dict = {}
            model_output = []
            results = crossValidation(model, self.X, self.y)
            np.max(results.evaluateK(20))
            name_dict[name[0]] = np.max(results.evaluateK(20))
            output.append(name_dict)
            
        
        return output

In [133]:
data_handler = handler(10, 20, 30, 40)

In [134]:
results = data_handler.analysis()

In [135]:
print(results)

[{'DecisionTreeClassifier': 0.9411764705882353}, {'Perceptron': 0.8571428571428571}, {'KNeighborsClassifier': 0.8289473684210527}, {'RandomForestClassifier': 0.8823529411764706}]
