# Importation du dataset & création de train et test

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# https://datagy.io/pandas-shuffle-dataframe/#:~:text=One%20of%20the%20easiest%20ways,Dataframe%2C%20in%20a%20random%20order.
df = pd.read_csv("../diabetes.csv").sample(
    frac = 1,
    random_state=1
).reset_index()

y = df['Outcome']
X = df.drop('Outcome', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

X_train:pd.DataFrame = X_train.to_numpy()
X_test:pd.DataFrame = X_test.to_numpy()
y_train:pd.DataFrame = y_train.to_numpy()
y_test:pd.DataFrame = y_test.to_numpy()

# Fonction pour la classification en fonction de plusieurs algorithmes

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn_hierarchical_classification.classifier import HierarchicalClassifier

from numpy import ndarray

def calculate_classifier_accuracy(
    X_train:ndarray,
    X_test:ndarray,
    y_train:ndarray,
    y_test:ndarray,
    classifier:KNeighborsClassifier|DecisionTreeClassifier|GaussianNB|Perceptron|MLPClassifier|KMeans|HierarchicalClassifier
):
    classifier.fit(X_train,y_train)

    predicted = classifier.predict(X_test)

    accuracy = [True if predicted[i] == y_test[i] else False for i in range(len(predicted))]
    accuracy_stats = {
        "right": len([i for i in accuracy if i]),
        "wrong": len([i for i in accuracy if not i])
    }
    accuracy_stats["percentage"] = round((accuracy_stats["right"]/len(accuracy))*100,2)

    return accuracy_stats
    

# K plus proches voisins

In [4]:
from sklearn.neighbors import KNeighborsClassifier

calculate_classifier_accuracy(X_train, X_test, y_train, y_test, KNeighborsClassifier(n_neighbors=5,  metric='euclidean'))

{'right': 102, 'wrong': 52, 'percentage': 66.23}

# Arbre de decision

In [5]:
from sklearn.tree import DecisionTreeClassifier

calculate_classifier_accuracy(X_train, X_test, y_train, y_test, DecisionTreeClassifier())

{'right': 113, 'wrong': 41, 'percentage': 73.38}

# Classification naïve bayésienne

In [6]:
from sklearn.naive_bayes import GaussianNB

calculate_classifier_accuracy(X_train, X_test, y_train, y_test, GaussianNB())

{'right': 111, 'wrong': 43, 'percentage': 72.08}

# Perceptron

In [7]:
from sklearn.linear_model import Perceptron

calculate_classifier_accuracy(X_train, X_test, y_train, y_test, Perceptron())

{'right': 54, 'wrong': 100, 'percentage': 35.06}

# Adaline

Pour cet algorithme, n'ayant pas trouvé d'équivalent sur sklearn, nous utilisons le code de M. Ajitesh Kumar, trouvable sur le site VitalFlux.com, à l'adresse : https://vitalflux.com/adaline-explained-with-python-example/

In [8]:
import numpy as np

class CustomAdaline:
    # Source : https://vitalflux.com/adaline-explained-with-python-example/
     
    def __init__(self, n_iterations=100, random_state=1, learning_rate=0.01):
        self.n_iterations = n_iterations
        self.random_state = random_state
        self.learning_rate = learning_rate
 
    '''
    Batch Gradient Descent
     
    1. Weights are updated considering all training examples.
    2. Learning of weights can continue for multiple iterations
    3. Learning rate needs to be defined
    '''
    def fit(self, X, y):
        rgen = np.random.RandomState(self.random_state)
        self.coef_ = rgen.normal(loc=0.0, scale=0.01, size=1 + X.shape[1])
        for _ in range(self.n_iterations):
              activation_function_output = self.activation_function(self.net_input(X))
              errors = y - activation_function_output
              self.coef_[1:] = self.coef_[1:] + self.learning_rate*X.T.dot(errors)
              self.coef_[0] = self.coef_[0] + self.learning_rate*errors.sum()
     
    '''
    Net Input is sum of weighted input signals
    '''
    def net_input(self, X):
            weighted_sum = np.dot(X, self.coef_[1:]) + self.coef_[0]
            return weighted_sum
     
    '''
    Activation function is fed the net input. As the activation function is
    an identity function, the output from activation function is same as the
    input to the function.
    '''
    def activation_function(self, X):
            return X
     
    '''
    Prediction is made on the basis of output of activation function
    '''
    def predict(self, X):
        return np.where(self.activation_function(self.net_input(X)) >= 0.0, 1, 0)
     
    '''
    Model score is calculated based on comparison of
    expected value and predicted value
    '''
    def score(self, X, y):
        misclassified_data_count = 0
        for xi, target in zip(X, y):
            output = self.predict(xi)
            if(target != output):
                misclassified_data_count += 1
        total_data_count = len(X)
        self.score_ = (total_data_count - misclassified_data_count)/total_data_count
        return self.score_

calculate_classifier_accuracy(X_train, X_test, y_train, y_test, CustomAdaline())

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


{'right': 101, 'wrong': 53, 'percentage': 65.58}

# Réseau de neurones

In [9]:
from sklearn.neural_network import MLPClassifier

calculate_classifier_accuracy(X_train, X_test, y_train, y_test, MLPClassifier(solver='lbfgs', alpha=1e-5 ,hidden_layer_sizes=(5, 2), random_state=1))

{'right': 101, 'wrong': 53, 'percentage': 65.58}

# K-means

In [10]:
from sklearn.cluster import KMeans

calculate_classifier_accuracy(X_train, X_test, y_train, y_test, KMeans(n_clusters=1))

{'right': 101, 'wrong': 53, 'percentage': 65.58}

# Classification hiérarchique

In [11]:
from sklearn_hierarchical_classification.classifier import HierarchicalClassifier

calculate_classifier_accuracy(X_train, X_test, y_train, y_test, HierarchicalClassifier())

{'right': 117, 'wrong': 37, 'percentage': 75.97}

# Dbscan

In [12]:
from sklearn.cluster import DBSCAN
import numpy as np
import scipy as sp

class DBSCAN_with_predict(DBSCAN):

    def predict(self, X_new, metric=sp.spatial.distance.cosine):

        # Result is noise by default
        y_new = np.ones(shape=len(X_new), dtype=int)*-1

        # Iterate all input samples for a label
        for j, x_new in enumerate(X_new):
            # Find a core sample closer than EPS
            for i, x_core in enumerate(self.components_):
                if metric(x_new, x_core) < self.eps:
                    # Assign label of x_core to x_new
                    y_new[j] = self.labels_[self.core_sample_indices_[i]]
                    print(j,x_new,i,x_core,y_new)
                    break

        return y_new


calculate_classifier_accuracy(X_train, X_test, y_train, y_test, DBSCAN_with_predict())

{'right': 0, 'wrong': 154, 'percentage': 0.0}