# TAF MCE - UE Machine Learning
## Project - Unit Test

Authors: Kévin Ferreira, Emma Bonnem, Elias Tranchant
Year: 2021-2022

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Import project.py
# Run project.py inside this notebook's namespace so that its functions and
# imports become available to the cells below.
# - The `with` block closes the file handle deterministically (the bare
#   open(...).read() left that to the garbage collector).
# - UTF-8 is requested explicitly so decoding does not depend on the OS locale.
# - compile() is given the real filename so tracebacks point at project.py
#   instead of "<string>".
with open("./project.py", encoding="utf-8") as project_file:
    exec(compile(project_file.read(), "./project.py", "exec"))

# 1. Test import_dataset

We start by importing our datasets.

In [3]:
# Load the banknote dataset; the second argument supplies the column names
# (four image features followed by the class label).
data_banknote_authentication = import_dataset(
    'data_banknote_authentication.txt',
    [
        "Variance of Wavelet Transformed image",
        "Skewness of Wavelet Transformed image",
        "Curtosis of Wavelet Transformed image",
        "Entropy of image",
        "classification",
    ],
)
data_banknote_authentication

Unnamed: 0,Variance of Wavelet Transformed image,Skewness of Wavelet Transformed image,Curtosis of Wavelet Transformed image,Entropy of image,classification
0,3.62160,8.66610,-2.8073,-0.44699,0
1,4.54590,8.16740,-2.4586,-1.46210,0
2,3.86600,-2.63830,1.9242,0.10645,0
3,3.45660,9.52280,-4.0112,-3.59440,0
4,0.32924,-4.45520,4.5718,-0.98880,0
...,...,...,...,...,...
1367,0.40614,1.34920,-1.4501,-0.55949,1
1368,-1.38870,-4.87730,6.4774,0.34179,1
1369,-3.75030,-13.45860,17.5932,-2.77710,1
1370,-3.56370,-8.38270,12.3930,-1.28230,1


In [4]:
# Load the kidney disease dataset; unlike the banknote call, no column names are passed here.
data_kidney_disease = import_dataset('kidney_disease.csv')
data_kidney_disease

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd


For the following parts, we will use the data_banknote_authentication dataset (data_kidney_disease can be selected instead in the next code cell).

# 2. replace_missing_values

We replace the missing values and separate the feature columns (X) from the classification column (y).

In [5]:
# Dataset used by every following cell. To run the notebook on the kidney
# disease data instead, uncomment the first assignment and comment out the second.
# used_dataset = data_kidney_disease
used_dataset = data_banknote_authentication

In [6]:
# Split the selected dataset into the feature table X and the label column y.
X, y = replace_missing_values(used_dataset)

In [7]:
# Display the feature table returned by replace_missing_values.
X

Unnamed: 0,Variance of Wavelet Transformed image,Skewness of Wavelet Transformed image,Curtosis of Wavelet Transformed image,Entropy of image
0,3.62160,8.66610,-2.8073,-0.44699
1,4.54590,8.16740,-2.4586,-1.46210
2,3.86600,-2.63830,1.9242,0.10645
3,3.45660,9.52280,-4.0112,-3.59440
4,0.32924,-4.45520,4.5718,-0.98880
...,...,...,...,...
1367,0.40614,1.34920,-1.4501,-0.55949
1368,-1.38870,-4.87730,6.4774,0.34179
1369,-3.75030,-13.45860,17.5932,-2.77710
1370,-3.56370,-8.38270,12.3930,-1.28230


In [8]:
# Display the classification labels returned by replace_missing_values.
y

0       0
1       0
2       0
3       0
4       0
       ..
1367    1
1368    1
1369    1
1370    1
1371    1
Name: classification, Length: 1372, dtype: int64

# 3. feature_selection

We select features based on their variance. If the variance of a column is below the defined value *var_thrs* then it is removed. We also make sure to get rid of the "id" column, useless for training and prediction.

In [9]:
# Variance threshold handed to feature_selection.
var_thrs = 0.05
# Note the argument order: threshold first, then the feature table.
X_selected = feature_selection(var_thrs, X)

In [10]:
# Compare the number of columns before and after feature_selection.
print(f"Number of features before selection : {np.shape(X)[1]}")
print(f"Number of features after selection  : {np.shape(X_selected)[1]}")

Number of features before selection : 4
Number of features after selection  : 4


In [11]:
# Display the output of feature_selection.
X_selected

array([[  3.6216 ,   8.6661 ,  -2.8073 ,  -0.44699],
       [  4.5459 ,   8.1674 ,  -2.4586 ,  -1.4621 ],
       [  3.866  ,  -2.6383 ,   1.9242 ,   0.10645],
       ...,
       [ -3.7503 , -13.4586 ,  17.5932 ,  -2.7771 ],
       [ -3.5637 ,  -8.3827 ,  12.393  ,  -1.2823 ],
       [ -2.5419 ,  -0.65804,   2.6842 ,   1.1952 ]])

# 4. center_normalize

To get more accurate results, we make sure that the mean of each column is equal to 0 and the variance is 1.

In [12]:
# Center and scale the selected features; the per-column mean and variance are printed in a later cell.
X_normalized = center_normalize(X_selected)

In [13]:
# Print the shape of the normalized data as a quick sanity check.
print(f"Shape of normalized data : {np.shape(X_normalized)}")

Shape of normalized data : (1372, 4)


In [14]:
# Per-column (mean, variance) pairs. Iterating over the transpose yields one
# column of X_normalized at a time.
column_stats = np.array([(column.mean(), column.var()) for column in X_normalized.T])

# This is a unit-test notebook: fail loudly if center_normalize did not produce
# zero-mean / unit-variance columns, instead of relying on a visual check of
# the rounded values displayed below.
np.testing.assert_allclose(column_stats[:, 0], 0.0, atol=1e-6, err_msg="column means are not 0")
np.testing.assert_allclose(column_stats[:, 1], 1.0, atol=1e-6, err_msg="column variances are not 1")

print("Esperance and variance for each column :")
np.around(column_stats, decimals=2)

Esperance and variance for each column :


array([[ 0.,  1.],
       [ 0.,  1.],
       [ 0.,  1.],
       [-0.,  1.]])

In [15]:
# Display the output of center_normalize.
X_normalized

array([[ 1.12180565,  1.14945512, -0.97597007,  0.35456135],
       [ 1.44706568,  1.06445293, -0.89503626, -0.12876744],
       [ 1.20780971, -0.77735215,  0.12221838,  0.61807317],
       ...,
       [-1.47235682, -2.62164576,  3.75901744, -0.75488418],
       [-1.40669251, -1.75647104,  2.552043  , -0.04315848],
       [-1.04712236, -0.43982168,  0.29861555,  1.1364645 ]])

# 5. split_training_test

We then split the dataset into a training set and a testing set.

In [16]:
train_size = 0.5 # Fraction (0-1, not a percentage) of the whole data assigned to the training set
X_train, X_test, y_train, y_test = split_training_test(X_normalized, y,train_size)

In [17]:
# Print the feature-matrix shapes so the train/test sizes can be compared with the original.
print(f"Shape of the original data : {np.shape(X_normalized)}")
print(f"Shape of the training data : {np.shape(X_train)}")
print(f"Shape of the testing data  : {np.shape(X_test)}")

Shape of the original data : (1372, 4)
Shape of the training data : (686, 4)
Shape of the testing data  : (686, 4)


In [18]:
# Same check for the label vectors.
print(f"Shape of the original class : {np.shape(y)}")
print(f"Shape of the training class : {np.shape(y_train)}")
print(f"Shape of the testing class  : {np.shape(y_test)}")

Shape of the original class : (1372,)
Shape of the training class : (686,)
Shape of the testing class  : (686,)


# 6. cross_validation

Next, we find the cross-validation score for each estimator.

In [19]:
# Fixed seed for every estimator that draws random numbers, so that re-running
# the notebook yields the same scores instead of slightly different ones each time.
RANDOM_STATE = 42

# Candidate classifiers keyed by the display name used in the result printouts.
# NOTE(review): the classes are assumed to be the scikit-learn estimators made
# available by project.py; GaussianNB and KNeighborsClassifier accept no
# random_state argument, so they are left with their defaults.
estimators_dict = {
    "Linear SVC": LinearSVC(random_state=RANDOM_STATE),
    "Naive Bayes": GaussianNB(),
    "SGD Classifier": SGDClassifier(random_state=RANDOM_STATE),
    "KNeighbors Classifier": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
}

In [20]:
# Cross-validation result of every estimator on the training split, keyed by
# estimator name (same key order as estimators_dict).
cross_dict = {
    esti_name: cross_validation(esti, X_train, y_train)
    for esti_name, esti in estimators_dict.items()
}

In [21]:
# Report the cross-validation scores, one section per estimator name.
for estimator, cv_scores in cross_dict.items():
    print(f"{estimator} :")
    print(f"Cross-validation score : {cv_scores}")
    print("\n=====================================================\n")

Linear SVC :
Cross-validation score : [0.97816594 0.98253275 0.99122807]


Naive Bayes :
Cross-validation score : [0.83406114 0.79912664 0.84649123]


SGD Classifier :
Cross-validation score : [0.95633188 0.98253275 0.98245614]


KNeighbors Classifier :
Cross-validation score : [1. 1. 1.]


Random Forest :
Cross-validation score : [0.99563319 0.97816594 0.99561404]




# 7. train_validate_model

Finally, we can train our model and output the results.

In [22]:
# Name of the single estimator evaluated in this cell (a key of estimators_dict).
estimator_name = "Linear SVC"

# Cross-validation on the training split only.
cross_val_score_value = cross_validation(estimators_dict[estimator_name], X_train, y_train)
# train_validate_model receives both splits and returns the predictions, an
# accuracy score and cm (printed as the confusion matrix in a later cell).
y_pred, model_accuracy_score, cm = train_validate_model(estimators_dict[estimator_name], X_train, y_train, X_test, y_test)
# Bundle the four results under the estimator name.
ret_solo = {estimator_name: (y_pred, model_accuracy_score, cm, cross_val_score_value)}

In [23]:
# Display the bundled results for the single estimator.
ret_solo

{'Linear SVC': (array([1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1,
         1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
         0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
         1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
         1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
         0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0,
         1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0,
         1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0,
         0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
         1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
         1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
         1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
         1,

In [24]:
# For every estimator: cross-validate on the training split, then call
# train_validate_model with both splits, and store the four results as a tuple
# (predictions, accuracy score, cm, cross-validation scores) under its name.
results_dict = {}
for esti_name, esti in estimators_dict.items():
    cross_val_score_value = cross_validation(esti, X_train, y_train)
    y_pred, model_accuracy_score, cm = train_validate_model(
        esti, X_train, y_train, X_test, y_test
    )
    results_dict[esti_name] = (y_pred, model_accuracy_score, cm, cross_val_score_value)
ret_all = results_dict

In [25]:
# Report every estimator's results. Each stored tuple is unpacked into named
# parts instead of being indexed by position.
for estimator, (predictions, accuracy, confusion, cv_scores) in results_dict.items():
    print(f"{estimator} :")
    print(predictions)
    print(f"Score of model accuracy : {accuracy}")
    print(f"Confusion matrix :\n{confusion}")
    print(f"Cross-validation score : {cv_scores}")
    print("\n=====================================================\n")

Linear SVC :
[1 1 1 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 1 1 1 1 1 0
 0 1 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 1 0 0 0 1
 1 0 1 1 0 1 1 0 0 1 0 0 1 1 1 1 0 1 0 1 1 1 1 1 0 0 1 0 0 0 1 1 1 1 1 0 1
 0 0 1 0 1 1 0 1 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 0 1 1 0 1 1 1 1 0
 1 1 1 1 0 0 1 0 0 1 0 1 0 1 0 1 1 0 1 0 1 0 0 0 1 1 1 0 1 1 0 1 0 1 1 1 1
 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 1 1 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 0 1 0
 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 1 1 0 0 0 1 1 0 1 1 0 0 1 0 0 1 0 1 0 1 1 1
 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 0 1 0 1 1 0 0 0 0 1 1 1 0
 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 1 1 0
 0 1 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 1 0 0 1 1
 0 1 1 1 0 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 1 0 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 0 1 1 1 1 1 1 1 0 1 1 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 1 0 1 1 0 1 1 1 0 0
 1 1 0 0 0 1 0 0 1 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 0 0 1 1 0 0 0 1 0 0 0 1
 1 1 0 1 1 1