In [1]:
import numpy as np
import pandas as pd
import mahotas as mh
import matplotlib.pyplot as plt
import cv2 
import os
import csv
import glob
import skimage
import itertools
from scipy import stats
from scipy.stats import zscore
from skimage.feature import greycomatrix, greycoprops
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import  accuracy_score
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

# Extraction de caractéristiques (Feature extraction)

-------------------------------------------------------------------------------------------------------------------------------
- à l'aide des algorithmes de traitement d'images
_______________________________________________________________________________________________________________________________
-------------------------------------------------------------------------------------------------------------------------------

    Descripteur 1: Les moments statistiques de couleur
    Descripteur 2: La forme
    Descripteur 3 : Histogramme {espace HSV}
    Descripteur 4 : Texture ( GLCM, [Homogeneity, Correlation, Contrast, Energy])
-------------------------------------------------------------------------------------------------------------------------------
_______________________________________________________________________________________________________________________________

In [2]:
def colorf(image):
    
    '''Fonction affiche la caractéristique coleur de taille 1x6'''
    
    R = image[:,:,0]
    G = image[:,:,1]
    B = image[:,:,2]
    colorFeature = [
        np.mean(R), np.std(R),
        np.mean(G), np.std(G),
        np.mean(B), np.std(B)
    ]
    colorFeature = np.asarray(colorFeature)
    return colorFeature

In [3]:
def formef(image):
    
    '''Fonction affiche la caractéristique forme de taille 1x7 utilisé sera créé à partir des moments invariants de Hu'''
    
    #converture l'image en RGB
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # extraire les moments invariants de Hu.
    shape = cv2.HuMoments(cv2.moments(image)).flatten()
    feature = -np.sign(shape) * np.log10(np.abs(shape))
    return feature

In [4]:
def hsvHistogramf(img):
    
    '''Fonction affiche la caractéristique Histogramme de taille 1x32 pour les composantes : (Hue) le nivaux choisi est 8
    pour (saturation) est 2 et pour (value) 2'''
    
    rows, cols, numOfBands = img.shape[:]
    # Convertir de RGB image en HSV 
    img = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
    # Extraire les 3 composantes
    h = img[:,:,0]
    s = img[:,:,1]
    v = img[:,:,2]
    numberOfLevelsForH = 8 
    numberOfLevelsForS = 2 
    numberOfLevelsForV = 2
    # calcule de maximum valeur de chaque composante
    maxValueForH = np.max(h)
    maxValueForS = np.max(s)
    maxValueForV = np.max(v)
    # crér la matrice sur l'aquelle on stocke les valeurs de l'histogramme
    hsvColor_Histogram = np.zeros((8, 2, 2))
    quantizedValueForH = np.ceil( h.dot(numberOfLevelsForH) / maxValueForH)
    quantizedValueForS = np.ceil( s.dot(numberOfLevelsForS) / maxValueForS)
    quantizedValueForV = np.ceil( v.dot(numberOfLevelsForV) / maxValueForV)
    index = np.zeros((rows*cols, 3))
    index[:,0] = quantizedValueForH.reshape(1,-1).reshape(1,quantizedValueForH.shape[0] * quantizedValueForH.shape[1]) 
    index[:,1] = quantizedValueForS.reshape(1,-1).reshape(1,quantizedValueForS.shape[0] * quantizedValueForS.shape[1]) 
    index[:,2] = quantizedValueForV.reshape(1,-1).reshape(1,quantizedValueForV.shape[0] * quantizedValueForV.shape[1])
    k=0
    for row in range(len(index[:,0])):
        if index[row,0] == 0 or index[row,1] == 0 or index[row,2] == 0:
            k+=1
            continue
        hsvColor_Histogram[int(index[row,0])-1,int(index[row,1])-1,int(index[row,2])-1] = hsvColor_Histogram[int(index[row,0])-1,int(index[row,1])-1,int(index[row,2])-1] + 1
    hsvColor_Histogram = hsvColor_Histogram[:].reshape(1,-1)
    # Reshape 1*32
    return hsvColor_Histogram.reshape(-1)

In [5]:
def texturef(image):
    
    '''Fonction affiche la caractéristique texture de taille 1x4'''
    
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = skimage.img_as_ubyte(image)
    #Calcule de matrice GLCM
    glcm = greycomatrix(image, [1], [0], 256, symmetric=True, normed=True)
    # extraction des moments statistiques [Homogeneity, Correlation, Contrast, Energie.]
    feature = greycoprops(glcm, 'dissimilarity')[0]
    feature = np.concatenate([feature,greycoprops(glcm, 'correlation')[0]])
    feature = np.concatenate([feature,greycoprops(glcm, 'contrast')[0]])
    feature = np.concatenate([feature,greycoprops(glcm, 'energy')[0]])
    return feature

In [6]:
def extract_feature_from_data(folder):
    
    '''Fonction affiche la matrice des caractéristiques pour les données d'apprentissage '''
    
    train_y = []
    y = pd.read_csv(folder+'.csv')
    y.set_index('image_id',inplace=True)
    i = 0
    for filename in os.listdir(os.path.join(folder)):
        train_y.append(y.loc[filename.replace(".jpg", "")].values[0])
        
        im = cv2.imread(os.path.join(os.path.join(folder),filename))
        # l'extraction des caractéristiques
        #Descripteur 1 : mean_std compose de 6 caractéristiques
        vecteur1 = colorf(im)
        #Descripteur 2 : forme compose de 7 caractéristiques
        vecteur2 = formef(im)
        #Descripteur 3 : histogramme compse de 32 caractéristiques
        vecteur3 = hsvHistogramf(im)
        #Descriprteur 4 : texture compose de 4 caractéristiques
        vecteur4 = texturef(im)
        # Vecteur descripteur
        feature=np.hstack((vecteur1, vecteur2, vecteur3, vecteur4))
        if i==0:
            features = np.zeros(feature.shape[0])
            i=i+1
        features = np.vstack((features,feature))
    return np.delete(features, (0), axis=0),np.asarray(train_y)

# Training

_______________________________________________________________________________________________________________________________

In [7]:
# L'extraction des caractéristiques de la base d'apprentissage
train_features, train_labels = extract_feature_from_data('Training')

In [10]:
X_train, y_train = np.array(training_features), np.array(training_labels)

In [12]:
X_train.shape

(2000, 49)

In [13]:
# Normalization de matrice des caractéristiques en utilisant la méthode Z-score
scaler = preprocessing.StandardScaler()
X_train_scale = scaler.fit_transform(X_train)

# Validation

_______________________________________________________________________________________________________________________________

In [14]:
# L'extraction des caractéristiques de la base de validation
valid_features, valid_labels = extract_feature_from_data('Validation')

In [15]:
X_valid, y_valid = np.array(valid_features), np.array(valid_labels)

In [16]:
X_valid.shape

(150, 49)

In [17]:
# Normalization de matrice des caractéristiques en utilisant la méthode Z-score
X_valid_scale = scaler.fit_transform(X_valid)

-------------------------------------------------------------------------------------------------------------------------------
_______________________________________________________________________________________________________________________________

# Entrainer le modéle par différents classificateurs
-------------------------------------------------------------------------------------------------------------------------------
- à l'aide des algorithmes de DATA-MINING
-------------------------------------------------------------------------------------------------------------------------------

-------------------------------------------------------------------------------------------------------------------------------
##### K-Nearest Neighbors Classifier
-------------------------------------------------------------------------------------------------------------------------------

In [20]:
knn = KNeighborsClassifier(n_neighbors = 5, weights = 'distance')
knn.fit(X_train_scale, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='distance')

In [23]:
knn_y = knn.predict(X_valid)
print("accuracy of knn is",accuracy_score(knn_y, y_valid))
print(classification_report(knn_y, y_valid))

accuracy of knn is 0.8
              precision    recall  f1-score   support

         0.0       0.99      0.80      0.89       148
         1.0       0.03      0.50      0.06         2

    accuracy                           0.80       150
   macro avg       0.51      0.65      0.48       150
weighted avg       0.98      0.80      0.88       150



-------------------------------------------------------------------------------------------------------------------------------
##### Random forest : Modèle de Forêt d'arbres décisionnels
-------------------------------------------------------------------------------------------------------------------------------

In [24]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [27]:
rfc_y = rfc.predict(X_valid)
print("accuracy of random forrest is", accuracy_score(rfc_y, y_valid))
print(classification_report(rfc_y, y_valid))

accuracy of random forrest is 0.8133333333333334
              precision    recall  f1-score   support

         0.0       0.98      0.82      0.89       144
         1.0       0.13      0.67      0.22         6

    accuracy                           0.81       150
   macro avg       0.56      0.74      0.56       150
weighted avg       0.95      0.81      0.87       150



-------------------------------------------------------------------------------------------------------------------------------
##### Support vector machine (SVM)
-------------------------------------------------------------------------------------------------------------------------------

In [40]:
svm = SVC(kernel='linear', max_iter=100000, gamma=0.001)
svm.fit(X_train_scale, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='linear',
    max_iter=100000, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [41]:
svm_y = svm.predict(X_valid)
print("accuracy of svm is",accuracy_score(svm_y, y_valid))
print(classification_report(svm_y, y_valid))

accuracy of svm is 0.7933333333333333
              precision    recall  f1-score   support

         0.0       0.99      0.80      0.88       149
         1.0       0.00      0.00      0.00         1

    accuracy                           0.79       150
   macro avg       0.50      0.40      0.44       150
weighted avg       0.99      0.79      0.88       150



-------------------------------------------------------------------------------------------------------------------------------
##### Naive Bayes classifiers
-------------------------------------------------------------------------------------------------------------------------------

In [31]:
nby = MultinomialNB()
nby.fit(abs(X_train_scale), y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [33]:
nby_y = nby.predict(X_valid)
print("accuracy of naive_bayes is",accuracy_score(nby_y, y_valid))
print(classification_report(nby_y, y_valid))

accuracy of naive_bayes is 0.7866666666666666
              precision    recall  f1-score   support

         0.0       0.95      0.81      0.88       140
         1.0       0.13      0.40      0.20        10

    accuracy                           0.79       150
   macro avg       0.54      0.61      0.54       150
weighted avg       0.90      0.79      0.83       150



# Test
_______________________________________________________________________________________________________________________________

In [42]:
def load_features_from_folder_for_test(folder):
    i=0

    for filename in os.listdir(os.path.join(folder)):
        
        im = cv2.imread(os.path.join(folder,filename))
        # l'extraction des caractéristiques
        #Descripteur 1 : mean_std compose de 6 caractéristiques
        vecteur1 = colorf(im)
        #Descripteur 2 : forme compose de 7 caractéristiques
        vecteur2 = formef(im)
        #Descripteur 3 : histogramme compse de 32 caractéristiques
        vecteur3 = hsvHistogramf(im)
        #Descriprteur 4 : texture compose de 4 caractéristiques
        vecteur4 = texturef(im)
        # Vecteur descripteur
        feature=np.hstack((vecteur1, vecteur2, vecteur3, vecteur4))
        #feature=np.hstack((shape))
        #feature=np.hstack((shape,texture))
        if i==0:
            features = np.zeros(feature.shape[0])
            i=i+1
        features = np.vstack((features,feature))

    return np.delete(features, (0), axis=0)

In [43]:
X_test = load_features_from_folder_for_test("Test")

In [44]:
X_test.shape

(600, 49)

-------------------------------------------------------------------------------------------------------------------------------
Puisque j'ai obtenu une mielleur accuracy à l'aide de classifeur forêt aléatoire je le utilise pour tester mon le Test.
--------------------------------------------------------------------------------------------------------------------------------------
_______________________________________________________________________________________________________________________________

In [45]:
test_rfc = rfc.predict(X_test)

In [46]:
test_rfc

array([0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [47]:
names = [name for name in os.listdir('Test')]
test_yy = np.column_stack((names, test_rfc))
test_yy = test_yy.astype(object)

In [48]:
np.savetxt("Result_using_randforest.csv", test_yy, delimiter = ',', header = "image_id,melanoma", comments = '', fmt = '%5s')

# Course Image-Mining
-------------------------------------------------------------------------------------------------------------------------------
    Une compétition prépare à la classe.
    
    Kaggle--Cancer_Melanome
-------------------------------------------------------------------------------------------------------------------------------
_______________________________________________________________________________________________________________________________