In [1]:
#get rid of annoying GPU warnings (and others)
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import cv2
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm, metrics, datasets
from sklearn.utils import Bunch
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

In [2]:
def load_image_files(container_path, dimension=(128,128)):
    image_dir = Path(container_path)
    folders = [directory for directory in image_dir.iterdir() if directory.is_dir()]
    categories = [fo.name for fo in folders]

    descr = "Your own dataset"
    images = []
    flat_data = []
    target = []
    for i, direc in enumerate(folders):
        for file in direc.iterdir():
            if file.suffix.lower() not in ['.jpg', '.jpeg', '.png','.webp','.gif']:
                print(f"Skipped non-image file: {file}")
                continue
            img = cv2.imread(str(file), cv2.IMREAD_COLOR)
            if img is None:
                print(f"Failed to read image: {file}")
                continue
            # img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # Convert BGR to RGB format
            img_resized = cv2.resize(img, dimension, interpolation=cv2.INTER_AREA)
            
            if img_resized.size == 0:
                print(f"Empty image: {file}")
                continue
            flat_data.append(img_resized.flatten())
            images.append(img_resized)
            target.append(i)
    flat_data = np.array(flat_data)
    target = np.array(target)
    images = np.array(images)

   # Perform data augmentation
    # augmented_data_bunch = augment_images(images, target)

    # Return both the original and augmented data
    return Bunch(
        data=flat_data,
        target=target,
        target_names=categories,
        images=images,
        DESCR=descr
    )

In [3]:
image_dataset = load_image_files("/kaggle/input/soil-data-v3/Soil_Data_V3/Trains")
image_dataset_test = load_image_files("/kaggle/input/soil-data-v3/Soil_Data_V3/Tests")

Skipped non-image file: /kaggle/input/soil-data-v3/Soil_Data_V3/Trains/Mary/desktop.ini
Skipped non-image file: /kaggle/input/soil-data-v3/Soil_Data_V3/Trains/Sand/desktop.ini
Skipped non-image file: /kaggle/input/soil-data-v3/Soil_Data_V3/Trains/Silt/desktop.ini
Skipped non-image file: /kaggle/input/soil-data-v3/Soil_Data_V3/Tests/Chalky/desktop.ini


In [10]:
all_X = np.concatenate((image_dataset.images, image_dataset_test.images), axis =0)
all_Y =np.concatenate((image_dataset.target, image_dataset_test.target), axis =0)

In [12]:
all_X=all_X.reshape(-1,128*128*3)
print(all_X.shape)
# # all_images_test=image_dataset_test.images.reshape(-1,30*30*3)
# print(all_test.shape)

(5254, 49152)


In [13]:
from sklearn.preprocessing import MinMaxScaler
X = MinMaxScaler().fit_transform(all_X)
y = all_Y

In [14]:
rseed = 42

In [15]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
def feature_gridsearch(model):
    rseed = 42
    print(model)
    feature_extractors = ['PCA', 'LDA', 'None']
    lda_n_comp = [4, 5, 6, 7]
    pca_n_comp =  [150, 200, 500]
    lda = LDA()
    pca = PCA()
    output_model = []
    
    for feature_extractor in feature_extractors: 
        if feature_extractor == "PCA":
            print('PCA')
            output_pca = []
            for n_comp in pca_n_comp:
                print(n_comp)
                y = all_Y
                X = MinMaxScaler().fit_transform(all_X)
                pca = PCA(n_components = n_comp)
                X = pca.fit(X, y).transform(X)
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=rseed)
                accuracy_pca = cross_val_score(model, X_train, y_train, scoring='accuracy', cv =5, n_jobs=-1)
                print("Acc: ", accuracy_pca)
                print("Mean: ", accuracy_pca.mean()*100)
                print("Std: ", accuracy_pca.std()*100)
                output_pca.append(accuracy_pca)
            output_model.append(output_pca)
            print("Output: ", output_pca)
        elif feature_extractor == "LDA":
            print("LDA")
            output_lda = []
            for n_comp in lda_n_comp: 
                print(n_comp)
                y = all_Y
                X = MinMaxScaler().fit_transform(all_X)
                lda = LDA(n_components = n_comp)
                X = lda.fit(X, y).transform(X)
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=rseed)
                accuracy_lda = cross_val_score(model, X_train, y_train, scoring='accuracy', cv =5, n_jobs=-1)
                print("Acc: ", accuracy_lda)
                print("Mean: ", accuracy_lda.mean()*100)
                print("STD: ", accuracy_lda.std()*100)
                output_lda.append(accuracy_lda)
            output_model.append(output_pca)
        else: #None
            print('None')
            y = all_Y
            X = MinMaxScaler().fit_transform(all_Y)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=rseed)
            accuracy_none = cross_val_score(model, X_train, y_train, scoring='accuracy', cv =5, n_jobs=-1)
            print(accuracy_none)
            print(accuracy_none.mean()*100)
            print(accuracy_none.std()*100)
            output_model.append(accuracy_none)
    return output_model

In [None]:
from sklearn.svm import SVC
svm = SVC(random_state=rseed, C=10, gamma='scale',kernel='rbf',class_weight='balanced' )
pca_lda_tuning = feature_gridsearch(svm)
print(pca_lda_tuning)

SVC(C=10, class_weight='balanced', random_state=42)
PCA
150
Acc:  [0.79891304 0.83016304 0.8244898  0.80952381 0.8122449 ]
Mean:  81.50669180715762
Std:  1.110244246527338
200
Acc:  [0.80434783 0.82880435 0.83673469 0.80544218 0.8244898 ]
Mean:  81.99637681159422
Std:  1.2920054070523683
500
Acc:  [0.8423913  0.83695652 0.8462585  0.81904762 0.83945578]
Mean:  83.68219461697723
Std:  0.9412207721450423
Output:  [array([0.79891304, 0.83016304, 0.8244898 , 0.80952381, 0.8122449 ]), array([0.80434783, 0.82880435, 0.83673469, 0.80544218, 0.8244898 ]), array([0.8423913 , 0.83695652, 0.8462585 , 0.81904762, 0.83945578])]
LDA
4
Acc:  [0.97554348 0.98369565 0.99591837 0.99319728 0.98639456]
Mean:  98.69498669032829
STD:  0.7217591572681334
5
Acc:  [0.97690217 0.98369565 0.99727891 0.99319728 0.98639456]
Mean:  98.74937148772553
STD:  0.7156743441380681
6
