In [68]:
!git clone https://github.com/NGOVANMANH/DataMiningProject.git

fatal: destination path 'DataMiningProject' already exists and is not an empty directory.


In [69]:
import os
import numpy as np
import cv2
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.mobilenet import MobileNet, preprocess_input

In [70]:
# Input image size (width, height) fed to the feature extractor, and the batch size constant
IMG_WIDTH, IMG_HEIGHT = 224, 224
BATCH = 32

In [71]:
# Load the pre-trained MobileNet (ImageNet weights) as a fixed feature extractor.
# include_top=False leaves out the top (classification) layers, and pooling='avg'
# applies global average pooling so each image yields a single feature vector.
model = MobileNet(weights='imagenet', include_top=False, pooling='avg')



In [72]:
# Load flattened grayscale images and their class labels from a directory
def load_images_from_folder(folder, class_names):
    """Read every image under ``folder/<class_name>`` as a flat grayscale vector.

    Args:
        folder: Root directory that contains one sub-folder per class.
        class_names: Names of the class sub-folders to read, in order.

    Returns:
        A ``(images, labels)`` pair of NumPy arrays. Each image row is the
        image resized to ``(IMG_WIDTH, IMG_HEIGHT)``, converted to grayscale
        and flattened; each label is the class name of the folder it came from.
        Files that ``cv2.imread`` cannot read are skipped.
    """
    images, labels = [], []
    for class_name in class_names:
        class_folder = os.path.join(folder, class_name)
        for filename in os.listdir(class_folder):
            img = cv2.imread(os.path.join(class_folder, filename))
            if img is None:
                # Not a readable image: leave it out of both arrays
                continue
            resized = cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT))
            gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
            images.append(gray.flatten())
            labels.append(class_name)  # The folder name doubles as the label
    return np.array(images), np.array(labels)

In [73]:
# Function to preprocess and extract features from an image directory in batches
def extract_features_batch(directory, batch_size=32, target_size=(IMG_WIDTH, IMG_HEIGHT)):
    """Extract feature vectors for every image under ``directory``.

    Each sub-directory of ``directory`` is treated as one class; its name is
    used as the label for every image inside it. Images are loaded at
    ``target_size``, passed through ``preprocess_input`` and fed to the
    module-level ``model`` in batches of at most ``batch_size`` images.

    Sub-directories and files are visited in ``os.listdir`` order, so the row
    order of the returned arrays follows that order.

    Args:
        directory: Root directory containing one sub-directory per class.
        batch_size: Maximum number of images per ``model.predict`` call.
        target_size: Size passed to ``load_img`` for every image.

    Returns:
        A ``(features, labels)`` pair of NumPy arrays with one row / entry
        per image.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() silently yield no batches at all
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    features = []
    labels = []
    for label in os.listdir(directory):
        label_dir = os.path.join(directory, label)
        # Stray files sitting next to the class folders are not classes;
        # listing them as directories would raise NotADirectoryError.
        if not os.path.isdir(label_dir):
            continue
        image_paths = [os.path.join(label_dir, name) for name in os.listdir(label_dir)]
        for start in range(0, len(image_paths), batch_size):
            batch_images = np.array([
                preprocess_input(img_to_array(load_img(path, target_size=target_size)))
                for path in image_paths[start:start + batch_size]
            ])
            # verbose=0 keeps Keras from printing one progress bar per batch,
            # which would otherwise flood the cell output on large datasets.
            batch_features = model.predict(batch_images, verbose=0)
            features.extend(batch_features)
            labels.extend([label] * len(batch_features))
    return np.array(features), np.array(labels)

In [74]:
# Locations of the training and test splits inside the cloned repository
train_dir, test_dir = (
    '/content/DataMiningProject/dataset/train',
    '/content/DataMiningProject/dataset/test',
)

In [75]:
# Class names. Later cells use them as sub-folder names under train_dir when
# plotting example images, and as the label order of the confusion matrix.
# NOTE(review): presumably these match the folder names the labels are read from — confirm.
class_names = ['dog', 'cat']

In [76]:
# Run both dataset splits through the feature extractor (labels come from folder names)
X_train, y_train = extract_features_batch(directory=train_dir)
X_test, y_test = extract_features_batch(directory=test_dir)



In [77]:
# Sanity check: the feature matrix and label vector must have the same number of rows
print(
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    sep="\n",
)

X_train shape: (21000, 1024)
y_train shape: (21000,)


In [None]:
# Sweep odd values of k from 1 to 101 and record the mean 10-fold
# cross-validated accuracy of a KNN classifier on the training features.
k_values = list(range(1, 102, 2))
cross_val_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(estimator=knn, X=X_train, y=y_train, scoring='accuracy', cv=10)
    cross_val_scores += [scores.mean()]

In [None]:
# Pick the k with the highest mean cross-validated accuracy
best_index = np.argmax(cross_val_scores)
optimal_k = k_values[best_index]
print(f"Giá trị k tối ưu là: {optimal_k}")

In [None]:
# Plot mean cross-validated accuracy against k
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(k_values, cross_val_scores, marker='o', linestyle='-', color='b')
ax.set_xlabel('Giá trị k')
ax.set_ylabel('Độ chính xác trung bình')
ax.set_title('Độ chính xác trung bình với các giá trị k khác nhau')
ax.set_xticks(k_values)  # one tick per evaluated k
ax.grid(True)
plt.show()

In [None]:
# Fit the final K-Nearest Neighbors classifier with the k chosen by cross-validation
k = optimal_k
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X=X_train, y=y_train)

In [None]:

# Predict the test set results with the fitted KNN classifier
# (X_test holds the features extracted from test_dir)
y_pred = knn.predict(X_test)


In [None]:
# Score the test-set predictions: overall accuracy plus the per-class report
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Draw the confusion matrix as an annotated heatmap (rows = true class, columns = predicted)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=class_names)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Plot example images from each class in the train set
num_images = 10  # Number of images to display from each class
total_classes = len(class_names)
plt.figure(figsize=(20, 10))

for i, class_name in enumerate(class_names):
    class_folder = os.path.join(train_dir, class_name)
    shown = 0  # Readable images drawn so far in this class's row

    for filename in os.listdir(class_folder):
        if shown >= num_images:
            break
        img = cv2.imread(os.path.join(class_folder, filename))
        # cv2.imread returns None for files it cannot read; passing that to
        # cv2.cvtColor would raise, so skip such files and try the next one.
        if img is None:
            continue

        plt.subplot(total_classes, num_images, i * num_images + shown + 1)
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))  # OpenCV loads BGR; matplotlib expects RGB
        plt.title(class_name if shown == 0 else "")  # Only show class name for the first image in each row
        plt.axis('off')
        shown += 1

plt.suptitle('Example Images from Each Class', fontsize=20)
plt.tight_layout(rect=[0, 0, 1, 0.95])  # Adjust layout to make room for the title
plt.show()


In [None]:
# Load the raw images (for display) from a directory with one sub-folder per class
def load_images_from_directory(directory, target_size=(128, 128)):
    """Load every image under ``directory`` together with its folder label.

    Sub-folders and files are visited in ``os.listdir`` order.

    Args:
        directory: Root directory containing one sub-folder per class.
        target_size: Size passed to ``load_img`` for every image.

    Returns:
        A ``(images, labels)`` pair of NumPy arrays; each label is the name
        of the sub-folder the corresponding image was read from.
    """
    images, labels = [], []
    for label in os.listdir(directory):
        label_dir = os.path.join(directory, label)
        for filename in os.listdir(label_dir):
            picture = load_img(os.path.join(label_dir, filename), target_size=target_size)
            images.append(picture)
            labels.append(label)
    return np.array(images), np.array(labels)

# Show a few correctly and incorrectly predicted images
def display_predictions(images, labels, predictions, title):
    """Plot up to five correct and up to five incorrect predictions.

    The figure is a 2x5 grid: the first row holds the first five samples
    whose prediction equals the true label, the second row the first five
    whose prediction differs. Each panel is titled with both labels.

    Args:
        images: Indexable collection of image arrays, aligned with ``labels``.
        labels: True label of each image.
        predictions: Predicted label of each image.
        title: Figure-level title.
    """
    correct, incorrect = [], []
    for idx, (true, pred) in enumerate(zip(labels, predictions)):
        if true == pred:
            correct.append(idx)
        if true != pred:
            incorrect.append(idx)

    plt.figure(figsize=(20, 10))  # Large canvas so the thumbnails stay legible
    plt.suptitle(title)
    for row, indices in enumerate((correct, incorrect)):
        for col, idx in enumerate(indices[:5]):
            plt.subplot(2, 5, row * 5 + col + 1)
            # Cast to uint8 so matplotlib renders the pixel values as RGB colors
            plt.imshow(images[idx].astype(np.uint8))
            plt.title(f'True: {labels[idx]}\nPred: {predictions[idx]}')
            plt.axis('off')

# Load the raw test images and show sample hits and misses.
# y_pred comes from features extracted by walking test_dir in os.listdir order,
# which is the same traversal load_images_from_directory uses.
test_images, test_labels = load_images_from_directory(directory=test_dir)
display_predictions(
    images=test_images,
    labels=test_labels,
    predictions=y_pred,
    title='Correct and Incorrect Predictions',
)

plt.show()

In [None]:
# Predict the class of a single image and report the highest class score
def test_model(image_path, model, knn, class_names, target_size=(IMG_WIDTH, IMG_HEIGHT)):
    """Classify one image file with the feature extractor + KNN pipeline.

    Args:
        image_path: Path of the image to classify.
        model: Feature extractor; its ``predict`` is called on a one-image batch.
        knn: Fitted classifier providing ``predict`` and ``predict_proba``.
        class_names: Accepted for the caller's convenience; not used in the body.
        target_size: Size passed to ``load_img``.

    Returns:
        ``(predicted_class, max_score)`` where ``predicted_class`` is the
        first element of ``knn.predict`` and ``max_score`` is the largest
        value returned by ``knn.predict_proba``.
    """
    # Load the image, preprocess it and add the leading batch axis
    pixels = preprocess_input(img_to_array(load_img(image_path, target_size=target_size)))
    batch = np.expand_dims(pixels, axis=0)

    # Extract features with the supplied model
    features = model.predict(batch)

    # Classify the feature vector with KNN
    pred = knn.predict(features)
    pred_proba = knn.predict_proba(features)

    return pred[0], np.max(pred_proba)

In [None]:
# Example usage: classify the sample dog image.
# test_model has no defaults for model, knn and class_names, so they must be
# passed explicitly; calling it with only the path raises TypeError.
image_path = '/content/DataMiningProject/samples/dog.jpg'
predicted_class, max_score = test_model(image_path, model, knn, class_names)
print(f'The predicted class for the image is: {predicted_class}')
print(f'The confidence score for the prediction is: {max_score:.2f}')


In [None]:
# Example usage: classify the sample cat image.
# test_model has no defaults for model, knn and class_names, so they must be
# passed explicitly; calling it with only the path raises TypeError.
image_path = '/content/DataMiningProject/samples/cat.jpg'
predicted_class, max_score = test_model(image_path, model, knn, class_names)
print(f'The predicted class for the image is: {predicted_class}')
print(f'The confidence score for the prediction is: {max_score:.2f}')