<a href="https://colab.research.google.com/github/MyraLugwiri/group-assignment1/blob/main/MachineLearning_GroupAssignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Metal Surface Defects

This dataset was downloaded from NEU Metal Surface Defects Database which contains six kinds of typical surface defects of the hot-rolled steel strip are collected, i.e., rolled-in scale (RS), patches (Pa), crazing (Cr), pitted surface (PS), inclusion (In) and scratches (Sc). The database includes 1,800 grayscale images: 300 samples each of six different kinds of typical surface defects.

The dataset was selected because it focuses on

Data Source Url = https://www.kaggle.com/datasets/fantacher/neu-metal-surface-defects-data

In [2]:
! python --version

Python 3.10.12


In [None]:
!pip install --upgrade scikit-learn



In [None]:
# importing the necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16, preprocess_input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Data Preprocessing

In [None]:
import os
# load the dataset
train_path = '/content/drive/MyDrive/Machine_Learning/assignments/GroupAssignment/NEU Metal Surface Defects Data/train/'
test_path = '/content/drive/MyDrive/Machine_Learning/assignments/GroupAssignment/NEU Metal Surface Defects Data/test/'


In [None]:
defect_classes = os.listdir(test_path)
for i, defect_type in enumerate(defect_classes) :
    defect_path = os.path.join(test_path , defect_type)
    label = defect_type
    print(defect_type)
    # for image_file  in os.listdir(defect_path):
    #   print(image_file)

Scratches
Crazing
Rolled
Pitted
Inclusion
Patches


In [None]:
# the preprocessing pipeline for training data
def preprocess_data(data_classes, file_size):
  """
  preprocesses the data and returns the train data and
  the associated labels

  """
  train_data = []
  data_labels = []
  # Travesing the data file paths to access
  for defect_type in data_classes:
    defect_path = os.path.join(train_path, defect_type)
    label = defect_type

    for image_file  in os.listdir(defect_path):
      img_path = os.path.join(defect_path, image_file)
      the_img = image.load_img(img_path, target_size=file_size) # loading the images into PIL format and convert the images from BMP to RGB
      the_img_array = image.img_to_array(the_img)
      the_img_array = np.expand_dims(the_img_array, axis=0)
      the_img_array = preprocess_input(the_img_array)
      the_img_array = the_img_array / 255.0

      # appending the preprocessed images to the list
      train_data.append(the_img_array)
      # appending the labels of the data to the list
      data_labels.append(label)

  return np.vstack(train_data), np.array(data_labels)
file_size = (224, 224) # 224 because VGG16 deals with images that are of the size 224
train_data_path = os.listdir(train_path)
X_train, Y_train = preprocess_data(train_data_path, file_size)




In [None]:
# preprocessing pipeline for test data
def preprocess_test_data(test_data_path, file_size):
  """
  preprocesses the data and returns the test data and
  the associated labels

  """
  test_data = []
  test_data_labels = []
  # Travesing the data file paths to access
  for defect_type in test_data_path:
    defect_path = os.path.join(test_path, defect_type)
    label = defect_type

    for image_file  in os.listdir(defect_path):

      img_path = os.path.join(defect_path, image_file)
      the_img = image.load_img(img_path, target_size=file_size) # loading the images into PIL format and convert the images from BMP to RGB
      test_array = image.img_to_array(the_img)
      test_array = np.expand_dims(test_array, axis=0)
      test_array = preprocess_input(test_array)
      test_array = test_array / 255.0

      # appending the preprocessed images to the list
      test_data.append(test_array)
      # appending the labels of the data to the list
      test_data_labels.append(label)

  return np.vstack(test_data), np.array(test_data_labels)
test_data_path = os.listdir(test_path)
X_test, Y_test = preprocess_test_data(test_data_path, file_size)

In [None]:
# Encoding the test labels
labelling = LabelEncoder()
# Y_test =  Y_test.reshape(-1,  1)
labelling.fit(Y_test)
Y_test = labelling.fit_transform(Y_test)
# Y_train  = Y_train.reshape(-1, 1)
labelling.fit(Y_train)
Y_train = labelling.fit_transform(Y_train)

### Feature Extraction

In [None]:
#loading the VGG16 Model
VGG_model= VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# ensuring the VGG16 layers are untrainable
for layer in VGG_model.layers:
  layer.trainable = False
# checking if the trainable layers are 0
VGG_model.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [None]:
# Extracting features for X_train
extract_features = VGG_model.predict(X_train)



In [None]:
# reshaping the x_train features extracted from VGG16
features = extract_features.reshape(extract_features.shape[0], -1)
# new_X_train = extract_features

In [None]:
# extracting features for X_test
xtest_extract_features = VGG_model.predict(X_test)
x_test_features = xtest_extract_features.reshape(xtest_extract_features.shape[0], -1)



### Testing out Classification Model
> 4 classification models will be tested to identify the best performing one which will then be saved to be used later

The following machine learning classifiers will be tested to find the best classifier model:


*   Support Vector Machine
*   Random Forest Classifier
*   Naive Bayes(GaussianNB)
*   Logistic Classifier





In [None]:
# importing all the necessary classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# creating a list of the models
# models = [GaussianNB(), RandomForestClassifier(), LogisticRegression(max_iter=1000), SVC(probability=True)]
models = [RandomForestClassifier()]

# defining a for loop that will loop through the models to train and calculate accuracy and loss
column_labels = ['classifier', 'Accuracy']
loss_calculated = pd.DataFrame(columns=column_labels)

for clf in models:
  name = clf.__class__.__name__
  clf.fit(features, Y_train)

  print('='*30)
  print(name)
  print('****Results****')
  predictions = clf.predict(x_test_features)
  accuracy_s = accuracy_score(Y_test, predictions)
  print('Accuracy Score: {:.4f}'.format(accuracy_s*100))
  predictions = labelling.inverse_transform(predictions)

  calculated_metrics = pd.DataFrame([[name, accuracy_s*100]], columns=column_labels)
  loss_calculated = pd.concat([loss_calculated, calculated_metrics], ignore_index=True)


RandomForestClassifier
****Results****
Accuracy Score: 100.0000
The prediction is: ['Scratches' 'Scratches' 'Scratches' 'Scratches' 'Scratches' 'Scratches'
 'Scratches' 'Scratches' 'Scratches' 'Scratches' 'Scratches' 'Scratches'
 'Crazing' 'Crazing' 'Crazing' 'Crazing' 'Crazing' 'Crazing' 'Crazing'
 'Crazing' 'Crazing' 'Crazing' 'Crazing' 'Crazing' 'Rolled' 'Rolled'
 'Rolled' 'Rolled' 'Rolled' 'Rolled' 'Rolled' 'Rolled' 'Rolled' 'Rolled'
 'Rolled' 'Rolled' 'Pitted' 'Pitted' 'Pitted' 'Pitted' 'Pitted' 'Pitted'
 'Pitted' 'Pitted' 'Pitted' 'Pitted' 'Pitted' 'Pitted' 'Inclusion'
 'Inclusion' 'Inclusion' 'Inclusion' 'Inclusion' 'Inclusion' 'Inclusion'
 'Inclusion' 'Inclusion' 'Inclusion' 'Inclusion' 'Inclusion' 'Patches'
 'Patches' 'Patches' 'Patches' 'Patches' 'Patches' 'Patches' 'Patches'
 'Patches' 'Patches' 'Patches' 'Patches']


The selected model is RandomForestClassifier because it allows the calculations of the probabilities of the presence of any of
the 6 defects present in an image

In [None]:
# importing joblib so that we can save the trained LogisticRegression Model
import joblib
joblib.dump(clf, 'rf_classifier.sav')

['rf_classifier.sav']

In [None]:
# saving the labelEncoder
joblib.dump(labelling, 'label_encoder.sav')


['label_encoder.sav']