In [17]:
#Importing all the required packages
import os
import numpy as np
from skimage import io, color, feature, transform
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression


In [18]:
# Paths to the images
cancer_folder = "C:/Users/SERVER/Desktop/ImageClassification/Lung_datasSet/lung_aca"
no_cancer_folder = "C:/Users/SERVER/Desktop/ImageClassification/Lung_datasSet/lung_n"
cancer_scc_folder = "C:/Users/SERVER/Desktop/ImageClassification/Lung_datasSet/lung_scc"

#File type accesptable
valid_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']


In [19]:
cancer_images =[io.imread(os.path.join(cancer_folder, image_path))
               for image_path in os.listdir(cancer_folder)
               if os.path.splitext(image_path)[1].lower() in valid_extensions]

no_cancer_images =[io.imread(os.path.join(no_cancer_folder, image_path))
               for image_path in os.listdir(no_cancer_folder)
               if os.path.splitext(image_path)[1].lower() in valid_extensions]

cancer_scc_images =[io.imread(os.path.join(cancer_scc_folder, image_path))
               for image_path in os.listdir(cancer_scc_folder)
               if os.path.splitext(image_path)[1].lower() in valid_extensions]

In [20]:
#hog feature extraction function
def extract_features(image):
    image_resized = transform.resize(image, (128,128))
    gray = color.rgb2gray(image_resized)
    hog_features = feature.hog(gray, pixels_per_cell =(8,8), cells_per_block=(2,2))
    return hog_features

In [21]:
#feature extraction using GLCM
def extract_glcm_features(image):
    if len(image.shape) == 3:
        gray = color.rgb2gray(image)
    else:
        gray = image
    image_8bit = (gray*255).astype(np.uint8)
    glcm = graycomatrix(image_8bit, [1], [0], 256, symmetric = True, normed = True)
    contrast = graycoprops(glcm, 'contrast')[0,0]
    homogeneity = graycoprops(glcm, 'homogeneity')[0,0]
    dissimilarity = graycoprops(glcm, 'dissimilarity')[0,0]
    energy = graycoprops(glcm, 'energy')[0,0]
    correlation = graycoprops(glcm, 'correlation')[0,0]
    return [contrast, dissimilarity, homogeneity, energy, correlation]

In [22]:
#feature extraction using LBP
def extract_lbp_features(image):
    gray = color.rgb2gray(image)
    lbp = local_binary_pattern(gray, P = 8, R = 1, method = 'uniform')
    lbp_hist, _=np.histogram(lbp, bins = np.arange(0, lbp.max() + 1))
    lbp_hist = lbp_hist.astype(float)
    lbp_hist /=(lbp_hist.sum() + 1e-6)
    return lbp_hist

In [23]:
#extracting color using color histogram
def extract_color_histogram(image, bins = 256):
    if len(image.shape) == 2:
        hist = np.histogram(image, bins = bins, range=(0,256))[0]
    elif len(image.shape) == 3 and image.shape[2] == 3:
        hist_r = np.histogram(image[:,:,0], bins=bins, range=(0,256))[0]
        hist_g = np.histogram(image[:,:,0], bins=bins, range=(0,256))[0]
        hist_b = np.histogram(image[:,:,0], bins=bins, range=(0,256))[0]
        hist = np.concatenate((hist_r,hist_g,hist_b))
    else:
        raise ValueError("Unsupport image format")
    return hist
        

In [24]:
# function to combine both hog and glcm and lbp feature functions
def combined_features(image):
    hog_feat = extract_features(image)
    glcm_feat = extract_glcm_features(image)
    lbp_feat = extract_lbp_features(image)
    color_hist_feat = extract_color_histogram(image)
    return np.concatenate([hog_feat, glcm_feat, lbp_feat, color_hist_feat])

In [25]:
cancer_combined_features = np.array([combined_features(img) for img in cancer_images])
no_cancer_combined_features = np.array([combined_features(img) for img in no_cancer_images])
cancer_scc_combined_features = np.array([combined_features(img) for img in cancer_scc_images])



In [26]:
# Data preparation
X = np.vstack((cancer_combined_features, no_cancer_combined_features, cancer_scc_combined_features))

y = np.hstack((
    np.ones(cancer_combined_features.shape[0]),        # Label 1 for lung adenocarcinoma (ACA)
    -1 * np.ones(no_cancer_combined_features.shape[0]), # Label -1 for normal lung tissue
    2 * np.ones(cancer_scc_combined_features.shape[0])  # Label 2 for lung squamous cell carcinoma (SCC)
))


In [27]:
#Splitting the data into and train test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0) 

In [28]:
logistic_classifier = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
logistic_classifier.fit(X_train, y_train)
y_pred = logistic_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy*100:.2f}%')

Accuracy: 80.00%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
# function to predict the validation images
def predict_image(image_path, classifier, feature_extractor):
    img = io.imread(image_path)
    features = feature_extractor(img)
    prediction = classifier.predict([features])
    return prediction[0]

In [30]:
# looking the images from the validation data
check_folder = "C:/Users/SERVER/Desktop/ImageClassification/Lung_datasSet/check"
for image_name in os.listdir(check_folder):
    if os.path.splitext(image_name)[1].lower() in valid_extensions:
        image_path = os.path.join(check_folder,image_name)
        predicted_class = predict_image(image_path, logistic_classifier, combined_features)

        if predicted_class == 1:
            print(f"The image {image_name} is cancerous.")
        elif predicted_class == 2:
            print(f"The image {image_name} is scc cancerous.")
        else:
            print(f"The image {image_name} is non cancerous.")
            




The image lungaca1.jpeg is cancerous.




The image lungaca2.jpeg is cancerous.




The image lungaca3.jpeg is cancerous.




The image lungaca4.jpeg is cancerous.




The image lungaca5.jpeg is cancerous.




The image lungn1.jpeg is non cancerous.




The image lungn2.jpeg is non cancerous.




The image lungn3.jpeg is non cancerous.




The image lungn4.jpeg is non cancerous.




The image lungn5.jpeg is non cancerous.




The image lungscc1.jpeg is scc cancerous.




The image lungscc2.jpeg is scc cancerous.




The image lungscc3.jpeg is scc cancerous.




The image lungscc4.jpeg is scc cancerous.
The image lungscc5.jpeg is scc cancerous.




In [31]:
# splitting the data into k folds and then feeding them to the model to avoid over fitting
from sklearn.model_selection import StratifiedKFold
k = 5
skf = StratifiedKFold(n_splits = k, shuffle = True, random_state = 5)

accuracies = []

for train_index, test_index in skf.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    logistic_classifier = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
    logistic_classifier.fit(X_train, y_train)
    y_pred = logistic_classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f'Accuracy: {accuracy*100:.2f}%')
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 68.33%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 71.67%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 73.33%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 78.33%
Accuracy: 78.33%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
average_accuracy = np.mean(accuracies)
print(f'Average accuracy over {k} folds:{average_accuracy * 100:.2f}%') 

Average accuracy over 5 folds:74.00%
