In [1]:
import os 
import cv2
import numpy as np
import pandas as pd
from skimage.feature import local_binary_pattern
from skimage.feature import hog
from skimage import exposure
from sklearn.pipeline import Pipeline
from skimage.color import rgb2gray
from skimage.io import imread
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import f_classif
from xgboost import XGBClassifier

path = r'C:\Users\gomes\OneDrive\Desktop\skin cancer prediction\train'
directory_contents = os.listdir(path)
print(directory_contents)


['benign', 'malignant']


In [2]:
benign_folder = os.path.join(path,'benign')
malignant_folder = os.path.join(path,'malignant')
benign_contents = os.listdir(benign_folder)
malignant_contents = os.listdir(malignant_folder)
print(len(benign_contents))
print(len(malignant_contents))

1437
1197


In [3]:
radius = 1  # Radius of the circle
n_points = 8 * radius  # Number of points in the LBP pattern

# Function to extract LBP features from an image
def extract_lbp_features(image_path, size=(224, 224), radius=1, n_points=8):
    # Read the image
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    
    # Resize the image
    image = cv2.resize(image, size)
    
    # Apply LBP
    lbp = local_binary_pattern(image, n_points, radius, method='uniform')
    
    # Flatten the LBP features into a vector
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, n_points + 3), range=(0, n_points + 2))
    
    # Normalize the histogram
    lbp_hist = lbp_hist.astype("float")
    lbp_hist /= (lbp_hist.sum() + 1e-6)  # Normalize to make features comparable
    
    return lbp_hist

# Define the paths to the benign and malignant folders


# Initialize lists to hold the image data and labels
data = []
labels = []

# Process benign images
benign_images = [img for img in os.listdir(benign_folder) if img.endswith(('.jpg', '.png'))]
for img in benign_images:
    img_path = os.path.join(benign_folder, img)
    features = extract_lbp_features(img_path, radius=radius, n_points=n_points)
    data.append(features)
    labels.append(0)  # Label for benign

# Process malignant images
malignant_images = [img for img in os.listdir(malignant_folder) if img.endswith(('.jpg', '.png'))]
for img in malignant_images:
    img_path = os.path.join(malignant_folder, img)
    features = extract_lbp_features(img_path, radius=radius, n_points=n_points)
    data.append(features)
    labels.append(1)  # Label for malignant

# Convert the data and labels into a pandas DataFrame
df1 = pd.DataFrame(data)
df1['label'] = labels  # Add the labels as the target column

# Show the structure of the dataset
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,label
0,0.038843,0.067323,0.056441,0.135085,0.1535,0.147381,0.097736,0.081393,0.086077,0.136221,0
1,0.058534,0.079779,0.06238,0.106386,0.117566,0.120316,0.094846,0.092415,0.105509,0.162269,0
2,0.062301,0.077089,0.066307,0.095444,0.145687,0.114876,0.086775,0.086874,0.107601,0.157047,0
3,0.051279,0.076491,0.057079,0.102419,0.114019,0.126036,0.095564,0.092953,0.114178,0.169982,0
4,0.060387,0.08225,0.065131,0.100446,0.12494,0.120097,0.091418,0.088329,0.104412,0.162588,0


In [4]:
pixels_per_cell = (8, 8)
cells_per_block = (2, 2)
block_norm = 'L2-Hys'

# Function to extract HOG features from an image
def extract_hog_features(image_path, size=(224, 224)):
    # Read the image and convert to grayscale
    image = imread(image_path)
    gray_image = rgb2gray(image)
    
    # Resize the image
    gray_image = cv2.resize(gray_image, size)
    
    # Extract HOG features
    features, hog_image = hog(gray_image, pixels_per_cell=pixels_per_cell, 
                              cells_per_block=cells_per_block, 
                              block_norm=block_norm, 
                              visualize=True
                           )
    
    # Normalize the HOG features
    features = np.reshape(features, (-1,))
    
    return features



# Initialize lists to hold the image data and labels
data2 = []
labels2 = []

# Process benign images
benign_images = [img for img in os.listdir(benign_folder) if img.endswith(('.jpg', '.png'))]
for img in benign_images:
    img_path = os.path.join(benign_folder, img)
    lbp_features = extract_lbp_features(img_path)
    hog_features = extract_hog_features(img_path)
    features = np.concatenate([lbp_features, hog_features])
    data2.append(features)
    labels2.append(0)  # Label for benign

# Process malignant images
malignant_images = [img for img in os.listdir(malignant_folder) if img.endswith(('.jpg', '.png'))]
for img in malignant_images:
    img_path = os.path.join(malignant_folder, img)
    lbp_features = extract_lbp_features(img_path)
    hog_features = extract_hog_features(img_path)
    features = np.concatenate([lbp_features, hog_features])
    data2.append(features)
    labels2.append(1)  # Label for malignant

# Convert the data and labels into a pandas DataFrame
df2 = pd.DataFrame(data2)
df2['label'] = labels2  # Add the labels as the target column

# Show the structure of the dataset
df2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26245,26246,26247,26248,26249,26250,26251,26252,26253,label
0,0.038843,0.067323,0.056441,0.135085,0.1535,0.147381,0.097736,0.081393,0.086077,0.136221,...,0.20908,0.055502,0.053021,0.159992,0.253187,0.166946,0.06609,0.045677,0.099831,0
1,0.058534,0.079779,0.06238,0.106386,0.117566,0.120316,0.094846,0.092415,0.105509,0.162269,...,0.044372,0.004885,0.050033,0.030081,0.324138,0.440202,0.452316,0.012327,0.020235,0
2,0.062301,0.077089,0.066307,0.095444,0.145687,0.114876,0.086775,0.086874,0.107601,0.157047,...,0.045968,0.015845,0.037692,0.034202,0.139967,0.006029,0.040935,0.01105,0.040656,0
3,0.051279,0.076491,0.057079,0.102419,0.114019,0.126036,0.095564,0.092953,0.114178,0.169982,...,0.225249,0.168384,0.225249,0.166123,0.225249,0.120182,0.096582,0.167957,0.133703,0
4,0.060387,0.08225,0.065131,0.100446,0.12494,0.120097,0.091418,0.088329,0.104412,0.162588,...,0.196417,0.177307,0.145089,0.098221,0.165395,0.073855,0.195777,0.12299,0.094538,0


# LBP features

In [5]:
X = df1.drop(columns=['label'])
y = df1['label']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust parameters like n_estimators

# Train the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))

Random Forest Accuracy: 77.75%
              precision    recall  f1-score   support

           0       0.79      0.80      0.80       433
           1       0.76      0.75      0.75       358

    accuracy                           0.78       791
   macro avg       0.78      0.77      0.78       791
weighted avg       0.78      0.78      0.78       791



In [6]:
X = df1.drop(columns=['label'])
y = df1['label']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = SVC(kernel = 'linear', random_state=42)  # You can adjust parameters like n_estimators

# Train the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"SVC Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))

SVC Accuracy: 72.44%
              precision    recall  f1-score   support

           0       0.70      0.86      0.77       433
           1       0.77      0.56      0.65       358

    accuracy                           0.72       791
   macro avg       0.74      0.71      0.71       791
weighted avg       0.73      0.72      0.72       791



# HOG features

In [7]:
X = df2.drop(columns=['label'])
y = df2['label']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust parameters like n_estimators

# Train the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))

Random Forest Accuracy: 75.35%
              precision    recall  f1-score   support

           0       0.74      0.84      0.79       433
           1       0.77      0.65      0.70       358

    accuracy                           0.75       791
   macro avg       0.76      0.74      0.75       791
weighted avg       0.76      0.75      0.75       791



In [8]:
X = df2.drop(columns=['label'])
y = df2['label']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = SVC(kernel = 'linear', random_state=42)  # You can adjust parameters like n_estimators

# Train the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"SVC Accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))

SVC Accuracy: 65.11%
              precision    recall  f1-score   support

           0       0.67      0.71      0.69       433
           1       0.62      0.58      0.60       358

    accuracy                           0.65       791
   macro avg       0.65      0.64      0.65       791
weighted avg       0.65      0.65      0.65       791



# Combining both HOG and LBP

In [9]:
pixels_per_cell = (8, 8)
cells_per_block = (2, 2)
block_norm = 'L2-Hys'

# Function to extract LBP features from an image
def extract_lbp_features(image_path, radius=1, n_points=8):
    image = imread(image_path)
    gray_image = rgb2gray(image)
    lbp = local_binary_pattern(gray_image, n_points, radius, method='uniform')
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, n_points + 3), range=(0, n_points + 2))
    lbp_hist = lbp_hist.astype('float32')
    lbp_hist /= (lbp_hist.sum() + 1e-6)
    return lbp_hist

# Function to extract HOG features from an image
def extract_hog_features(image_path, size=(128, 128)):
    image = imread(image_path)
    gray_image = rgb2gray(image)
    gray_image = cv2.resize(gray_image, size)
    features, _ = hog(gray_image, pixels_per_cell=pixels_per_cell, 
                      cells_per_block=cells_per_block, 
                      block_norm=block_norm, 
                      visualize=True)
    features = np.reshape(features, (-1,))
    return features

# Function to extract SHIFT features from an image
def extract_shift_features(image_path):
    # Dummy implementation of SHIFT features extraction
    # Replace this with actual SHIFT feature extraction code
    image = imread(image_path)
    gray_image = rgb2gray(image)
    # Example: Dummy feature vector
    shift_features = np.random.rand(50)  # Replace with real SHIFT feature extraction
    return shift_features

# Initialize lists to hold the image data and labels
data2 = []
labels2 = []

# Process benign images
benign_images = [img for img in os.listdir(benign_folder) if img.endswith(('.jpg', '.png'))]
for img in benign_images:
    img_path = os.path.join(benign_folder, img)
    lbp_features = extract_lbp_features(img_path)
    hog_features = extract_hog_features(img_path)
    shift_features = extract_shift_features(img_path)
    features = np.concatenate([lbp_features, hog_features, shift_features])
    data2.append(features)
    labels2.append(0)  # Label for benign

# Process malignant images
malignant_images = [img for img in os.listdir(malignant_folder) if img.endswith(('.jpg', '.png'))]
for img in malignant_images:
    img_path = os.path.join(malignant_folder, img)
    lbp_features = extract_lbp_features(img_path)
    hog_features = extract_hog_features(img_path)
    shift_features = extract_shift_features(img_path)
    features = np.concatenate([lbp_features, hog_features, shift_features])
    data2.append(features)
    labels2.append(1)  # Label for malignant

# Convert the data and labels into a pandas DataFrame
df3 = pd.DataFrame(data2)
df3['label'] = labels2  # Add the labels as the target column

# Show the structure of the dataset
df3.head()



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8151,8152,8153,8154,8155,8156,8157,8158,8159,label
0,0.058793,0.07376,0.078743,0.138353,0.160635,0.131696,0.088229,0.072923,0.069017,0.12785,...,0.262337,0.34938,0.974291,0.872176,0.82723,0.640319,0.694269,0.936848,0.187591,0
1,0.080038,0.08803,0.082131,0.11089,0.124123,0.112444,0.084104,0.084642,0.083825,0.149773,...,0.876277,0.548747,0.143924,0.670726,0.362662,0.519012,0.541883,0.089203,0.071163,0
2,0.080855,0.084263,0.081015,0.102041,0.146983,0.104412,0.078683,0.079939,0.089086,0.152722,...,0.890911,0.023037,0.419152,0.39764,0.589327,0.280211,0.284883,0.895165,0.952215,0
3,0.079859,0.086555,0.080357,0.107541,0.123864,0.113261,0.085738,0.080975,0.086794,0.155054,...,0.758861,0.155337,0.395783,0.231093,0.028636,0.877154,0.764112,0.385597,0.181252,0
4,0.083008,0.088349,0.080576,0.105409,0.129664,0.111009,0.081254,0.081892,0.08536,0.15348,...,0.655674,0.123435,0.119926,0.962019,0.268979,0.271085,0.392905,0.882937,0.862521,0


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()

# Feature selection
selector = SelectKBest(score_func=f_classif, k=50)

# Model
model = SVC()

# Pipeline
pipeline = Pipeline([
    ('scaler', scaler),
    ('selector', selector),
    ('model', model)
])

# Train the model
pipeline.fit(X_train, y_train)

# Test the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with LBP, HOG, and SelectKBest: {accuracy:.2f}')

Accuracy with LBP, HOG, and SelectKBest: 0.81


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()

# Feature selection
selector = SelectKBest(score_func=f_classif, k=50)

# Model
model = RandomForestClassifier()

# Pipeline
pipeline = Pipeline([
    ('scaler', scaler),
    ('selector', selector),
    ('model', model)
])

# Train the model
pipeline.fit(X_train, y_train)

# Test the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with LBP, HOG, and SelectKBest: {accuracy:.2f}')

Accuracy with LBP, HOG, and SelectKBest: 0.81


In [12]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()

# Feature selection
selector = SelectKBest(score_func=f_classif, k=50)

# Model
model = RandomForestClassifier()

# Pipeline
pipeline = Pipeline([
    ('scaler', scaler),
    ('selector', selector),
    ('model', model)
])

# Simplified hyperparameter grid
param_grid = {
    'selector__k': [20, 30, 40],  # Number of features to select
    'model__n_estimators': [100, 150],  # Number of trees in the forest
    'model__max_depth': [None, 10],  # Maximum depth of the trees
}

# GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)

# Train the model with hyperparameter tuning
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best score found:", grid_search.best_score_)

# Test the model
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with LBP, HOG, and SelectKBest: {accuracy:.2f}')


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best parameters found: {'model__max_depth': 10, 'model__n_estimators': 100, 'selector__k': 40}
Best score found: 0.7987643513959304
Accuracy with LBP, HOG, and SelectKBest: 0.81


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()

# Feature selection
selector = SelectKBest(score_func=f_classif, k=50)

# Model
model = SVC()

# Pipeline
pipeline = Pipeline([
    ('scaler', scaler),
    ('selector', selector),
    ('model', model)
])

# Simplified hyperparameter grid
param_grid = {
    'selector__k': [20, 30, 40],  # Number of features to select
    'model__C': [0.1, 1, 10],  # Regularization parameter
    'model__kernel': ['linear', 'rbf'],  # Kernel type
    'model__gamma': ['scale', 'auto']  # Kernel coefficient for 'rbf' kernel
}

# GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)

# Train the model with hyperparameter tuning
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best score found:", grid_search.best_score_)

# Test the model
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with LBP, HOG, and SelectKBest: {accuracy:.2f}')

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters found: {'model__C': 1, 'model__gamma': 'scale', 'model__kernel': 'linear', 'selector__k': 30}
Best score found: 0.8025630218612675
Accuracy with LBP, HOG, and SelectKBest: 0.78


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()

# Feature selection
selector = SelectKBest(score_func=f_classif, k=50)

# Model
model = RandomForestClassifier()

# Pipeline
pipeline = Pipeline([
    ('scaler', scaler),
    ('selector', selector),
    ('model', model)
])

# Hyperparameter grid
param_grid = {
    'selector__k': [10, 20, 30],  # Number of features to select
    'model__n_estimators': [100, 200],  # Number of trees in the forest
    'model__max_depth': [None,  20, 30],  # Maximum depth of the trees
    'model__min_samples_split': [ 5, 10],  # Minimum number of samples required to split an internal node
    'model__min_samples_leaf': [ 2, 4],  # Minimum number of samples required to be at a leaf node
    'model__bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)

# Train the model with hyperparameter tuning
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best score found:", grid_search.best_score_)

# Test the model
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with LBP, HOG, and SelectKBest: {accuracy:.2f}')

Fitting 5 folds for each of 144 candidates, totalling 720 fits


KeyboardInterrupt: 

In [None]:
confusion = confusion_matrix(y_test,y_pred)
confusion

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyperparameter grid
param_distributions = {
    'selector__k': [10, 20, 30, 40, 50],
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__bootstrap': [True, False]
}

# RandomizedSearchCV
random_search = RandomizedSearchCV(pipeline, param_distributions, n_iter=20, cv=5, n_jobs=-1, verbose=2, random_state=42)

# Train the model with hyperparameter tuning
random_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters found:", random_search.best_params_)
print("Best score found:", random_search.best_score_)

# Test the model
y_pred = random_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with LBP, HOG, and SelectKBest: {accuracy:.2f}')


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()

# Feature selection
selector = SelectKBest(score_func=f_classif, k=50)

# Model
model = XGBClassifier()

# Pipeline
pipeline = Pipeline([
    ('scaler', scaler),
    ('selector', selector),
    ('model', model)
])

# Hyperparameter grid
param_grid = {
    'selector__k': [ 20, 30],  # Number of features to select
    'model__n_estimators': [100, 200],  # Number of boosting rounds
    'model__learning_rate': [0.01, 0.1],  # Step size shrinkage
    'model__max_depth': [3, 5],  # Maximum depth of the trees
    'model__subsample': [0.8, 1.0],  # Fraction of samples used for fitting each individual tree
    'model__colsample_bytree': [ 1.0]  # Fraction of features used for each tree
}

# GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2)

# Train the model with hyperparameter tuning
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters found:", grid_search.best_params_)
print("Best score found:", grid_search.best_score_)

# Test the model
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy with LBP, HOG, and XGBoost: {accuracy:.2f}')

## 