In [1]:

import numpy as np
import cv2
import pandas as pd

In [2]:
img = cv2.imread('Sandstone_Versa0180_image.tif')
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  

#Save original image pixels into a data frame. This is our Feature #1.
img2 = img.reshape(-1)
df = pd.DataFrame()
df['Original Image'] = img2

In [3]:
df

Unnamed: 0,Original Image
0,0
1,0
2,0
3,0
4,0
...,...
1019899,0
1019900,0
1019901,0
1019902,0


In [4]:
#Generate Gabor features
num = 1  #To count numbers up in order to give Gabor features a lable in the data frame
kernels = []
for theta in range(2):   #Define number of thetas
    theta = theta / 4. * np.pi
    for sigma in (1, 3):  #Sigma with 1 and 3
        for lamda in np.arange(0, np.pi, np.pi / 4):   #Range of wavelengths
            for gamma in (0.05, 0.5):   #Gamma values of 0.05 and 0.5
            
                
                gabor_label = 'Gabor' + str(num)  #Label Gabor columns as Gabor1, Gabor2, etc.
#                print(gabor_label)
                ksize=9
                kernel = cv2.getGaborKernel((ksize, ksize), sigma, theta, lamda, gamma, 0, ktype=cv2.CV_32F)    
                kernels.append(kernel)
                #Now filter the image and add values to a new column 
                fimg = cv2.filter2D(img2, cv2.CV_8UC3, kernel)
                filtered_img = fimg.reshape(-1)
                df[gabor_label] = filtered_img  #Labels columns as Gabor1, Gabor2, etc.
                print(gabor_label, ': theta=', theta, ': sigma=', sigma, ': lamda=', lamda, ': gamma=', gamma)
                num += 1  #Increment for gabor column label

Gabor1 : theta= 0.0 : sigma= 1 : lamda= 0.0 : gamma= 0.05
Gabor2 : theta= 0.0 : sigma= 1 : lamda= 0.0 : gamma= 0.5
Gabor3 : theta= 0.0 : sigma= 1 : lamda= 0.7853981633974483 : gamma= 0.05
Gabor4 : theta= 0.0 : sigma= 1 : lamda= 0.7853981633974483 : gamma= 0.5
Gabor5 : theta= 0.0 : sigma= 1 : lamda= 1.5707963267948966 : gamma= 0.05
Gabor6 : theta= 0.0 : sigma= 1 : lamda= 1.5707963267948966 : gamma= 0.5
Gabor7 : theta= 0.0 : sigma= 1 : lamda= 2.356194490192345 : gamma= 0.05
Gabor8 : theta= 0.0 : sigma= 1 : lamda= 2.356194490192345 : gamma= 0.5
Gabor9 : theta= 0.0 : sigma= 3 : lamda= 0.0 : gamma= 0.05
Gabor10 : theta= 0.0 : sigma= 3 : lamda= 0.0 : gamma= 0.5
Gabor11 : theta= 0.0 : sigma= 3 : lamda= 0.7853981633974483 : gamma= 0.05
Gabor12 : theta= 0.0 : sigma= 3 : lamda= 0.7853981633974483 : gamma= 0.5
Gabor13 : theta= 0.0 : sigma= 3 : lamda= 1.5707963267948966 : gamma= 0.05
Gabor14 : theta= 0.0 : sigma= 3 : lamda= 1.5707963267948966 : gamma= 0.5
Gabor15 : theta= 0.0 : sigma= 3 : lamda= 2

In [5]:
df

Unnamed: 0,Original Image,Gabor1,Gabor2,Gabor3,Gabor4,Gabor5,Gabor6,Gabor7,Gabor8,Gabor9,...,Gabor23,Gabor24,Gabor25,Gabor26,Gabor27,Gabor28,Gabor29,Gabor30,Gabor31,Gabor32
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019899,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1019900,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1019901,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1019902,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
#Now, add a column in the data frame for the Labels
#For this, we need to import the labeled image
labeled_img = cv2.imread('Sandstone_Versa0180_mask.png')
#Remember that you can load an image with partial labels 
#But, drop the rows with unlabeled data

labeled_img = cv2.cvtColor(labeled_img, cv2.COLOR_BGR2GRAY)
labeled_img1 = labeled_img.reshape(-1)
df['Labels'] = labeled_img1

print(df.head())

   Original Image  Gabor1  Gabor2  Gabor3  Gabor4  Gabor5  Gabor6  Gabor7  \
0               0       0       0       0       0       0       0       0   
1               0       0       0       0       0       0       0       0   
2               0       0       0       0       0       0       0       0   
3               0       0       0       0       0       0       0       0   
4               0       0       0       0       0       0       0       0   

   Gabor8  Gabor9  ...  Gabor24  Gabor25  Gabor26  Gabor27  Gabor28  Gabor29  \
0       0       0  ...        0        0        0        0        0        0   
1       0       0  ...        0        0        0        0        0        0   
2       0       0  ...        0        0        0        0        0        0   
3       0       0  ...        0        0        0        0        0        0   
4       0       0  ...        0        0        0        0        0        0   

   Gabor30  Gabor31  Gabor32  Labels  
0        0       

In [7]:
print(df.Labels.unique())

[ 33 201 231  65]


In [8]:
print(df['Labels'].value_counts())

33     491532
231    439024
201     72927
65      16421
Name: Labels, dtype: int64


In [9]:
#Define the dependent variable that needs to be predicted (labels)
Y = df["Labels"].values

#Define the independent variables
X = df.drop(labels = ["Labels"], axis=1) 

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=42)

In [14]:
#RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier
model_RF = RandomForestClassifier(n_estimators = 10, random_state = 42)

# Train the model on training data
model_RF.fit(X_train, y_train)

#Test prediction on testing data. 
prediction_test_RF = model_RF.predict(X_test)

In [15]:
#ACCURACY METRICS
print("********* METRICS FOR IMBALANCED DATA *********")
#Let us check the accuracy on test data
from sklearn import metrics
print ("Accuracy = ", metrics.accuracy_score(y_test, prediction_test_RF))

********* METRICS FOR IMBALANCED DATA *********
Accuracy =  0.94734558610851


In [16]:
(unique, counts) = np.unique(prediction_test_RF, return_counts=True)
print(unique, counts)

[ 33  65 201 231] [196813   6352  27705 177092]


In [17]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, prediction_test_RF)
print(cm)

[[192392      0   4038    226]
 [     0   6108      2    439]
 [  4246      4  18214   6660]
 [   175    240   5451 169767]]


In [18]:
#Print individual accuracy values for each class, based on the confusion matrix
print("Pixel 33 accuracy = ", cm[0,0] / (cm[0,0]+cm[1,0]+cm[2,0]+cm[3,0]))
print("Pixel 65 accuracy = ",   cm[1,1] / (cm[0,1]+cm[1,1]+cm[2,1]+cm[3,1]))
print("Pixel 201 accuracy = ",   cm[2,2] / (cm[0,2]+cm[1,2]+cm[2,2]+cm[3,2]))
print("Pixel 231 accuracy = ",   cm[3,3] / (cm[0,3]+cm[1,3]+cm[2,3]+cm[3,3]))

Pixel 33 accuracy =  0.9775370529385762
Pixel 65 accuracy =  0.9615869017632241
Pixel 201 accuracy =  0.6574264573181736
Pixel 231 accuracy =  0.9586373184559438


In [19]:
from sklearn.metrics import roc_auc_score

In [20]:
prob_y_test = model_RF.predict_proba(X_test)
print("ROC_AUC score for imbalanced data is:")
print(roc_auc_score(y_test, prob_y_test, multi_class='ovr', labels=[33, 65, 201, 231]))

ROC_AUC score for imbalanced data is:
0.973691632822041


In [21]:
#############################################################################
# Handling Imbalanced data
###########################################

In [22]:
#Up-sample minority class
from sklearn.utils import resample
print(df['Labels'].value_counts())

33     491532
231    439024
201     72927
65      16421
Name: Labels, dtype: int64


In [23]:
#Separate majority and minority classes
df_important = df[df['Labels'] == 201]
df_majority = df.loc[df['Labels'].isin([33, 231])]
df_minority = df[df['Labels'] == 65]

In [24]:
# Upsample minority class and other classes separately
# If not, random samples from combined classes will be duplicated and we run into
#same issue as before, undersampled remians undersampled.
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=400000,    # to match average class
                                 random_state=42) # reproducible results
 
df_important_upsampled = resample(df_important, 
                                 replace=True,     # sample with replacement
                                 n_samples=400000,    # to match average class
                                 random_state=42) # reproducible results


In [25]:
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_important_upsampled, df_minority_upsampled])
print(df_upsampled['Labels'].value_counts())

Y_upsampled = df_upsampled["Labels"].values

#Define the independent variables
X_upsampled = df_upsampled.drop(labels = ["Labels"], axis=1)

33     491532
231    439024
201    400000
65     400000
Name: Labels, dtype: int64


In [26]:
X_train_upsampled, X_test_upsampled, y_train_upsampled, y_test_upsampled = train_test_split(X_upsampled, 
                                                                                            Y_upsampled, 
                                                                                            test_size=0.2, 
                                                                                            random_state=20)

#Train again with new upsamples data
model_RF_upsampled = RandomForestClassifier(n_estimators = 10, random_state = 42)

# Train the model on training data
model_RF_upsampled.fit(X_train_upsampled, y_train_upsampled)
prediction_test_RF_upsampled = model_RF_upsampled.predict(X_test_upsampled)

In [27]:
print("********* METRICS FOR BALANCED DATA USING UPSAMPLING *********")

print ("Accuracy = ", metrics.accuracy_score(y_test_upsampled, prediction_test_RF_upsampled))

cm_upsampled = confusion_matrix(y_test_upsampled, prediction_test_RF_upsampled)
print(cm_upsampled)

print("Pixel 33 accuracy = ", cm_upsampled[0,0] / (cm_upsampled[0,0]+cm_upsampled[1,0]+cm_upsampled[2,0]+cm_upsampled[3,0]))
print("Pixel 65 accuracy = ",  cm_upsampled[1,1] / (cm_upsampled[0,1]+cm_upsampled[1,1]+cm_upsampled[2,1]+cm_upsampled[3,1]))
print("Pixel 201 accuracy = ",  cm_upsampled[2,2] / (cm_upsampled[0,2]+cm_upsampled[1,2]+cm_upsampled[2,2]+cm_upsampled[3,2]))
print("Pixel 231 accuracy = ",  cm_upsampled[3,3] / (cm_upsampled[0,3]+cm_upsampled[1,3]+cm_upsampled[2,3]+cm_upsampled[3,3]))

prob_y_test_upsampled = model_RF.predict_proba(X_test_upsampled)

print("ROC_AUC score for balanced data using upsampling is:")
print(roc_auc_score(y_test_upsampled, prob_y_test_upsampled, multi_class='ovr', labels=[33, 65, 201, 231]))

********* METRICS FOR BALANCED DATA USING UPSAMPLING *********
Accuracy =  0.9772732525887574
[[95284     1  2798    81]
 [    0 79690     0     0]
 [  145     0 79953   212]
 [   45   210  4374 83319]]
Pixel 33 accuracy =  0.9980099294048641
Pixel 65 accuracy =  0.9973592320496615
Pixel 201 accuracy =  0.9176814921090387
Pixel 231 accuracy =  0.9964957183179448
ROC_AUC score for balanced data using upsampling is:
0.9889651049934216


In [29]:
#Technique 5. Penalize learning algorithms that increase cost of classification mistakes
#on minority classes. Add class_weight='balanced'.
#ALso works for others like SVM. 

# Instantiate model with n number of decision trees
# class_weight = 'balanced'  --> classes are automatically weighted 
#inversely proportional to how frequently they appear in the data
model_penalized = RandomForestClassifier(n_estimators = 10, 
                               class_weight='balanced', # penalize
                               random_state = 42)
# Train the model on training data
model_penalized.fit(X_train, y_train)


prob_y_test_penalized = model_penalized.predict_proba(X_test)

print(roc_auc_score(y_test, prob_y_test_penalized, multi_class='ovr', labels=[33, 65, 201, 231]))

0.9730423977476995
