In [1]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from keras.preprocessing import image 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os
from tensorflow.image import rgb_to_grayscale

Using TensorFlow backend.


The CSV file and the images used in this notebook can be downloaded from [**Google Drive**](https://drive.google.com/file/d/1QM5NqLIFuYIwCjUwtl5jM24YVHs3L4Kb/view?usp=sharing).

In [2]:
def normal_nonnormal(x):
    '''Collapse a category label to a binary one.

    Returns x unchanged when it equals 'Normal'; every other value
    is mapped to 'Non-Normal'.
    '''
    return x if x == 'Normal' else 'Non-Normal'

# Read the image index; a missing VirusCategory1 is a dataset error and is treated as 'Normal'.
na_fill = {'VirusCategory1': 'Normal'}
df = pd.read_csv('../CombinedImages/CombinedUpdated.csv').fillna(value = na_fill)

# Reduce the category to a binary Normal / Non-Normal label, then one-hot encode it.
df['VirusCategory1'] = df['VirusCategory1'].map(normal_nonnormal)
df = df.join(pd.get_dummies(df['VirusCategory1'].values, prefix = 'type'))
df = df[['ImagePath', 'VirusCategory1', 'type_Normal']] # keep only the columns used below

# Features are the image paths (plus the label, kept temporarily for stratifying); target is type_Normal.
X, y = df[['ImagePath', 'VirusCategory1']], df[['type_Normal']]

# 90/10 split that preserves the Normal / Non-Normal ratio in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y,
    train_size = .90,
    stratify = X['VirusCategory1'].values,
    random_state = 10,
)
print(x_train['VirusCategory1'].value_counts())

# The label column was only needed for stratification, so remove it from both feature frames.
x_train = x_train.drop(columns = ['VirusCategory1'])
x_test = x_test.drop(columns = ['VirusCategory1'])

def get_image_value(path, target_size = (28, 28)): 
    '''Retrieve the pixel array for an image given its path.

    path: location of the image file on disk.
    target_size: (height, width) the image is resized to while loading.
        load_img only reads a height and a width, so the channel count is
        not part of this tuple.

    Returns the array produced by img_to_array divided by 255.
    '''
    img = image.load_img(path, target_size = target_size)
    img = image.img_to_array(img)

    # rescale the pixel values by 255 so the model sees small floats
    return img/255


def get_data(df, image_dir = '../CombinedImages/all'): 
    '''Load every image referenced by a sample and stack them into one array.

    df: DataFrame with an ImagePath column holding file names relative to image_dir.
    image_dir: folder the file names are resolved against.

    Returns a numpy array whose first axis indexes the rows of df, one entry
    per image as returned by get_image_value. The batch axis is always kept,
    so a one-row df yields an array with a leading dimension of 1.
    '''
    from tqdm import tqdm
    img_list = [] 
    for path in tqdm(df.ImagePath.values, desc = 'Gathering Image Arrays'):
        path = f'{image_dir}/{path}'
        img_list.append(get_image_value(path)) 
    # No squeeze here: it would drop the batch axis whenever df has exactly one row.
    return np.array(img_list)
# Replace the path DataFrames with the image arrays they point to. After this
# x_test / x_train are numpy arrays without an ImagePath column, so these two
# lines cannot be re-run on their own without re-creating the split above.
x_test = get_data(x_test)
x_train = get_data(x_train)

Gathering Image Arrays:   1%|▏         | 9/648 [00:00<00:07, 85.84it/s]

Normal        5257
Non-Normal     572
Name: VirusCategory1, dtype: int64


Gathering Image Arrays: 100%|██████████| 648/648 [00:08<00:00, 74.13it/s] 
Gathering Image Arrays: 100%|██████████| 5829/5829 [01:26<00:00, 67.49it/s]


In [3]:
import pickle 
from imblearn.over_sampling import SMOTE


# Oversample the minority class of the training set with SMOTE.
smote = SMOTE(random_state = 10) # seeded so the synthetic samples are the same on every run
print('Old Shape', x_train.shape)
# SMOTE needs a 2-D (samples, features) matrix, so each image is flattened into one row.
# fit_resample is the current name of fit_sample, which newer imbalanced-learn releases no longer provide.
x_train_resampled, y_train_resampled = smote.fit_resample(x_train.reshape(len(x_train), -1), y_train)
print('New Shape', x_train_resampled.shape)

os.makedirs('pickles', exist_ok = True) # the dumps below fail if the folder is missing

def save_pickle(obj, name):
    '''Serialise obj to pickles/<name>.p and close the file handle afterwards.'''
    with open(f'pickles/{name}.p', 'wb') as handle:
        pickle.dump(obj, handle)

#smote
save_pickle(x_train_resampled, 'SMOTE_x_train')
save_pickle(y_train_resampled, 'SMOTE_y_train')

#normal 
save_pickle(y_test, 'Normal_y_test')
save_pickle(x_test, 'Normal_x_test')
save_pickle(x_train, 'Normal_x_train')
save_pickle(y_train, 'Normal_y_train')


Old Shape (5829, 28, 28, 3)
New Shape (10514, 2352)
