# 2301888682 - ICHSAN - CASE
## A.
### Step 1: Read DNA and Fashion data

In [1]:
import pandas as pd
import tensorflow as tf

# DNA table read from the local CSV (a SAMPLE_ID column followed by snp_* columns).
data_dna = pd.read_csv("rawdata.csv")

# Fashion-MNIST as returned by Keras: a (train, test) pair, each an (x, y) tuple.
data_fashion = tf.keras.datasets.fashion_mnist.load_data()
train_pair_fashion, test_pair_fashion = data_fashion
x_train_fashion, y_train_fashion = train_pair_fashion
x_test_fashion, y_test_fashion = test_pair_fashion

data_dna, data_fashion

(   SAMPLE_ID  snp_0  snp_1  snp_2  snp_3  snp_4  snp_5  snp_6  snp_7  snp_8  \
 0     HCB181      1      0      0      1      1      2      2      2      2   
 1     HCB182      1      0      0      1      1      2      2      1      2   
 2     HCB183      1      0      0      1      2      2      2      1      2   
 3     HCB184      1      0      0      1      1      2      2      1      2   
 4     HCB185      1      0      0      1      1      2      2      1      2   
 ..       ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
 84    JPT265      1      0      0      1      1      1      1      2      2   
 85    JPT266      1      0      0      1      2      2      2      1      2   
 86    JPT267      1      0      0      1      2      1      2      2      2   
 87    JPT268      1      0      0      1      2      2      2      2      2   
 88    JPT269      1      0      0      1      2      2      2      2      2   
 
     ...  snp_9992  snp_9993  snp_9994

### Step 2: Determine x and y values of DNA and Fashion data

In [2]:
import numpy as np

# DNA: everything except the sample identifier and the label is a feature;
# the label is kept as a one-column DataFrame, i.e. shape (n_samples, 1).
x_dna = data_dna.drop(["SAMPLE_ID", "STATUS"], axis = 1)
y_dna = data_dna.loc[:, ["STATUS"]]

# Fashion: stack the Keras train and test parts back into one array each
# (train rows first), so the notebook can apply its own split later.
x_fashion = np.concatenate((x_train_fashion, x_test_fashion), axis = 0)
y_fashion = np.concatenate((y_train_fashion, y_test_fashion), axis = 0)

x_dna.shape, y_dna.shape, x_fashion.shape, y_fashion.shape

((89, 10001), (89, 1), (70000, 28, 28), (70000,))

### Step 3: Normalize x and one-hot encode y of DNA and Fashion data

In [3]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

def normalize(scaler, data):
    """Fit ``scaler`` on ``data`` and transform that same data.

    Parameters
    ----------
    scaler : object
        Anything exposing ``fit(data)`` and ``transform(data)``.
    data : array-like
        Values used both to fit the scaler and as the input to transform.

    Returns
    -------
    tuple
        ``(scaler, transformed)``: the fitted scaler and the result of
        ``scaler.transform(data)``.
    """
    scaler.fit(data)
    transformed = scaler.transform(data)
    return scaler, transformed


def make_dense_one_hot_encoder():
    """Return a ``OneHotEncoder`` that produces dense arrays.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    1.4 removed the old spelling, so the new keyword is tried first and the
    old one is used only when the installed version rejects it.
    """
    try:
        return OneHotEncoder(sparse_output = False)
    except TypeError:
        # Older scikit-learn (< 1.2) only knows the ``sparse`` keyword.
        return OneHotEncoder(sparse = False)


# DNA: scale every feature column to [0, 1] and one-hot encode the STATUS label.
scaler_x_dna, x_dna_norm = normalize(MinMaxScaler(), x_dna)
scaler_y_dna, y_dna_norm = normalize(make_dense_one_hot_encoder(), y_dna)

# Fashion: divide pixels by 255 (the printed min/max below confirm the [0, 1] range);
# labels are reshaped to a single column because the encoder expects 2-D input.
x_fashion_norm = x_fashion / 255.0
scaler_y_fashion, y_fashion_norm = normalize(make_dense_one_hot_encoder(), y_fashion.reshape(-1, 1))


print("Data DNA:")
print("x:", x_dna_norm.min(), x_dna_norm.max())
print("Data Fashion:")
print("x:", x_fashion_norm.min(), x_fashion_norm.max())

print()

print("Data DNA:    ", x_dna_norm.shape, y_dna_norm.shape)
print("Data Fashion:", x_fashion_norm.shape, y_fashion_norm.shape)

Data DNA:
x: 0.0 1.0
Data Fashion:
x: 0.0 1.0

Data DNA:     (89, 10001) (89, 2)
Data Fashion: (70000, 28, 28) (70000, 10)


### Step 4: Apply PCA to the DNA and Fashion data

In [4]:
from sklearn.decomposition import PCA

def pca(model, data):
    """Fit ``model`` on ``data`` and project that same data with it.

    Parameters
    ----------
    model : object
        Anything exposing ``fit(data)`` and ``transform(data)``.
    data : array-like
        Values used both to fit the model and as the input to transform.

    Returns
    -------
    tuple
        ``(model, projected)``: the fitted model and the result of
        ``model.transform(data)``.
    """
    model.fit(data)
    projected = model.transform(data)
    return model, projected


# Keep enough principal components to explain 95% of the variance.
# NOTE: both PCA models are fitted on every sample, i.e. before the
# train/validation/test split performed in Step 5.
#
# The DNA matrix is flattened to (n_samples, n_features) before fitting: this
# cell rebinds x_dna_norm to a 3-D array further down, and PCA only accepts
# 2-D input, so without the flatten a second execution of the cell would fail.
pca_dna, x_dna_pca = pca(PCA(n_components = 0.95), x_dna_norm.reshape(len(x_dna_norm), -1))
pca_fashion, x_fashion_pca = pca(PCA(n_components = 0.95), x_fashion_norm.reshape(-1, 28*28))

# Append a trailing axis of size 1 so the arrays have the (samples, steps, 1)
# or (samples, 28, 28, 1) layout that is fed to the convolutional models.
x_dna_pca = x_dna_pca.reshape(len(x_dna_pca), -1, 1)
x_dna_norm = x_dna_norm.reshape(len(x_dna_norm), -1, 1)
x_fashion_pca = x_fashion_pca.reshape(len(x_fashion_pca), -1, 1)
x_fashion_norm = x_fashion_norm.reshape(len(x_fashion_norm), 28, 28, 1)


print("Data DNA:    ", x_dna_pca.shape, x_dna_norm.shape, y_dna_norm.shape)
print("Data Fashion:", x_fashion_pca.shape, x_fashion_norm.shape, y_fashion_norm.shape)

Data DNA:     (89, 82, 1) (89, 10001, 1) (89, 2)
Data Fashion: (70000, 188, 1) (70000, 28, 28, 1) (70000, 10)


### Step 5: Split into training, validation, and test sets
#### DNA data: Train 80%, Validation 10%, Test 10%
#### Fashion data: Train 60000, Validation 5000, Test 5000

In [5]:
def train_val_test_split(split, data):
    """Slice ``data`` sequentially into train, validation and test parts.

    Parameters
    ----------
    split : int
        Index at which the training part ends (``data[:split]``).
    data : sequence
        Any sliceable object supporting ``len``.

    Returns
    -------
    tuple
        ``(train, val, test)``. Everything after ``split`` is cut at
        ``int(0.5 * len(remainder))``: validation gets the first part, test
        the rest, so test receives the extra item when the remainder is odd.
    """
    remainder = data[split:]
    half = int(0.5 * len(remainder))
    return data[:split], remainder[:half], remainder[half:]


def _show_shapes(label, *arrays):
    """Print ``label`` followed by the shape of every array on one line."""
    print(label, *[a.shape for a in arrays])


# DNA: the leading 80% of the rows are used for training; the remainder is
# divided between validation and test by train_val_test_split.
# NOTE(review): rows are sliced in their existing order with no shuffling;
# confirm the samples are not sorted by STATUS.
train_size_dna = int(len(x_dna) * 0.8)
x_train_dna, x_val_dna, x_test_dna = train_val_test_split(train_size_dna, x_dna_norm)
x_train_dna_pca, x_val_dna_pca, x_test_dna_pca = train_val_test_split(train_size_dna, x_dna_pca)
y_train_dna, y_val_dna, y_test_dna = train_val_test_split(train_size_dna, y_dna_norm)


# Fashion: the first 60000 rows are used for training, the rest for validation/test.
train_size_fashion = 60000
x_train_fashion, x_val_fashion, x_test_fashion = train_val_test_split(train_size_fashion, x_fashion_norm)
x_train_fashion_pca, x_val_fashion_pca, x_test_fashion_pca = train_val_test_split(train_size_fashion, x_fashion_pca)
y_train_fashion, y_val_fashion, y_test_fashion = train_val_test_split(train_size_fashion, y_fashion_norm)


print("Data DNA:")
_show_shapes("\tx    ", x_train_dna, x_val_dna, x_test_dna)
_show_shapes("\tx_pca", x_train_dna_pca, x_val_dna_pca, x_test_dna_pca)
_show_shapes("\ty    ", y_train_dna, y_val_dna, y_test_dna)
print()
print("Data Fashion:")
_show_shapes("\tx    ", x_train_fashion, x_val_fashion, x_test_fashion)
_show_shapes("\tx_pca", x_train_fashion_pca, x_val_fashion_pca, x_test_fashion_pca)
_show_shapes("\ty    ", y_train_fashion, y_val_fashion, y_test_fashion)

Data DNA:
	x     (71, 10001, 1) (9, 10001, 1) (9, 10001, 1)
	x_pca (71, 82, 1) (9, 82, 1) (9, 82, 1)
	y     (71, 2) (9, 2) (9, 2)

Data Fashion:
	x     (60000, 28, 28, 1) (5000, 28, 28, 1) (5000, 28, 28, 1)
	x_pca (60000, 188, 1) (5000, 188, 1) (5000, 188, 1)
	y     (60000, 10) (5000, 10) (5000, 10)


### Step 6: Initialize DataFrame to Summarize Accuracy Result

In [6]:
# Empty summary table: one row per method, one column per dataset.
# Cells start as NaN and are filled with accuracy strings in Step 7.
result = pd.DataFrame(
    index = ["PCA + CNN", "CNN"],
    columns = ["DNA dataset", "Fashion MNIST dataset"],
)
result

Unnamed: 0,DNA dataset,Fashion MNIST dataset
PCA + CNN,,
CNN,,


### Step 7: Build Model, Train Model, Test Model, and Evaluate performance

In [9]:
from tensorflow.keras.layers import Conv1D, Conv2D, Flatten, Dense, MaxPooling1D, MaxPooling2D
from tensorflow.keras import Sequential

def build_model(dimensi, output_shape):
    """Build and compile a small convolutional classifier.

    Parameters
    ----------
    dimensi : int
        Dimensionality of the convolutions: ``1`` uses Conv1D/MaxPooling1D
        layers, ``2`` uses Conv2D/MaxPooling2D layers.
    output_shape : int
        Number of units in the final softmax layer (one per class).

    Returns
    -------
    Sequential
        Model compiled with the SGD optimizer, categorical cross-entropy loss
        and the accuracy metric. No input shape is declared here.

    Raises
    ------
    ValueError
        If ``dimensi`` is neither 1 nor 2. Without this check such a value
        would silently produce a model with no convolutional layers.
    """
    # Validate before creating any layer so a bad argument fails immediately.
    if dimensi not in (1, 2):
        raise ValueError("dimensi must be 1 or 2, got " + repr(dimensi))

    model = Sequential()

    # Same topology in both cases: conv(32) -> conv(16) -> pool(3) -> conv(8) -> pool(2),
    # all with kernel size 3 and pooling stride 1.
    if dimensi == 1:
        model.add(Conv1D(32, 3, activation = "relu"))
        model.add(Conv1D(16, 3, activation = "relu"))
        model.add(MaxPooling1D(pool_size = 3, strides = 1, padding = "valid"))
        model.add(Conv1D(8, 3, activation = "relu"))
        model.add(MaxPooling1D(pool_size = 2, strides = 1, padding = "valid"))
    else:
        model.add(Conv2D(32, (3, 3), activation = "relu"))
        model.add(Conv2D(16, (3, 3), activation = "relu"))
        model.add(MaxPooling2D(pool_size = (3, 3), strides = (1, 1), padding='valid'))
        model.add(Conv2D(8, (3, 3), activation = "relu"))
        model.add(MaxPooling2D(pool_size = (2, 2), strides = (1, 1), padding='valid'))

    # Classification head shared by both variants.
    model.add(Flatten())
    model.add(Dense(16, activation = "sigmoid"))
    model.add(Dense(output_shape, activation = "softmax"))
    model.compile(optimizer = "SGD", loss = "CategoricalCrossentropy", metrics = ["accuracy"])

    return model


def fit_and_score(model, train, val, test, dataset, method, epochs = 10):
    """Train ``model``, evaluate it on the test split and record the accuracy.

    The accuracy string is stored in the module-level ``result`` table with a
    single ``.loc[row, column]`` assignment. Chained indexing
    (``result[column][row] = ...``) is avoided because it assigns into an
    intermediate object and is not guaranteed to update ``result``.

    Parameters
    ----------
    model : compiled Keras model, as returned by ``build_model``.
    train, val, test : tuple
        ``(x, y)`` pairs for the training, validation and test splits.
    dataset : str
        Column label in ``result`` (also used in the printed summary).
    method : str
        Row label in ``result`` (also used in the printed summary).
    epochs : int, default 10
        Number of training epochs.

    Returns
    -------
    float
        The accuracy reported by ``model.evaluate`` on the test split.
    """
    x_train, y_train = train
    x_test, y_test = test

    model.fit(x_train, y_train, validation_data = val, epochs = epochs)
    # evaluate() returns [loss, accuracy] because the model is compiled with
    # metrics = ["accuracy"]; only the accuracy is kept.
    _, accuracy = model.evaluate(x_test, y_test)

    result.loc[method, dataset] = str(accuracy*100) + "%"
    print(dataset + " - (" + method + ") =", result.loc[method, dataset])
    return accuracy


# Number of classes = width of the one-hot encoded label rows.
class_dna = len(y_train_dna[0])

CNN_dna = build_model(1, class_dna)
accuracy = fit_and_score(
    CNN_dna,
    (x_train_dna, y_train_dna), (x_val_dna, y_val_dna), (x_test_dna, y_test_dna),
    "DNA dataset", "CNN",
)
print("\n")

PCA_CNN_dna = build_model(1, class_dna)
accuracy = fit_and_score(
    PCA_CNN_dna,
    (x_train_dna_pca, y_train_dna), (x_val_dna_pca, y_val_dna), (x_test_dna_pca, y_test_dna),
    "DNA dataset", "PCA + CNN",
)
print("\n")


class_fashion = len(y_train_fashion[0])

# Raw images keep their 2-D layout, so they use the Conv2D variant.
CNN_fashion = build_model(2, class_fashion)
accuracy = fit_and_score(
    CNN_fashion,
    (x_train_fashion, y_train_fashion), (x_val_fashion, y_val_fashion), (x_test_fashion, y_test_fashion),
    "Fashion MNIST dataset", "CNN",
)
print("\n")

# PCA output is a flat component vector per sample, so it uses the Conv1D variant.
PCA_CNN_fashion = build_model(1, class_fashion)
accuracy = fit_and_score(
    PCA_CNN_fashion,
    (x_train_fashion_pca, y_train_fashion), (x_val_fashion_pca, y_val_fashion), (x_test_fashion_pca, y_test_fashion),
    "Fashion MNIST dataset", "PCA + CNN",
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
DNA dataset - (CNN) = 66.66666865348816%


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
DNA dataset - (PCA + CNN) = 66.66666865348816%


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fashion MNIST dataset - (CNN) = 86.11999750137329%


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fashion MNIST dataset - (PCA + CNN) = 83.92000198364258%


### Step 8: Show Result

In [12]:
# Display the accuracy summary table that was filled in during Step 7.
result

Unnamed: 0,DNA dataset,Fashion MNIST dataset
PCA + CNN,66.66666865348816%,83.92000198364258%
CNN,66.66666865348816%,86.11999750137329%
