In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from keras.utils import to_categorical

### Loading the whole dataset

In [2]:
# Directory that holds the four CSV files. Keeping it in one constant means
# the notebook can be pointed at another copy of the data with a single edit.
DATA_DIR = '/Users/ilya/Desktop/Course_work_Data_mining/CompleteDataSet'

# Feature tables for the train and test splits.
X_train = pd.read_csv(f'{DATA_DIR}/x_train_all.csv')
X_test = pd.read_csv(f'{DATA_DIR}/x_test_all.csv')

# Label tables for the train and test splits.
y_train = pd.read_csv(f'{DATA_DIR}/y_train_all.csv')
y_test = pd.read_csv(f'{DATA_DIR}/y_test_all.csv')

In [3]:
# Number of distinct class ids in the training labels (column "0").
y_train.loc[:, "0"].nunique()

10

In [4]:
# Input geometry fed to the first convolution: 48x48 images with one channel.
image_height, image_width, num_channels = 48, 48, 1

# Width of the softmax output layer and of the one-hot label vectors.
num_classes = 10

# Training schedule shared by every model.fit call in this notebook.
num_epochs, batch_size = 10, 64

In [5]:
# Turn each flat feature row into an image tensor of shape
# (image_height, image_width, num_channels), as expected by Conv2D.
# The frames are converted to plain ndarrays first: calling np.reshape
# directly on a DataFrame can route the 4-D result back through the
# DataFrame wrapper, which only accepts 2-D data.
X_train_reshaped = np.asarray(X_train).reshape(
    -1, image_height, image_width, num_channels)

X_test_reshaped = np.asarray(X_test).reshape(
    -1, image_height, image_width, num_channels)

# One-hot encode the integer class ids for categorical cross-entropy.
y_train_encoded = to_categorical(y_train, num_classes)
y_test_encoded = to_categorical(y_test, num_classes)

In [6]:
X_train_reshaped.shape

(9690, 48, 48, 1)

In [7]:
y_train_encoded.shape

(9690, 10)

### Let's build a simple CNN model to see if it can learn the patterns in the data.

In [8]:
# Baseline CNN: a single 3x3 convolution with 32 filters, one 2x2
# max-pooling step, then the flattened features go straight into the
# softmax classifier.
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(image_height, image_width, num_channels)))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(num_classes, activation='softmax'))

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model. It is compiled exactly once, before fitting: compiling
# again after training adds nothing and only replaces the training
# configuration that fit() just used.
model.fit(X_train_reshaped, y_train_encoded,
          epochs=num_epochs, batch_size=batch_size)

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2023-11-29 23:46:56.869214: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-29 23:46:56.869688: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/10


2023-11-29 23:46:57.148309: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-11-29 23:46:57.360351: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
# [loss, accuracy] of the baseline model on the data it was fitted on.
model.evaluate(x=X_train_reshaped, y=y_train_encoded)

 13/303 [>.............................] - ETA: 2s - loss: 0.0133 - accuracy: 0.9952    

2023-11-29 23:47:20.997170: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




[0.03241579234600067, 0.990815281867981]

In [10]:
# [loss, accuracy] of the baseline model on the held-out test split.
model.evaluate(x=X_test_reshaped, y=y_test_encoded)



[0.8833621740341187, 0.9016181230545044]

### The gap between training accuracy (99.1%) and test accuracy (90.2%) shows that this simple CNN architecture is not enough to get good results. Let's try to modify it.

### Here we build a model with a more comprehensive architecture: three convolutional layers, each followed by max-pooling and dropout, and two dense layers with dropout before the softmax output.

In [12]:
from keras.layers import Dropout

# Deeper CNN: three conv -> max-pool -> dropout stages with 64, 128 and 256
# filters, followed by two dense layers that each have dropout, and a
# softmax output layer.
input_dims = (image_height, image_width, num_channels)
model = Sequential([
    # Stage 1
    Conv2D(64, (3, 3), activation='relu', input_shape=input_dims),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    # Stage 2
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    # Stage 3
    Conv2D(256, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.25),
    # Classifier head
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the reshaped training images; the returned History is the cell output.
model.fit(
    X_train_reshaped,
    y_train_encoded,
    epochs=num_epochs,
    batch_size=batch_size,
)

Epoch 1/10


2023-11-29 23:47:57.478993: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x29e85bdc0>

In [13]:
# [loss, accuracy] of the dropout model on the data it was fitted on.
model.evaluate(x=X_train_reshaped, y=y_train_encoded)

  5/303 [..............................] - ETA: 4s - loss: 0.0273 - accuracy: 0.9937      

2023-11-29 23:48:52.063050: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




[0.01163543201982975, 0.9975232481956482]

In [14]:
# [loss, accuracy] of the dropout model on the held-out test split.
model.evaluate(x=X_test_reshaped, y=y_test_encoded)



[0.04968493804335594, 0.9789643883705139]

### As expected, a more sophisticated architecture gives a much better result: here we got 97.9% accuracy on the test set, up from 90.2% with the simple model.

# Let's do exactly the same thing, but using SMOTE to oversample the minority classes

In [15]:
# Re-read the raw CSVs so the SMOTE experiment starts from freshly loaded
# frames. The directory is named once instead of being repeated per file.
DATA_DIR = '/Users/ilya/Desktop/Course_work_Data_mining/CompleteDataSet'

X_train = pd.read_csv(f'{DATA_DIR}/x_train_all.csv')
X_test = pd.read_csv(f'{DATA_DIR}/x_test_all.csv')

y_train = pd.read_csv(f'{DATA_DIR}/y_train_all.csv')
y_test = pd.read_csv(f'{DATA_DIR}/y_test_all.csv')

In [16]:
# Samples per class in the training labels, most frequent class first.
y_train.loc[:, '0'].value_counts()

0
2    2250
1    2220
4    1980
3    1410
8     540
6     360
9     270
7     240
0     210
5     210
Name: count, dtype: int64

### Using SMOTE to oversample the minority classes

In [17]:
from imblearn.over_sampling import SMOTE

# Create an instance of SMOTE. The seed is fixed so the synthetic samples
# are the same on every run of the notebook.
smote = SMOTE(random_state=42)

# Apply SMOTE to the training data only; the test split is left untouched.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the class distribution after oversampling
print(y_train_resampled['0'].value_counts())


0
0    2250
1    2250
2    2250
3    2250
4    2250
5    2250
6    2250
7    2250
8    2250
9    2250
Name: count, dtype: int64


In [18]:
# Same settings as the earlier configuration cell, restated for the SMOTE run.
image_height, image_width = 48, 48   # image size
num_channels = 1                     # one channel per image
num_classes = 10                     # size of the softmax / one-hot vectors
num_epochs, batch_size = 10, 64      # training schedule

In [19]:
# Reshape X_train to match the expected input shape
X_train_reshaped = np.reshape(
    X_train, (X_train.shape[0], image_height, image_width, num_channels))

# Reshape X_test to match the expected input shape
X_test_reshaped = np.reshape(
    X_test, (X_test.shape[0], image_height, image_width, num_channels))

# Convert y_train to one-hot encoded format
y_train_encoded = to_categorical(y_train, num_classes)

# Convert y_test to one-hot encoded format
y_test_encoded = to_categorical(y_test, num_classes)

In [20]:
from keras.layers import Dropout

# Same dropout CNN as in the previous experiment, rebuilt from scratch so
# training starts from freshly initialised weights.
model = Sequential([
    Conv2D(64, (3, 3), activation='relu',
           input_shape=(image_height, image_width, num_channels)),
    MaxPooling2D((2, 2)),
    Dropout(0.25),

    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.25),

    Conv2D(256, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.25),

    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on whatever X_train_reshaped / y_train_encoded currently hold.
model.fit(
    X_train_reshaped,
    y_train_encoded,
    epochs=num_epochs,
    batch_size=batch_size,
)

Epoch 1/10


2023-11-29 23:49:02.508135: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x3304c3fa0>

In [21]:
# [loss, accuracy] on the same training arrays that were passed to fit().
model.evaluate(x=X_train_reshaped, y=y_train_encoded)

  1/303 [..............................] - ETA: 1:20 - loss: 0.0323 - accuracy: 0.9688

2023-11-29 23:49:58.359551: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




[0.013910314999520779, 0.9963880181312561]

In [22]:
# [loss, accuracy] on the held-out test split.
model.evaluate(x=X_test_reshaped, y=y_test_encoded)



[0.05055157467722893, 0.9864077568054199]

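### Here we got 98.6% accuracy on the test set. Note, however, that the recorded training-set evaluation above ran over 303 batches, the same count as in the runs without SMOTE. That corresponds to the original 9,690 training rows rather than the 22,500 resampled ones, so this result should be re-checked against a run trained on the resampled data before it is attributed to the synthetic samples.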

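### Taken at face value, generating synthetic data gives a nice improvement in accuracy (97.9% → 98.6%), which would make it a good approach for this NN. However, the difference is small and model training is not seeded, so it should be confirmed over repeated runs.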