In [92]:
import os
import cv2
import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [66]:
# Load the MNIST train/test splits, then flatten every image into a single
# row of pixels so each sample can become one DataFrame row.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
# -1 lets NumPy infer the per-image pixel count (784 for MNIST) and
# len(...) replaces the hard-coded 60000 / 10000 sample counts.
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

In [67]:
# Column names for the 784 flattened pixels: 'px_001' ... 'px_784'
# (1-based, zero-padded to three digits).
cols = ['px_{:03d}'.format(pixel_no) for pixel_no in range(1, 785)]

In [68]:
# One DataFrame per split: the pixel columns plus a trailing 'label' column
# holding the digit class of each row.
X_tr = pd.DataFrame(X_train, columns=cols).assign(label=list(y_train))
X_ts = pd.DataFrame(X_test, columns=cols).assign(label=list(y_test))

In [69]:
# Stack the train and test frames into one table with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
X = pd.concat([X_tr, X_ts], ignore_index=True)

In [70]:
# Number of rows labelled 0 in the combined table.
int(X['label'].eq(0).sum())

6903

In [71]:
# Keep every row whose label is not 0; the class-0 rows are rebuilt
# separately from `new_zeroes` further down.
X_NoZero = X.loc[X['label'].ne(0)]
X_NoZero.shape[0]

63097

In [114]:
def vflip(x):
    """Return `x` mirrored top-to-bottom (first axis reversed).

    `x` must support two-axis slicing (e.g. a 2-D NumPy array).
    """
    reversed_axis = slice(None, None, -1)
    untouched_axis = slice(None)
    return x[reversed_axis, untouched_axis]

def hflip(x):
    """Return `x` mirrored left-to-right (second axis reversed).

    `x` must support two-axis slicing (e.g. a 2-D NumPy array).
    """
    untouched_axis = slice(None)
    reversed_axis = slice(None, None, -1)
    return x[untouched_axis, reversed_axis]

def invert(x):
    """Return `x` with both axes reversed, i.e. rotated by 180 degrees.

    Despite the name, pixel intensities are NOT inverted; only the
    positions are flipped (vertical flip combined with horizontal flip).
    """
    reversed_axis = slice(None, None, -1)
    return x[reversed_axis, reversed_axis]

# Build the replacement rows for class 0 from the image files in the Drive
# folder. Every readable image contributes four flattened rows: the image
# itself, its horizontal flip, its vertical flip and its 180-degree rotation.
ZEROES_DIR = "/content/drive/MyDrive/Sudoku/Zeroes"

new_zeroes = []
# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and therefore anything indexed by position later) reproducible.
for file in sorted(os.listdir(ZEROES_DIR)):
    img = cv2.imread(os.path.join(ZEROES_DIR, file), cv2.IMREAD_GRAYSCALE)
    # cv2.imread returns None for entries it cannot decode; skip those.
    if img is None:
        continue
    # Fail with the offending file name instead of an opaque reshape error.
    if img.shape != (28, 28):
        raise ValueError(
            f"{file}: expected a 28x28 grayscale image, got shape {img.shape}"
        )
    for variant in (img, hflip(img), vflip(img), invert(img)):
        new_zeroes.append(variant.reshape(784))

# Pad `new_zeroes` with synthetic sparse-noise rows until class 0 has as many
# samples as the MNIST zeros that were removed (6903 for the full dataset).
# Each noise row is 784 pixels drawn from {0, 128, 255} with 95% of the mass
# on 0 -- presumably standing in for near-empty Sudoku cells; confirm intent.
noise_rng = np.random.default_rng(42)  # fixed seed so re-runs give the same rows
target_zero_count = int((X['label'] == 0).sum())
for i in range(target_zero_count - len(new_zeroes)):
    # One vectorised draw per row instead of 784 separate choice() calls;
    # .tolist() keeps each noise row a plain Python list as before.
    px = noise_rng.choice(
        [0, 128, 255], size=784, p=[0.95, 0.025, 0.025]
    ).tolist()
    new_zeroes.append(px)
    if i % 1000 == 999:
        print(f"{i+1} done")

1000 done
2000 done
3000 done
4000 done
5000 done
6000 done


In [115]:
# Sanity check: total number of class-0 rows collected so far.
zero_row_total = len(new_zeroes)
print(zero_row_total)

6903


In [116]:
# Inspect the raw pixel values of one row (position 222) of the class-0 pool.
inspected_row = new_zeroes[222]
print(inspected_row)

[0, 255, 0, 0, 128, 0, 0, 0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 0, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 128, 0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255, 0, 0, 0, 0, 0, 0, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
# Preview rows 220-239 of the class-0 pool as a 5x4 grid of 28x28
# grayscale images.
new_zeroes_temp = new_zeroes[220:240]
fig, ax = plt.subplots(5, 4, figsize=(20, 20))
for position, panel in enumerate(ax.ravel()):
    pixels = np.asarray(new_zeroes_temp[position]).reshape(28, 28)
    panel.imshow(pixels, cmap='gray')
plt.show()

In [119]:
# Turn the replacement rows into a frame with the same pixel columns, label
# them all as class 0, and stack them under the non-zero MNIST rows.
X_OnlyZero = pd.DataFrame(new_zeroes, columns=cols)
X_OnlyZero['label'] = 0  # scalar is broadcast to every row
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
X_revised = pd.concat([X_NoZero, X_OnlyZero], ignore_index=True)

In [122]:
# Rows per digit class in the rebuilt dataset, most frequent first.
label_counts = X_revised['label'].value_counts()
label_counts

1    7877
7    7293
3    7141
2    6990
9    6958
0    6903
6    6876
8    6825
4    6824
5    6313
Name: label, dtype: int64

In [121]:
# Shuffle all rows and renumber the index 0..n-1. The fixed random_state makes
# the row order (and the CSV written from it) reproducible across runs.
# Note: this overwrites X_revised, so re-running the cell shuffles again.
X_revised = X_revised.sample(frac=1, random_state=42).reset_index(drop=True)

In [123]:
# Write the rebuilt dataset to Drive as CSV, without the index column.
csv_destination = '/content/drive/MyDrive/Sudoku/Mnist_Sudoku.csv'
X_revised.to_csv(csv_destination, index=False)

In [124]:
# Total number of rows in the rebuilt dataset.
X_revised.shape[0]

70000