In [1]:
import tensorflow as tf
import keras

In [2]:
!pip install -q kaggle

In [3]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"elizabethcutting","key":"91e392d5897071ec2f0be851d7a46f0c"}'}

In [4]:
! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
! kaggle datasets list

ref                                                           title                                            size  lastUpdated          downloadCount  voteCount  usabilityRating  
------------------------------------------------------------  ----------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
rabieelkharoua/students-performance-dataset                   📚 Students Performance Dataset 📚                 66KB  2024-06-12 23:09:20          25126        511  1.0              
waqi786/cars-dataset-2010-2020                                Cars Dataset (2010-2020)                         96KB  2024-07-23 05:49:46           1478         29  1.0              
marius2303/nissan-all-models-price-prediction-dataset         Nissan All Models Price Prediction Dataset      811KB  2024-07-27 11:50:10            799         24  1.0              
ihelon/coffee-sales                                           Coffee Sales                

In [6]:
!kaggle competitions download -c histopathologic-cancer-detection

Downloading histopathologic-cancer-detection.zip to /content
100% 6.30G/6.31G [01:13<00:00, 107MB/s]
100% 6.31G/6.31G [01:13<00:00, 92.3MB/s]


In [7]:
from zipfile import ZipFile

with ZipFile("/content/histopathologic-cancer-detection.zip", 'r') as zObject:
    zObject.extractall()

In [8]:
#this is one way to store the data. While it works, it ends up being slower and messier than a df, so I decided to go with the df object

import os
test_images = []
for img in os.listdir('test'):
    image = keras.utils.load_img("test/" + img)
    input_arr = keras.utils.img_to_array(image)
    # input_arr = np.array([input_arr])
    test_images.append(input_arr)
    break

In [9]:
import pandas as pd
train_labels = pd.read_csv('/content/train_labels.csv')
# train_labels.head()
train_labels.shape

(220025, 2)

In [15]:
# We need to check if there are any data points that are not given a label so we can get rid of them
print(train_labels.isnull().sum())
# We found none

id       0
label    0
dtype: int64


In [10]:
# Ultimately, my computer just can't deal with 200k images, so I have to cut it down.
# Testing with 100k
# But, this means I can make sure I choose the images I want, so I can get a perfect half and half distribution
# This is pretty cool because the dataset isn't even:

x = train_labels['label'].value_counts()
print(x)

# Therefore, I will mask each dataset and pick 10,000 of each to use, then grab those images and turn them into a dataset to run through a CNN
sample_size = 25000

mask0 = train_labels['label']==0
mask1 = train_labels['label']==1

train0 = train_labels[mask0]
train1 = train_labels[mask1]

train = pd.concat([train0.sample(sample_size),train1.sample(sample_size)], axis=0)
train = train.reset_index(drop = True)
train = train.sample(frac=1).reset_index(drop=True) #shuffle

#see the new values we will use
x = train['label'].value_counts()
print(x)


label
0    130908
1     89117
Name: count, dtype: int64
label
0    60000
1    60000
Name: count, dtype: int64


In [11]:
# I will store the converted images in these folders
import os

os.mkdir("train_png")
os.mkdir("train_png/0")
os.mkdir("train_png/1")

In [None]:
# #Just used to fix a previous error since I can't delete files manually in colab. Useful for future mistakes...
# for x in os.listdir("/content/train_png"):
#   os.remove("/content/train_png/"+x)
# # # os.rmdir('/content/train_png/0')
# os.rmdir('/content/train_png')

In [12]:
from PIL import Image

# use the PIL library to convert the image. I need to convert so I can make a tf dataset object
# I want to save it into two separate classes so that the labels can be interpreted by the tf method used below
for img_name in train['id']:
    mask = train['id']==img_name
    if train[mask]['label'].values[0] == 0:
        with Image.open("/content/train/" + img_name + ".tif") as tif:
            png = tif.convert("RGB")
            png.save("/content/train_png/0/" + img_name + ".png")
    else:
        with Image.open("/content/train/" + img_name + ".tif") as tif:
            png = tif.convert("RGB")
            png.save("/content/train_png/1/" + img_name + ".png")


In [13]:
# do the same thing for the test dataset, since it will be used later
from PIL import Image

os.mkdir("test_png")

for img_name in os.listdir('/content/test'):
    with Image.open("/content/test/" + img_name) as tif:
        png = tif.convert("RGB")
        png.save("/content/test_png/" + img_name + ".png")

In [14]:
#follow basic procedure mentioned in the demo nb
%tensorflow_version 2.x
import tensorflow as tf
import keras
import numpy as np



Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [24]:
# check that the directory is prepared!
print(os.listdir("/content/train_png/0")[0])
print(os.listdir("/content/train_png/1")[0])

9d2d43446902ee8b542336209d6dd5d7f9f807be.png
254978742d70439190950018a1bd54259fc33363.png


In [16]:
#decided to keep batch size a bit larger than default. Mainly, I want to not lose out on important time here
train_dataset = tf.keras.utils.image_dataset_from_directory('/content/train_png',label_mode='binary',class_names=['0','1'],image_size=(96,96),color_mode='rgb',seed=123,validation_split=0.18,subset='training',batch_size=64)

val_dataset = tf.keras.utils.image_dataset_from_directory('/content/train_png',label_mode='binary',class_names=['0','1'],image_size=(96,96),color_mode='rgb',seed=123,validation_split=0.18,subset='validation',batch_size=64)

Found 120000 files belonging to 2 classes.
Using 98400 files for training.
Found 120000 files belonging to 2 classes.
Using 21600 files for validation.


In [17]:
from tensorflow.keras import layers, models

batch_size = 64
pool_size = (2,2)
kernel_size = (3,3)

model = tf.keras.Sequential()
model.add(layers.Rescaling(1./255, input_shape=(96,96,3)))
model.add(layers.Conv2D(32, kernel_size, activation='relu'))
model.add(layers.MaxPooling2D(pool_size))
model.add(layers.Conv2D(64, kernel_size, activation='relu'))
model.add(layers.MaxPooling2D(pool_size))
model.add(layers.Conv2D(128, kernel_size, activation='relu'))
model.add(layers.Flatten())
# model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.summary()

  super().__init__(**kwargs)


In [18]:
from keras.losses import BinaryCrossentropy
from keras.metrics import Accuracy, Recall
from keras.optimizers import Adam, Adamax, Lion

# lion_optimizer = Lion()

model.compile(loss=tf.keras.losses.BinaryCrossentropy(),optimizer="adam",metrics = ['accuracy', 'BinaryCrossentropy'])

In [19]:
history = model.fit(train_dataset, epochs=5, validation_data=val_dataset)

Epoch 1/5
[1m2050/2050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2333s[0m 1s/step - BinaryCrossentropy: 0.4939 - accuracy: 0.7641 - loss: 0.4939 - val_BinaryCrossentropy: 0.4220 - val_accuracy: 0.8043 - val_loss: 0.4220
Epoch 2/5
[1m2050/2050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2363s[0m 1s/step - BinaryCrossentropy: 0.4044 - accuracy: 0.8198 - loss: 0.4044 - val_BinaryCrossentropy: 0.3769 - val_accuracy: 0.8340 - val_loss: 0.3769
Epoch 3/5
[1m2050/2050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2361s[0m 1s/step - BinaryCrossentropy: 0.3549 - accuracy: 0.8448 - loss: 0.3549 - val_BinaryCrossentropy: 0.3502 - val_accuracy: 0.8478 - val_loss: 0.3502
Epoch 4/5
[1m2050/2050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2351s[0m 1s/step - BinaryCrossentropy: 0.3035 - accuracy: 0.8717 - loss: 0.3035 - val_BinaryCrossentropy: 0.3468 - val_accuracy: 0.8581 - val_loss: 0.3468
Epoch 5/5
[1m2050/2050[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2389s[0m 1s/step - B

In [20]:
test_dataset = tf.keras.utils.image_dataset_from_directory('/content/test_png',
                                                            label_mode=None,
                                                            image_size=(96,96),
                                                            shuffle=False,
                                                            batch_size=1)

Found 57458 files.


In [21]:
predictions = model.predict(test_dataset)



[1m57458/57458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m656s[0m 11ms/step


In [22]:
test = os.listdir("/content/test_png")
all_pred = pd.DataFrame(columns=['id', 'label'])
test=sorted(test)
all_pred['id'] = [img_id.split('.')[0] for img_id in test]
all_pred['label'] = np.round(predictions.flatten()).astype('int')


import csv
l = []
with open('/content/sample_submission.csv', 'r') as f:
    reader = csv.reader(f)
    flag = True
    for row in reader:
        if flag:
            flag = False
            continue
        else:
            l.append(row[0])

    submission = all_pred[all_pred['id'].isin(l)]
    submission.to_csv("submission_test.csv", header=True, index=False)


In [None]:
print(history)

<keras.src.callbacks.History object at 0x7b8b371ba6b0>
