# **Image Data**

In [None]:
import numpy as np
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from collections import Counter
import re
from PIL import Image
import torchvision
from torchvision import transforms

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import zipfile

In [None]:
zip_file_path = "/content/drive/MyDrive/Datas/kagglecatsanddogs_5340.zip" # Replace with the actual path to your zip file
extraction_directory = '/content/' # Replace with the directory where you want to extract

In [None]:
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
  zip_ref.extractall(extraction_directory)

In [None]:
import tarfile

In [None]:
tar_gz_file_path = "/content/drive/MyDrive/Datas/aclImdb_v1.tar.gz" # Replace with the actual path to your .tar.gz file
extraction_directory = '/content/' # Replace with the directory where you want to extract

In [None]:
with tarfile.open(tar_gz_file_path, 'r:gz') as tar_ref:
    tar_ref.extractall(extraction_directory)

In [None]:
import os
path = "/content/PetImages/"
os.listdir(path)

['Dog', 'Cat']

In [None]:
class DogvsCats(Dataset):
  def __init__(self, path_to_folder):
    path_to_cat = os.path.join(path_to_folder, "Cat")
    path_to_dog = os.path.join(path_to_folder, "Dog")

    cat_files = os.listdir(path_to_cat)
    dog_files = os.listdir(path_to_dog)

    path_to_cat_files = [os.path.join(path_to_cat, file) for file in cat_files]
    path_to_dog_files = [os.path.join(path_to_dog, file) for file in dog_files]

    self.training_files = path_to_dog_files + path_to_cat_files

    self.dog_label = 0
    self.cat_label = 1

    self.transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(0.5),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

  def __len__(self):
    return len(self.training_files)

  def __getitem__(self, idx):
    path_to_image = self.training_files[idx]

    if "Dog" in path_to_image:
      label = self.dog_label
    else:
      label = self.cat_label

    image = Image.open(path_to_image).convert('RGB')
    image = self.transform(image)

    return image, label

In [None]:
dataset = DogvsCats(path_to_folder=path)

In [None]:
for image, label in dataset:
  print(image.shape)
  print(label)
  break

torch.Size([3, 224, 224])
0


In [None]:
dogvscatloader = DataLoader(dataset, batch_size = 16, shuffle = True)

In [None]:
for images, label in dogvscatloader:
  print(images.shape)
  print(label)
  break

torch.Size([16, 3, 224, 224])
tensor([1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1])


In [None]:
num_train_samples = int(0.9 * len(dataset))
num_test_samples = len(dataset) - num_train_samples

In [None]:
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [num_train_samples, num_test_samples])

In [None]:
len(train_dataset), len(test_dataset)

(22501, 2501)

In [None]:
dogvscatloader_train = DataLoader(train_dataset, batch_size = 16, shuffle = True)
dogvscatloader_test = DataLoader(test_dataset, batch_size=16, shuffle = False)

In [None]:
from torchvision.datasets import ImageFolder

In [None]:
dataset = ImageFolder(path)

In [None]:
dataset

Dataset ImageFolder
    Number of datapoints: 25000
    Root location: /content/PetImages/

In [None]:
dataset.classes

['Cat', 'Dog']

# **Text Data**

In [None]:
path_to_data = "/content/aclImdb/train"

In [None]:
os.listdir(path_to_data)

['neg',
 'unsupBow.feat',
 'urls_unsup.txt',
 'pos',
 'labeledBow.feat',
 'unsup',
 'urls_neg.txt',
 'urls_pos.txt']

In [None]:
path_to_pos_fld = os.path.join(path_to_data, "pos")
path_to_neg_fld = os.path.join(path_to_data, "neg")

In [None]:
path_to_pos_text = [os.path.join(path_to_pos_fld, file) for file in os.listdir(path_to_pos_fld)]
path_to_neg_text = [os.path.join(path_to_neg_fld, file) for file in os.listdir(path_to_neg_fld)]

In [None]:
from tqdm.notebook import tqdm

all_text = ""
training_files = path_to_pos_text + path_to_neg_text
for file in tqdm(training_files):
  with open(file, 'r') as f:
    text = f.readline()
    all_text += text[0]

  0%|          | 0/25000 [00:00<?, ?it/s]

In [None]:
# Example text
Counter("Hello world welcome to programming with Python")

Counter({'H': 1,
         'e': 3,
         'l': 4,
         'o': 6,
         ' ': 6,
         'w': 3,
         'r': 3,
         'd': 1,
         'c': 1,
         'm': 3,
         't': 3,
         'p': 1,
         'g': 2,
         'a': 1,
         'i': 2,
         'n': 2,
         'h': 2,
         'P': 1,
         'y': 1})

In [None]:
{key for key, value in Counter("Hello world welcome to programming with Python").items() if value != 1}

{' ', 'e', 'g', 'h', 'i', 'l', 'm', 'n', 'o', 'r', 't', 'w'}

### **Tokenizer**

In [None]:
unique_counts = dict(Counter(all_text))
characters = sorted([key for (key, value) in unique_counts.items()])

In [None]:
characters.append("<UNK>")
characters.append("<PAD>")

In [None]:
char2idx = {char: idx for idx, char in enumerate(characters)}
idx2char = {idx: char for idx, char in enumerate(characters)}

In [None]:
# idx2char

In [None]:
class IMDBDataset(Dataset):
  def __init__(self, path_to_data, char2idx):
    path_to_pos_fld = os.path.join(path_to_data, "pos")
    path_to_neg_fld = os.path.join(path_to_data, "neg")

    path_to_pos_text = [os.path.join(path_to_pos_fld, file) for file in os.listdir(path_to_pos_fld)]
    path_to_neg_text = [os.path.join(path_to_neg_fld, file) for file in os.listdir(path_to_neg_fld)]

    self.training_files = path_to_pos_text + path_to_neg_text
    self.tokenizer = char2idx

    self.pos_label = 1
    self.neg_label = 0

  def __len__(self):
    return len(self.training_files)

  def __getitem__(self, idx):
    path_to_text = self.training_files[idx]

    with open(path_to_text, 'r') as f:
      text = f.readline()

    tokenized = []
    for char in text:
      if char in self.tokenizer.keys():
        tokenized.append(self.tokenizer[char])
      else:

        tokenized.append(self.tokenizer["<UNK>"])

    sample = torch.tensor(tokenized)
    label = self.pos_label if "pos" in path_to_text else self.neg_label

    return sample, label

path_to_data = "/content/aclImdb/train"
dataset = IMDBDataset(path_to_data, char2idx)
# len(dataset)

# counter = 0
# for sample, label in dataset:
#   print(len(sample))

#   counter += 1
#   if counter ==5:
#     break

In [None]:
dataloader = DataLoader(dataset, batch_size = 4)

In [None]:
for sample, label in dataloader:
  print(sample.shape)
  print(label)
  break

RuntimeError: stack expects each tensor to be equal size, but got [621] at entry 0 and [883] at entry 1

In [None]:
a = torch.ones(10)
b = torch.ones(8)
c = torch.ones(2)

padded = nn.utils.rnn.pad_sequence([a, b, c], padding_value = 999, batch_first = True)

In [None]:
padded.shape

torch.Size([3, 10])

In [45]:
padded

tensor([[  1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.,   1.],
        [  1.,   1.,   1.,   1.,   1.,   1.,   1.,   1., 999., 999.],
        [  1.,   1., 999., 999., 999., 999., 999., 999., 999., 999.]])

In [48]:
def data_collater(batch):
  texts, labels = [], []
  for text, label in batch:
    texts.append(text)
    labels.append(label)

  labels = torch.tensor(labels)
  texts = nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value = char2idx["<PAD>"])
  return texts, labels

In [49]:
loader = DataLoader(dataset, batch_size = 4, collate_fn = data_collater)

In [55]:
for sample, label in loader:
  print(sample[0])
  break

tensor([33, 82, 76, 55, 72, 82, 73, 71, 59, 55, 73, 59, 58, 82, 73, 68, 82, 55,
        82, 75, 63, 59, 76, 63, 67, 61, 82, 68, 60, 82, 27, 71, 55, 57, 82, 59,
        71, 82, 26, 55, 61, 82, 65, 55, 72, 73, 82, 67, 63, 61, 62, 73, 82, 56,
        59, 60, 68, 71, 59, 82, 55, 82, 69, 71, 59, 75, 63, 59, 76, 82, 72, 57,
        71, 59, 59, 67, 63, 67, 61, 82, 68, 60, 82, 28, 63, 72, 67, 59, 78,  5,
        72, 82, 32, 68, 65, 59, 72,  9, 82, 33, 82, 58, 68, 67,  5, 73, 82, 82,
        67, 68, 76, 82, 76, 62, 68, 82, 58, 59, 57, 63, 58, 59, 58, 82, 73, 68,
        82, 72, 62, 68, 76, 82, 63, 73, 82, 56, 74, 73, 82, 33,  5, 66, 82, 72,
        68, 82, 75, 59, 71, 78, 82, 61, 65, 55, 58, 82, 73, 62, 59, 78, 82, 58,
        63, 58,  9, 82, 27, 71, 55, 57, 82, 59, 71, 82, 26, 55, 61, 82, 63, 72,
        82, 55, 67, 82, 55, 56, 72, 68, 65, 74, 73, 59, 82, 61, 59, 66, 82, 82,
        55, 82, 72, 67, 55, 69, 72, 62, 68, 73, 82, 68, 60, 82, 25, 74, 72, 73,
        71, 55, 65, 63, 55, 82, 63, 67, 