In [1]:
import os
import pandas as pd
import numpy as np
import bson
import cv2
import random
from tqdm import tqdm
import struct
from PIL import Image
from  torchvision import transforms as transf
import torch.utils.data as data_utils
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
import torchvision.models as models

In [2]:
def read_bson(bson_path, with_categories, input_count):
    """
    Reads BSON
    """
    offset = 0
    rows = {}
    with open(bson_path, "rb") as f, tqdm(total=input_count) as pbar:
        f.seek(offset)
        records_read = 0
        while True:
            item_length_bytes = f.read(4)
            if len(item_length_bytes) == 0:
                break

            length = struct.unpack("<i", item_length_bytes)[0]
            f.seek(offset)
            item_data = f.read(length)
            assert len(item_data) == length

            item = bson.BSON.decode(item_data)
            product_id = item["_id"]
            num_imgs = len(item["imgs"])

            row = [num_imgs, offset, length]
            if with_categories:
                row += [item["category_id"]]
            rows[product_id] = row

            offset += length
            f.seek(offset)
            records_read += 1
            pbar.update()
        pbar.close()
    columns = ["num_imgs", "offset", "length"]
    if with_categories:
        columns += ["category_id"]

    df = pd.DataFrame.from_dict(rows, orient="index")
    df.index.name = "product_id"
    df.columns = columns
    df.sort_index(inplace=True)
    return df

def make_category_tables(categories_path):
    """
    Converts category name into an index [0, N-1]
    """
    categories_df = pd.read_csv(categories_path, index_col="category_id")
    categories_df["category_idx"] = pd.Series(range(len(categories_df)), index=categories_df.index)

    cat2idx = {}
    idx2cat = {}
    for ir in categories_df.itertuples():
        category_id = ir[0]
        category_idx = ir[4]
        cat2idx[category_id] = category_idx
        idx2cat[category_idx] = category_id
    return cat2idx, idx2cat


def get_obs(fname, offset, length):
    fobj = open(fname, 'rb')
    fobj.seek(offset)
    res = bson.BSON.decode(fobj.read(length))
    fobj.close()
    return res

In [3]:
class CdiscountDataset(data_utils.Dataset):
    def __init__(self, dataset, split, transform):
        self.dataset = dataset
        self.metadata = split
        self.transform = transform

    def __getitem__(self, index):
        entry = self.metadata.iloc[index]
        num_imgs, offset, length, target = entry
        obs = get_obs(self.dataset, offset, length)
        keep = np.random.choice(len(obs['imgs']))
        byte_str = obs['imgs'][keep]['picture']
        img = cv2.imdecode(np.fromstring(byte_str, dtype=np.uint8), cv2.IMREAD_COLOR)
        img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        img = self.transform(img)

        return img, target

    def __len__(self):
        return self.metadata.index.values.shape[0]

In [4]:
# LOAD PRETRAINED MODEL.
resnet152 = models.resnet152(pretrained=True)

In [5]:
NUM_CATEGORIES = 5270 # TOTAL NUMBER OF CATEGORIES of this classification task.

# WHEN FINE-TUNING, set param.requires_grad = False.
for param in resnet152.parameters():
    param.requires_grad = False

# Parameters of newly constructed modules have requires_grad=True by default
num_ftrs = resnet152.fc.in_features
resnet152.fc = torch.nn.Linear(num_ftrs, NUM_CATEGORIES)

resnet152 = torch.nn.DataParallel(resnet152, device_ids=[0,1,2,3]).cuda() # SET MODEL TO GPU MODE.

In [6]:
print(num_ftrs)

2048


In [6]:
# Loss, Optimizer, LR Scheduler.

criterion = nn.CrossEntropyLoss()

# Observe that only parameters of final layer are being optimized as
# opoosed to before.
optimizer = torch.optim.SGD(resnet152.module.fc.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [7]:
TRAIN_BSON_FILE = '/home/joe/term/input/train.bson'
TEST_BSON_FILE = '/home/joe/term/input/test.bson'
CATEGS = '/home/joe/term/input/category_names.csv'
N_TRAIN = 7069896 # number of items in train.bson

# mapping the catigores into 0-5269 range
cat2idx, idx2cat = make_category_tables(CATEGS)

# Dataset and loader

# Scanning the metadata
meta_data = read_bson(TRAIN_BSON_FILE, with_categories=True, input_count = N_TRAIN)
meta_data.category_id = np.array([cat2idx[ind] for ind in meta_data.category_id])

100%|██████████| 7069896/7069896 [01:38<00:00, 72064.44it/s]


In [8]:
temp = np.arange(N_TRAIN)
np.random.shuffle(temp)

train_sample = temp[0:1100000] # CONTROL SAMPLE SIZE FROM TRAINING-SET. 
val_sample = temp[7000000:N_TRAIN]

train_data = meta_data.iloc[train_sample]
val_data = meta_data.iloc[val_sample]

print(type(train_data), train_data.shape, val_data.shape)

training_category = np.unique(np.array(train_data.category_id))
val_category = np.unique(np.array(val_data.category_id))

both = np.intersect1d(training_category, val_category)

both_ratio = 1.0 * len(both)/len(val_category)

print(both_ratio)

<class 'pandas.core.frame.DataFrame'> (1100000, 4) (69896, 4)
0.9992264053635895


In [9]:
N_THREADS = 4
BS = 128

train_dataset = CdiscountDataset(TRAIN_BSON_FILE, train_data, transf.Compose([
    transf.RandomCrop(160),
    transf.Resize(224),
    transf.ColorJitter(0.15, 0.15, 0.15, 0.15),
    transf.RandomHorizontalFlip(),
    transf.ToTensor(),
    transf.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]))
loader = data_utils.DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=N_THREADS, pin_memory = True)

val_dataset = CdiscountDataset(TRAIN_BSON_FILE, val_data, transf.Compose([
    transf.Resize(224),
    transf.ToTensor(),
    transf.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]))
val_loader = data_utils.DataLoader(val_dataset, batch_size=BS, shuffle=False, pin_memory = True)

In [10]:
EPOCH = 3
cnt = 0
# Let's go fetch some data!
torch.backends.cudnn.benchmark = True
for epoch in range(EPOCH):
    pbar = tqdm(total=len(loader))
    for i, (batch, target) in enumerate(loader):
        exp_lr_scheduler.step()
        # Convert torch tensor to Variable
        images = Variable(batch.cuda())
        labels = Variable(target.cuda())

        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = resnet152(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        pbar.update()
    pbar.close()

    # Test the Model
    resnet152.eval() # simply set module to evaluation mode.
    correct = 0
    total = 0
    for batch, target in val_loader:
        images = Variable(batch, volatile=True).cuda()
        outputs = resnet152(images)
        _, predicted = torch.max(outputs.data, 1)
        total += target.size(0)
        correct += (predicted.cpu() == target).sum()
    resnet152.train() # Simply set module back to training mode.
    print('%dth accuracy of the network on the %d test images: %f %%' % (epoch+1, len(val_loader), (100.0 * correct / total)))
    torch.save(resnet34.state_dict(), './models/resnet152_fc_{}epoch.pkl'.format(epoch+1))
    pass
pass

100%|██████████| 8594/8594 [9:10:36<00:00,  3.58s/it]  


1th accuracy of the network on the 547 test images: 0.004292 %


NameError: name 'resnet34' is not defined