In [2]:
import os
import pandas as pd
import numpy as np
import bson
import cv2
import random
from tqdm import tqdm
import struct
from PIL import Image
from  torchvision import transforms as transf
import torch.utils.data as data_utils
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
import torchvision.models as models

In [3]:
def read_bson(bson_path, with_categories, num_input):
    """
    Scan a file of concatenated BSON documents and index every product in it.

    Every document is decoded once to read its ``_id``, the number of entries
    under ``imgs`` and, optionally, its ``category_id``. The byte offset and
    byte length of the document are stored alongside so that a single product
    can be re-read later without scanning the file again.

    Args:
        bson_path: path of the BSON file to scan.
        with_categories: when true, also record each document's
            ``category_id``.
        num_input: expected number of documents; only used as the total of
            the progress bar.

    Returns:
        A DataFrame indexed by ``product_id`` (sorted), with the columns
        ``num_imgs``, ``offset``, ``length`` and, if requested,
        ``category_id``. An empty file yields an empty frame that still has
        those columns.

    Raises:
        ValueError: if the file ends in the middle of a length header or in
            the middle of a document.
    """
    columns = ["num_imgs", "offset", "length"]
    if with_categories:
        columns += ["category_id"]

    offset = 0
    rows = {}
    with open(bson_path, "rb") as f, tqdm(total=num_input) as pbar:
        while True:
            item_length_bytes = f.read(4)
            if len(item_length_bytes) == 0:
                # Clean end of file: the previous document ended exactly here.
                break
            if len(item_length_bytes) < 4:
                raise ValueError(
                    "truncated length header at offset %d in %s" % (offset, bson_path)
                )

            # The 4-byte little-endian prefix is the size of the whole
            # document, prefix included, so rewind and read it in one piece.
            length = struct.unpack("<i", item_length_bytes)[0]
            f.seek(offset)
            item_data = f.read(length)
            if len(item_data) != length:
                raise ValueError(
                    "truncated document at offset %d in %s" % (offset, bson_path)
                )

            item = bson.BSON.decode(item_data)
            product_id = item["_id"]
            num_imgs = len(item["imgs"])

            row = [num_imgs, offset, length]
            if with_categories:
                row += [item["category_id"]]
            rows[product_id] = row

            # The read above already left the file positioned at the next
            # document, so only the bookkeeping offset needs to advance.
            offset += length
            pbar.update()

    # Passing the column names to the constructor (instead of assigning them
    # afterwards) keeps this working when no document was read at all.
    df = pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=columns)
    df.index.name = "product_id"
    df.sort_index(inplace=True)
    return df


def make_category_tables(categories_path):
    """
    Build lookup tables between category ids and contiguous indices [0, N-1].

    The CSV at ``categories_path`` must contain a ``category_id`` column; any
    other columns are ignored. Categories are numbered in the order in which
    they appear in the file.

    Args:
        categories_path: path of the category CSV file.

    Returns:
        A ``(cat2idx, idx2cat)`` pair of dicts: ``cat2idx`` maps a category id
        to its index and ``idx2cat`` maps the index back to the category id.
    """
    categories_df = pd.read_csv(categories_path, index_col="category_id")

    # Number the ids straight from the index. Reading the counter back from a
    # fixed tuple position would only be correct for one specific number of
    # CSV columns.
    cat2idx = {}
    idx2cat = {}
    for category_idx, category_id in enumerate(categories_df.index):
        cat2idx[category_id] = category_idx
        idx2cat[category_idx] = category_id
    return cat2idx, idx2cat


def get_obs(fname, offset, length):
    """
    Read and decode a single BSON document stored at a known file position.

    Args:
        fname: path of the BSON file.
        offset: byte position at which the document starts.
        length: number of bytes the document occupies.

    Returns:
        The decoded document.
    """
    # The context manager closes the handle even if seek/read raises, so a
    # failing lookup no longer leaks a file descriptor.
    with open(fname, 'rb') as fobj:
        fobj.seek(offset)
        raw = fobj.read(length)
    return bson.BSON.decode(raw)

In [4]:
class CdiscountDataset(data_utils.Dataset):
    """
    Dataset that serves one transformed image per product from a BSON file.

    Args:
        dataset: path of the BSON file holding the products.
        split: DataFrame with one row per product and at least the columns
            ``offset`` and ``length`` (byte position and size of the
            product's document inside ``dataset``).
        transform: callable applied to the decoded PIL image; its result is
            what ``__getitem__`` returns.
    """

    def __init__(self, dataset, split, transform):
        self.dataset = dataset
        self.metadata = split
        self.transform = transform

    def __getitem__(self, index):
        """Decode one randomly chosen image of product ``index`` and transform it."""
        entry = self.metadata.iloc[index]
        # Read the fields by name rather than by position so that extra
        # columns on the metadata frame (e.g. a category column) are harmless.
        offset = int(entry["offset"])
        length = int(entry["length"])
        obs = get_obs(self.dataset, offset, length)

        # A product can carry several images; one of them is picked at random.
        keep = np.random.choice(len(obs['imgs']))
        byte_str = obs['imgs'][keep]['picture']

        # np.frombuffer is the supported way to view raw bytes as uint8;
        # np.fromstring is deprecated for binary input.
        img = cv2.imdecode(np.frombuffer(byte_str, dtype=np.uint8), cv2.IMREAD_COLOR)
        if img is None:
            raise ValueError(
                "could not decode image %d of the product at row %d" % (keep, index)
            )
        # OpenCV decodes to BGR; convert to RGB before handing over to PIL.
        img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        img = self.transform(img)

        return img

    def __len__(self):
        """Number of products, i.e. rows of the metadata frame."""
        return len(self.metadata)

In [5]:
# Index the test-set BSON file: one metadata row per product.
TEST_BSON_FILE = '/home/joe/term/input/test.bson'
TEST_NUM = 1768182  # expected number of products; used as the progress-bar total

meta_data = read_bson(
    bson_path=TEST_BSON_FILE,
    num_input=TEST_NUM,
    with_categories=False,
)

100%|██████████| 1768182/1768182 [01:02<00:00, 28076.80it/s]


In [7]:
# Lookup tables between raw category ids and class indices in the 0-5269 range.
CATEGS = '/home/joe/term/input/category_names.csv'

cat2idx, idx2cat = make_category_tables(categories_path=CATEGS)

In [8]:
BS = 3072  # Batch size

# Preprocessing applied to every decoded image: resize, convert to a tensor,
# then normalise each channel with a fixed mean / standard deviation.
test_transform = transf.Compose([
    transf.Resize(224),
    transf.ToTensor(),
    transf.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

test_dataset = CdiscountDataset(TEST_BSON_FILE, meta_data, test_transform)

# shuffle=False keeps the batches in the row order of meta_data.
test_loader = data_utils.DataLoader(
    test_dataset,
    batch_size=BS,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

In [9]:
# Build a ResNet-18 whose final layer emits one score per category.
NUM_CATEGORIES = 5270  # total number of categories of this classification task

resnet18 = models.resnet18()

# Swap the stock classifier head for one sized to NUM_CATEGORIES.
num_ftrs = resnet18.fc.in_features
resnet18.fc = nn.Linear(num_ftrs, NUM_CATEGORIES)

# The model is only used for inference here, so freeze every parameter.
for param in resnet18.parameters():
    param.requires_grad = False

# Replicate the model across GPUs 0-3 and move it to CUDA.
resnet18 = nn.DataParallel(resnet18, device_ids=[0, 1, 2, 3]).cuda()

In [10]:
# Restore the trained weights into the (DataParallel-wrapped) model.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
MODEL_WEIGHTS_PATH = '/home/joe/term/term/pytorch/models/resnet18_fc6_6epoch.pkl'
resnet18.load_state_dict(torch.load(MODEL_WEIGHTS_PATH))

In [11]:
# Put the model in evaluation mode before running inference. As the last
# expression of the cell, the returned module is also displayed below.
resnet18.eval() # set model to evaluation mode.

DataParallel(
  (module): ResNet(
    (conv1): Conv2d (3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
    (relu): ReLU(inplace)
    (maxpool): MaxPool2d(kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), dilation=(1, 1))
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d (64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
        (relu): ReLU(inplace)
        (conv2): Conv2d (64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d (64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True)
        (relu): ReLU(inplace)
        (conv2): Conv2d (64, 64, kernel_size=(3, 3), str

In [9]:
# Run the model over the whole test set and collect the predicted class index
# of every product, in loader order.

# Per-batch results are gathered in a list and joined once at the end;
# concatenating inside the loop would re-copy all earlier predictions on
# every batch.
batch_predictions = []

with tqdm(total=len(test_loader.dataset)) as pbar:
    for batch in test_loader:
        # volatile=True is the pre-0.4 PyTorch way of disabling autograd
        # bookkeeping; on newer releases wrap the loop in torch.no_grad().
        images = Variable(batch, volatile=True).cuda()
        outputs = resnet18(images)
        # The arg-max over the class dimension is the predicted class index.
        _, predicted = torch.max(outputs.data, 1)
        batch_predictions.append(predicted.cpu().numpy())
        pbar.update(batch.size(0))

# Integer class indices, one per product (empty if the loader yielded nothing).
if batch_predictions:
    evaluations = np.concatenate(batch_predictions, axis=0)
else:
    evaluations = np.array([], dtype=int)

100%|██████████| 1768182/1768182 [46:34<00:00, 641.08it/s]


In [10]:
# Build the submission table: one row per product with its predicted category id.
# meta_data is deliberately left untouched: the dataset object reads from the
# same frame, so adding a column to it here would break a later re-run of the
# inference loop.
assert len(evaluations) == len(meta_data), "expected exactly one prediction per product"

# Predictions come out in the row order of meta_data (the loader does not shuffle).
predicted_idx = pd.Series(evaluations.astype(int), index=meta_data.index)

final_result = (
    predicted_idx
    .map(idx2cat)              # class index -> original category id
    .to_frame('category_id')
    .rename_axis('_id')        # returns a renamed copy; meta_data's index keeps its name
)
final_result.to_csv('/home/joe/term/output/resnet18_fc_6epoch.csv')