In [1]:
import pandas as pd
import json
import os
import cv2 as cv
from tqdm import tqdm
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
import pickle
import glob
import shutil

In [2]:
# Load the VQA v2 training questions, annotations and complementary pairs.
# Context managers make sure each JSON file handle is closed after parsing.
with open("./v2_OpenEnded_mscoco_train2014_questions.json", "r") as f:
    questions = pd.DataFrame(json.load(f)["questions"])
with open("./v2_mscoco_train2014_annotations.json", "r") as f:
    annotations = pd.DataFrame(json.load(f)["annotations"])
with open("./v2_mscoco_train2014_complementary_pairs.json") as f:
    compairs = json.load(f)

In [12]:
# Preview the first rows of the questions table (image_id, question, question_id).
questions.head()

Unnamed: 0,image_id,question,question_id
0,458752,What is this photo taken looking through?,458752000
1,458752,What position is this man playing?,458752001
2,458752,What color is the players shirt?,458752002
3,458752,Is this man a professional baseball player?,458752003
4,262146,What color is the snow?,262146000


In [15]:
# Preview the first rows of the annotations table; `answers` holds the list of
# individual responses and `multiple_choice_answer` the single chosen answer.
annotations.head()

Unnamed: 0,question_type,multiple_choice_answer,answers,image_id,answer_type,question_id
0,what is this,net,"[{'answer': 'net', 'answer_confidence': 'maybe...",458752,other,458752000
1,what,pitcher,"[{'answer': 'pitcher', 'answer_confidence': 'y...",458752,other,458752001
2,what color is the,orange,"[{'answer': 'orange', 'answer_confidence': 'ye...",458752,other,458752002
3,is this,yes,"[{'answer': 'yes', 'answer_confidence': 'yes',...",458752,yes/no,458752003
4,what color is the,white,"[{'answer': 'white', 'answer_confidence': 'yes...",262146,other,262146000


In [3]:
def extractImageFeatures(imgpath, count = -1, NUM_CLUSTERS = 10):
    """Build one normalised cluster-assignment histogram per image.

    ORB descriptors are extracted from every image, all descriptors are
    clustered with k-means, and each image is described by the L2-normalised
    histogram of the clusters its descriptors were assigned to.

    Parameters
    ----------
    imgpath : str
        Directory containing the images.
    count : int
        Number of listed images to process; a negative value (or a value
        larger than the listing) means "all of them".
    NUM_CLUSTERS : int
        Number of k-means clusters, i.e. the histogram length.

    Returns
    -------
    list
        One histogram per processed image, in listing order. The result is
        cached in ``imgFeatures.pkl`` in the working directory.

    Raises
    ------
    ValueError
        If no descriptor could be extracted from any image.

    Notes
    -----
    The cache is looked up by file name only: an existing ``imgFeatures.pkl``
    is returned even if ``imgpath``, ``count`` or ``NUM_CLUSTERS`` differ from
    the call that produced it. Delete the file to force a recomputation.
    """
    # NOTE(security): pickle.load executes arbitrary code; only keep a cache
    # file in the working directory that this notebook produced itself.
    if os.path.exists("imgFeatures.pkl"):
        with open("imgFeatures.pkl", "rb") as f:
            return pickle.load(f)

    # NOTE(review): the first directory entry is skipped, presumably to drop a
    # non-image file. os.listdir order is arbitrary, so this is fragile, but it
    # is kept because the returned list is positional and callers may index it
    # against the same listing.
    imgfiles = os.listdir(imgpath)[1:]
    if count < 0 or count > len(imgfiles):
        count = len(imgfiles)

    orb = cv.ORB_create()
    # Stand-in for images that yield no descriptors, so every entry can be
    # stacked and iterated uniformly.
    noDescriptors = np.empty((0, orb.descriptorSize()), dtype = np.uint8)
    descriptors = []
    unreadable = 0
    for i in tqdm(range(count)):
        # os.path.join works whether or not imgpath ends with a separator.
        img = cv.imread(os.path.join(imgpath, imgfiles[i]))
        d = None
        if img is None:
            # imread signals failure by returning None; keep the slot so the
            # output stays aligned with the file listing.
            unreadable += 1
        else:
            gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
            k, d = orb.detectAndCompute(gray, None)
        # detectAndCompute returns None descriptors when no keypoint is found.
        descriptors.append(noDescriptors if d is None else d)
    if unreadable:
        print(f"Could not read {unreadable} file(s); their histograms are all zeros.")

    features = np.vstack(descriptors) if descriptors else noDescriptors
    if features.shape[0] == 0:
        raise ValueError("No ORB descriptors were extracted; cannot run k-means.")
    print(f"Doing kmeans on {features.shape} points...")
    kmeansDescriptors = KMeans(n_clusters = NUM_CLUSTERS, random_state = 42).fit(features)
    print(f"Kmeans done")

    # labels_ follows the row order of `features`, so each image owns one
    # contiguous slice whose length is its descriptor count.
    labels = kmeansDescriptors.labels_
    imgFeatures = []
    start = 0
    for d in tqdm(descriptors):
        stop = start + len(d)
        histogram = np.bincount(labels[start:stop], minlength = NUM_CLUSTERS).astype(float)
        start = stop
        norm = np.linalg.norm(histogram)
        # An image without descriptors keeps its all-zero histogram.
        imgFeatures.append(histogram / norm if norm > 0 else histogram)

    with open("imgFeatures.pkl", "wb") as f:
        pickle.dump(imgFeatures, f)
    return imgFeatures

In [10]:
def IdtoFile(imageId):
    """Return the COCO train2014 file name for an image ID.

    The numeric part of the file name is the ID left-padded with zeros to
    twelve digits, so IDs with fewer than six digits (or IDs that are already
    partially zero-padded strings) map to the right file as well.

    Parameters
    ----------
    imageId : int or str
        Image identifier.

    Returns
    -------
    str
        File name of the form ``COCO_train2014_<12 digits>.jpg``.
    """
    # zfill pads to a fixed width instead of assuming the ID has six digits.
    return f"COCO_train2014_{str(imageId).zfill(12)}.jpg"

def FiletoId(filename):
    """Extract the image ID from a COCO image file name or path.

    The six characters in front of the four-character extension are taken as
    the ID. When they are all decimal digits the zero padding is stripped, so
    the result equals ``str(image_id)`` for the integer IDs stored in the
    question/annotation tables (e.g. ``"9"`` rather than ``"000009"``).

    Parameters
    ----------
    filename : str
        File name or "/"-separated path ending in a four-character extension.

    Returns
    -------
    str
        The ID without leading zeros, or the raw six-character slice if it is
        not purely numeric.
    """
    # [-10:-4] = six characters immediately before an extension like ".jpg".
    rawId = filename.split("/")[-1][-10:-4]
    # Without this, padded IDs never match str(int_id) in membership tests.
    return str(int(rawId)) if rawId.isdecimal() else rawId

def responseWeight(confidence):
    """Map an answer-confidence label to its numeric weight.

    "yes" weighs 10, "maybe" weighs 7, and every other value weighs 4.
    """
    knownWeights = (("yes", 10), ("maybe", 7))
    for level, weight in knownWeights:
        if confidence == level:
            return weight
    # Fallback for any label not listed above.
    return 4

In [11]:
def createRandomSample(p : float = 0.01, imgdir : str = './../RawImages/images/', seed = None):
    """Pick a random subset of the images in a directory.

    Every ``.jpg`` file in ``imgdir`` is included independently with
    probability ``p``.

    Parameters
    ----------
    p : float
        Inclusion probability for each image, between 0 and 1.
    imgdir : str
        Directory holding the raw images.
    seed : int or None
        Seed for the random generator; pass a value to make the sample
        reproducible. ``None`` draws a fresh sample on every call.

    Returns
    -------
    set
        Image IDs (as produced by ``FiletoId``) of the selected files.

    Raises
    ------
    ValueError
        If ``p`` is not within [0, 1].
    """
    if not 0.0 <= p <= 1.0:
        raise ValueError(f"p must be within [0, 1], got {p}")
    # Select image files by extension rather than dropping the first directory
    # entry: os.listdir order is arbitrary, so a positional skip can discard a
    # real image and let a stray non-image file through. Sorting makes a seeded
    # draw independent of the listing order.
    imgfiles = sorted(name for name in os.listdir(imgdir) if name.lower().endswith(".jpg"))
    rng = np.random.default_rng(seed)
    # One vectorised draw in [0, 1) per file; a file is kept when its draw is below p.
    chosen = rng.random(len(imgfiles)) < p
    return {FiletoId(img) for img, keep in zip(imgfiles, chosen) if keep}

In [21]:
# Draw the working subset: each listed image is kept with probability 0.26.
# No seed is set, so re-running this cell produces a different sample.
sample = createRandomSample(0.26)

In [22]:
# Persist the sampled image IDs so the same subset can be reloaded later.
# The context manager flushes and closes the file once the dump is done.
with open("training_sample.pkl", "wb") as f:
    pickle.dump(sample, f)

In [23]:
# Number of image IDs in the sample.
len(sample)

21549

In [30]:
def createDataset(datapath, imageset, maxRows = 101, rawImagePath = "../RawImages/images/"):
    """Build an image-folder dataset with question/answer metadata.

    The output directory is emptied, the question and annotation tables are
    joined on ``question_id``, and for every image whose ID is in ``imageset``
    the image file is copied into the output directory. One JSON line per
    image is written to ``metadata.jsonl`` in that directory, and the fitted
    label encoder is pickled to ``labelEncoder.pkl`` in the working directory.

    Parameters
    ----------
    datapath : str
        Output directory. Its files are deleted first.
    imageset : collection of str
        Image IDs (as strings) to include.
    maxRows : int or None
        Only merged rows with index below this value are considered. The
        default of 101 reproduces the previously hard-coded cap; pass ``None``
        to process the whole table.
    rawImagePath : str
        Directory the source images are copied from.

    Returns
    -------
    tuple
        ``(labelEncoder, metadata)``: the fitted ``LabelEncoder`` and the list
        of per-image records written to ``metadata.jsonl``.

    Raises
    ------
    ValueError
        If ``datapath`` is empty.
    """
    if not datapath:
        # An empty path would make the clean-up below delete files in the
        # current working directory.
        raise ValueError("datapath must name the output directory")
    # Forward slashes are accepted on every platform, so a Windows-style path
    # such as ".\\train\\" resolves to the same directory everywhere.
    outDir = datapath.replace("\\", "/")
    os.makedirs(outDir, exist_ok = True)
    # Joining with "*" keeps the pattern inside the directory even when
    # datapath has no trailing separator.
    for stale in glob.glob(os.path.join(outDir, "*")):
        if os.path.isfile(stale):
            os.remove(stale)
    print(f"Erased contents of the {datapath} directory.")

    print(f"Building the data dict...")
    qnaByImage = {}
    labels = []
    df = pd.merge(questions, annotations, on = "question_id")
    for i, row in tqdm(df.iterrows()):
        if maxRows is not None and i >= maxRows:
            break
        # Both tables carry image_id, so the merge suffixes it with _x / _y.
        imageId = str(row["image_id_x"])
        if imageId not in imageset:
            continue
        answer = row["multiple_choice_answer"]
        # The chosen answer is registered as a label too, so the lookup below
        # cannot fail when it differs from every individual response.
        labels.append(answer)
        labels.extend(response["answer"] for response in row["answers"])
        qnaByImage.setdefault(imageId, []).append({
            "question" : row["question"],
            "answer" : answer,
            "responses" : row["answers"]
        })

    labelEncoder = LabelEncoder()
    labelEncoder.fit(labels)
    C = len(labelEncoder.classes_)
    labelmap = {label : idx for idx, label in enumerate(labelEncoder.classes_)}
    print(f"Label Encoder contains {C} classes.")

    print(f"Building the metadata object and moving files...")
    metadata = []
    for imageId in tqdm(qnaByImage.keys()):
        fileName = IdtoFile(imageId)
        # TODO: a per-response soft label vector (weighted with responseWeight)
        # was sketched here but is not emitted.
        qna = [
            {
                "question" : q["question"],
                "answer" : q["answer"],
                "answerlabel" : labelmap[q["answer"]],
            }
            for q in qnaByImage[imageId]
        ]
        shutil.copyfile(os.path.join(rawImagePath, fileName), os.path.join(outDir, fileName))
        metadata.append({
            "file_name" : fileName,
            "qna" : qna
        })

    # Written once after the loop: previously both files were rewritten for
    # every image and not written at all when no image matched.
    with open(os.path.join(outDir, "metadata.jsonl"), "w") as f:
        for item in metadata:
            json.dump(item, f)
            f.write('\n')

    with open("labelEncoder.pkl", "wb") as f:
        pickle.dump(labelEncoder, f)

    return labelEncoder, metadata

In [31]:
# Rebuild the training folder for the sampled images and keep the fitted label
# encoder together with the per-image question/answer metadata.
# NOTE(review): the directory is written with Windows-style separators —
# confirm this is intended if the notebook is run on another OS.
labelEncoder, metadata = createDataset(
    ".\\train\\",
    sample
)

Erased contents of the .\train\ directory.
Building the data dict...


101it [00:00, 288.12it/s]


Label Encoder contains 36 classes.
Building the metadata object and moving files...


100%|██████████| 4/4 [00:00<00:00, 120.53it/s]
