In [20]:
import json
import io
import os
from google.cloud import vision 
from google.cloud.vision import types
from google.oauth2 import service_account
from sklearn.feature_extraction import DictVectorizer

import pandas as pd

In [21]:
# Authenticate against Google Cloud Vision with the service-account key file
# 'api_keys.json', resolved relative to the working directory.
credentials = service_account.Credentials.from_service_account_file('api_keys.json')
vision_client = vision.ImageAnnotatorClient(credentials=credentials)

# Product categories. Each name is used both as an image sub-folder name and
# as the stem of a "<category>.json" file by the loader functions.
categories = ['automotive', 'beauty', 'sport', 'perfumes', 'shoes', 'military',
              'woman', 'man', 'kids', 'digital_electronics']

# Built with os.path.join instead of the literals ".\Images" / ".\logs":
# "\I" and "\l" are not valid string escape sequences, which newer Python
# versions warn about. On Windows the resulting values are unchanged.
image_path = os.path.join(".", "Images")
json_path = os.path.join(".", "logs")

In [22]:
def extract_url_from_json(json_file_path):
    """Return the ``image_link`` value of every record in a JSON file.

    The file at ``json_file_path`` is parsed with ``json.load`` and the
    parsed value is iterated; every item must support
    ``item['image_link']``. The links are returned as a list, in iteration
    order.
    """
    with open(json_file_path) as handle:
        records = json.load(handle)

    return [record['image_link'] for record in records]

def detect_labels(path):
    """Run Vision label detection on a local image file.

    The file at ``path`` is read as bytes and sent through the module-level
    ``vision_client``. The description of every returned label annotation is
    concatenated into one string, each description preceded by a single
    space (so a non-empty result starts with a space). An empty string is
    returned when there are no annotations.
    """
    with io.open(path, 'rb') as image_file:
        image_bytes = image_file.read()

    request_image = vision.types.Image(content=image_bytes)
    annotations = vision_client.label_detection(image=request_image).label_annotations

    return "".join(" " + annotation.description for annotation in annotations)


def detect_labels_url(url):
    """Run Vision label detection on an image referenced by URI.

    ``url`` is assigned to the request image's ``source.image_uri`` and the
    request goes through the module-level ``vision_client``. The description
    of every returned label annotation is concatenated into one string, each
    description preceded by a single space. An empty string is returned when
    there are no annotations.
    """
    remote_image = vision.types.Image()
    remote_image.source.image_uri = url

    annotations = vision_client.label_detection(image=remote_image).label_annotations

    return "".join(" " + annotation.description for annotation in annotations)

def detect_labels_scores(url):
    """Run Vision label detection on an image URI and return label scores.

    Returns a dict mapping each label annotation's ``description`` to its
    ``score``. If two annotations share a description, the later one
    overwrites the earlier one. The dict is empty when there are no
    annotations.
    """
    remote_image = vision.types.Image()
    remote_image.source.image_uri = url

    annotations = vision_client.label_detection(image=remote_image).label_annotations

    return {annotation.description: annotation.score for annotation in annotations}

def detect_objects_scores(url):
    """Run Vision object localization on an image URI and return object scores.

    Returns a dict mapping each localized object annotation's ``name`` to
    its ``score``. If the same object name appears more than once in the
    response, the later annotation's score overwrites the earlier one. The
    dict is empty when no objects are returned.
    """
    remote_image = vision.types.Image()
    remote_image.source.image_uri = url

    annotations = vision_client.object_localization(image=remote_image).localized_object_annotations

    return {annotation.name: annotation.score for annotation in annotations}

def get_data_images(categories):
    """Label every ``.jpg`` image in each category folder.

    For each name in ``categories`` the folder ``<image_path>/<category>``
    (``image_path`` is the module-level setting) is listed, and every file
    whose name ends with ``.jpg`` is passed to ``detect_labels``. A
    "<category> done" line is printed after each category.

    Returns a DataFrame with columns ``path``, ``labels`` and ``category``,
    one row per image; it is empty (but has those columns) when no image is
    found.
    """
    columns = ["path", "labels", "category"]

    # Rows are collected in a list and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row copies it on every iteration.
    rows = []
    for category in categories:
        category_dir = os.path.join(image_path, category)
        img_paths = [os.path.join(category_dir, file_name)
                     for file_name in os.listdir(category_dir)
                     if file_name.endswith('.jpg')]
        for img_path in img_paths:
            rows.append([img_path, detect_labels(img_path), category])
        print(category + " done")

    return pd.DataFrame(rows, columns=columns)

def get_data_json(categories):
    """Label every image URL listed in each category's JSON file.

    For each name in ``categories`` the file ``<json_path>/<category>.json``
    (``json_path`` is the module-level setting) is read with
    ``extract_url_from_json`` and every URL is passed to
    ``detect_labels_url``. A "<category> done" line is printed after each
    category.

    Returns a DataFrame with columns ``path`` (the image URL), ``labels`` and
    ``category``, one row per URL; it is empty (but has those columns) when
    no URL is found.
    """
    columns = ["path", "labels", "category"]

    # Rows are collected in a list and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row copies it on every iteration.
    rows = []
    for category in categories:
        json_file = os.path.join(json_path, category + ".json")
        for image_url in extract_url_from_json(json_file):
            rows.append([image_url, detect_labels_url(image_url), category])
        print(category + " done")

    return pd.DataFrame(rows, columns=columns)

def get_datascores_json(categories):
    """Build a table of object-detection scores for every listed image URL.

    For each name in ``categories`` the file ``<json_path>/<category>.json``
    (``json_path`` is the module-level setting) is read with
    ``extract_url_from_json`` and every URL is passed to
    ``detect_objects_scores``. A "<category> done" line is printed after each
    category.

    The per-image ``{object name: score}`` dicts are turned into a dense
    matrix with ``DictVectorizer``, giving one column per distinct object
    name. Two extra columns are appended: ``paths`` (the image URL) and
    ``category``.

    Returns the resulting DataFrame, one row per URL.
    """
    paths = []
    score_dicts = []
    y_labels = []
    for category in categories:
        # os.path.join instead of a hard-coded "\\" separator.
        json_file = os.path.join(json_path, category + ".json")
        for image_url in extract_url_from_json(json_file):
            object_dict = detect_objects_scores(image_url)
            paths.append(image_url)
            score_dicts.append(object_dict)
            y_labels.append(category)
        print(category + " done")

    vectorizer = DictVectorizer(sparse=False)
    score_matrix = vectorizer.fit_transform(score_dicts)

    df = pd.DataFrame(data=score_matrix, columns=vectorizer.feature_names_)
    df["paths"] = paths
    df["category"] = y_labels
    return df

In [23]:
# Build the score table for all categories. This issues one
# object-localization request per image URL found in the category JSON files.
df = get_datascores_json(categories)

automotive done
beauty done
sport done
perfumes done
shoes done
military done
woman done
man done
kids done
digital_electronics done


In [24]:
# Preview the first rows of the score table.
df.head()

Unnamed: 0,Animal,Backpack,Bag,Baked goods,Ball,Balloon,Baseball glove,Basketball,Bear,Bicycle,...,Top,Toy,Truck,Vegetable,Weapon,Wheel,Window,Woman,paths,category
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://cdn.shopify.com/s/files/1/1330/0815/pr...,automotive
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://www.synopsys.com/content/dam/synopsys/...,automotive
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://www.nexusautomotiveinternational.eu/wp...,automotive
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://upload.wikimedia.org/wikipedia/commons...,automotive
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,https://appwebradar.com/wp-content/uploads/201...,automotive


In [25]:
# Keep only the score columns so that rows whose scores are all zero can be
# identified and removed.
temp = df.drop(columns=["paths", "category"])

In [26]:
# Keep the rows of df that have at least one non-zero value in temp.
new_df = df[temp.ne(0).any(axis=1)]

In [29]:
# Persist the filtered table. to_csv writes the index by default, and new_df
# keeps df's original row labels (the index is not reset after filtering),
# so the first CSV column holds those labels.
new_df.to_csv("DataScoreObjects100.csv")