In [18]:
import os
import io
import pandas as pd
import numpy as np

# Viz imports
import matplotlib.pyplot as plt

# Image processing imports
from PIL import Image
from google.cloud import vision
import cv2

In [2]:
# Authenticate the Vision API client.
# The key file location is read from the GOOGLE_APPLICATION_CREDENTIALS
# environment variable when it is set, so the notebook is not tied to a single
# machine; the original local path is kept as the fallback.
credentials_path = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    '/Users/nahiapeschard/code/fresh-office-384710-40d49749ad5a.json',
)
client = vision.ImageAnnotatorClient.from_service_account_json(credentials_path)

# Loading and splitting data

In [5]:
# Load the training metadata: a space-separated file whose "Id" column becomes the index.
X_train = pd.read_csv(
    "../../raw_data/train_x.csv",
    sep=" ",
    index_col="Id",
)

In [16]:
# Check how many rows and columns were loaded for the training set.
X_train.shape

(51300, 2)

In [6]:
# Preview the first rows of the training metadata.
X_train.head()

Unnamed: 0_level_0,Image_name,Title
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1588345297.jpg,With Schwarzkopf: Life Lessons of The Bear
1,1404803335.jpg,"Magnets: Pulling Together, Pushing Apart (Amaz..."
2,1446276082.jpg,Energy Security (SAGE Library of International...
3,1491522666.jpg,An Amish Gathering: Life in Lancaster County
4,0970096410.jpg,City of Rocks Idaho: A Climber's Guide (Region...


In [19]:
# Split the training frame into 100 batches so the labelling run can be
# checkpointed batch by batch.
# Row positions are split with NumPy and then used to slice the frame: calling
# np.split directly on a DataFrame relies on DataFrame.swapaxes, which pandas
# has deprecated, and raises when the row count is not divisible by the number
# of batches. array_split always yields exactly 100 (near-)equal batches.
X_train_chunks = [
    X_train.iloc[positions]
    for positions in np.array_split(np.arange(len(X_train)), 100)
]

In [58]:
# Load the test metadata: a space-separated file whose "Id" column becomes the index.
X_test = pd.read_csv(
    "../../raw_data/test_x.csv",
    sep=" ",
    index_col="Id",
)

In [59]:
# Check how many rows and columns were loaded for the test set.
X_test.shape

(5700, 2)

In [60]:
# Preview the first rows of the test metadata.
X_test.head()

Unnamed: 0_level_0,Image_name,Title
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,044310073X.jpg,Oral and Maxillofacial Surgery: An Objective-B...
1,1438005687.jpg,"Barron's GRE, 21st Edition"
2,0060750715.jpg,George Balanchine: The Ballet Maker (Eminent L...
3,1580237959.jpg,"A Partner in Holiness: Deepening Mindfulness, ..."
4,0135137829.jpg,Construction Scheduling: Principles and Practi...


In [61]:
# Split the test frame into 10 batches so the labelling run can be
# checkpointed batch by batch.
# Row positions are split with NumPy and then used to slice the frame: calling
# np.split directly on a DataFrame relies on DataFrame.swapaxes, which pandas
# has deprecated, and raises when the row count is not divisible by the number
# of batches. array_split always yields exactly 10 (near-)equal batches.
X_test_chunks = [
    X_test.iloc[positions]
    for positions in np.array_split(np.arange(len(X_test)), 10)
]

# API calls

In [30]:
def _default_vision_client():
    """Return one shared ``ImageAnnotatorClient``, creating it on first use."""
    if not hasattr(_default_vision_client, "_client"):
        _default_vision_client._client = vision.ImageAnnotatorClient()
    return _default_vision_client._client


def detect_labels(path, client=None):
    """Detect labels in an image file with the Google Cloud Vision API.

    Args:
        path: Path of the image file to annotate.
        client: Optional ``vision.ImageAnnotatorClient`` used to send the
            request. When omitted, a default client is created once and
            reused by later calls instead of being rebuilt for every image.

    Returns:
        list[str]: The ``description`` of every label annotation in the
        response, in the order the API returned them.

    Raises:
        Exception: If the response carries an error message.
    """
    if client is None:
        client = _default_vision_client()

    with open(path, "rb") as image_file:
        content = image_file.read()

    response = client.label_detection(image=vision.Image(content=content))

    # Surface an API-level error before touching the annotations, so a failed
    # request is never mistaken for an image with no labels.
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )

    return [label.description for label in response.label_annotations]

In [57]:
# Create an empty DataFrame to store the extracted text
for i in range(17, 100):
    print(f"Processing batch {i}")
    data = []
    X_train_chunks_i = X_train_chunks[i].reset_index()
    for j in range(len(X_train_chunks_i)):
        image_path = os.path.join("../../raw_data/Images/", X_train_chunks_i['Image_name'][j])
        labels = detect_labels(image_path)
        data.append({'Image_name': X_train_chunks_i['Image_name'][j], 'Google_labels': labels})

    df = pd.DataFrame(data)
    df.to_csv(f"../../raw_data/Labels_csv/X_train_batch_{i}.csv")

Processing batch 17
Processing batch 18
Processing batch 19
Processing batch 20
Processing batch 21
Processing batch 22
Processing batch 23
Processing batch 24
Processing batch 25
Processing batch 26
Processing batch 27
Processing batch 28
Processing batch 29
Processing batch 30
Processing batch 31
Processing batch 32
Processing batch 33
Processing batch 34
Processing batch 35
Processing batch 36
Processing batch 37
Processing batch 38
Processing batch 39
Processing batch 40
Processing batch 41
Processing batch 42
Processing batch 43
Processing batch 44
Processing batch 45
Processing batch 46
Processing batch 47
Processing batch 48
Processing batch 49
Processing batch 50
Processing batch 51
Processing batch 52
Processing batch 53
Processing batch 54
Processing batch 55
Processing batch 56
Processing batch 57
Processing batch 58
Processing batch 59
Processing batch 60
Processing batch 61
Processing batch 62
Processing batch 63
Processing batch 64
Processing batch 65
Processing batch 66


In [62]:
# Create an empty DataFrame to store the extracted text
for i in range(10):
    print(f"Processing batch {i}")
    data = []
    X_test_chunks_i = X_test_chunks[i].reset_index()
    for j in range(len(X_test_chunks_i)):
        image_path = os.path.join("../../raw_data/Images/", X_test_chunks_i['Image_name'][j])
        labels = detect_labels(image_path)
        data.append({'Image_name': X_test_chunks_i['Image_name'][j], 'Google_labels': labels})

    df = pd.DataFrame(data)
    df.to_csv(f"../../raw_data/Labels_csv/X_test_batch_{i}.csv")

Processing batch 0
Processing batch 1
Processing batch 2
Processing batch 3
Processing batch 4
Processing batch 5
Processing batch 6
Processing batch 7
Processing batch 8
Processing batch 9


In [67]:
# Stitch the ten per-batch label files back together with a single concat;
# concatenating inside the loop re-copies the accumulated frame on every pass.
X_test_google_labels = pd.concat(
    [
        pd.read_csv(f"../../raw_data/Labels_csv/X_test_batch_{i}.csv", index_col=0)
        for i in range(10)
    ],
    axis=0,
)

# reset_index() is deliberately used without drop=True: it moves the per-batch
# row numbers into an "index" column, which a later cell removes explicitly.
X_test_google_labels = X_test_google_labels.reset_index()

(5700, 3)


Unnamed: 0,index,Image_name,Google_labels
0,0,044310073X.jpg,"['Liquid', 'Font', 'Automotive lighting', 'Geo..."
1,1,1438005687.jpg,"['Product', 'Font', 'Publication', 'Recipe', '..."
2,2,0060750715.jpg,"['Sleeve', 'Tie', 'Gesture', 'Font', 'Book', '..."
3,3,1580237959.jpg,"['Font', 'Poster', 'Event', 'Advertising', 'El..."
4,4,0135137829.jpg,"['Computer', 'Personal computer', 'Laptop', 'F..."


In [69]:
# Remove the helper "index" column, then report the size and show the last rows.
X_test_google_labels = X_test_google_labels.drop('index', axis=1)
test_label_dims = X_test_google_labels.shape
print(test_label_dims)
X_test_google_labels.tail(n=5)

(5700, 2)


Unnamed: 0,Image_name,Google_labels
5695,0399174680.jpg,"['Sky', 'Building', 'Property', 'Cloud', 'Plan..."
5696,1939454654.jpg,"['Azure', 'Font', 'Aqua', 'Book', 'Electric bl..."
5697,1604601604.jpg,"['Building', 'Property', 'Window', 'Wheel', 'F..."
5698,0399166726.jpg,"['Tire', 'Product', 'Font', 'Automotive tire',..."
5699,0062073508.jpg,"['Poster', 'Font', 'Publication', 'Advertising..."


In [70]:
# Persist the combined test-set labels as a single CSV.
test_labels_csv = "../../raw_data/Labels_csv/X_test_google_labels.csv"
X_test_google_labels.to_csv(test_labels_csv)

In [71]:
# Stitch the 100 per-batch label files back together with a single concat;
# concatenating inside the loop re-copies the accumulated frame on every pass.
# ignore_index=True gives the result a fresh 0..n-1 index directly, replacing
# the reset_index() + drop(columns=['index']) pair.
X_train_google_labels = pd.concat(
    [
        pd.read_csv(f"../../raw_data/Labels_csv/X_train_batch_{i}.csv", index_col=0)
        for i in range(100)
    ],
    axis=0,
    ignore_index=True,
)

In [72]:
# Report the size of the combined training labels and show the last rows.
train_label_dims = X_train_google_labels.shape
print(train_label_dims)
X_train_google_labels.tail(n=5)

(51300, 2)


Unnamed: 0,Image_name,Google_labels
51295,0399176055.jpg,"['Amber', 'Font', 'Publication', 'Automotive e..."
51296,0719816629.jpg,"['Font', 'Rectangle', 'Electric blue', 'Signag..."
51297,0385353596.jpg,"['Liquid', 'Drinkware', 'Bottle', 'Fluid', 'Gl..."
51298,1475988559.jpg,"['Atmosphere', 'Windmill', 'Sky', 'Wind farm',..."
51299,045147483X.jpg,"['Motor vehicle', 'Publication', 'Poster', 'Ve..."


In [73]:
# Persist the combined training-set labels as a single CSV.
train_labels_csv = "../../raw_data/Labels_csv/X_train_google_labels.csv"
X_train_google_labels.to_csv(train_labels_csv)

# Concatenating

In [45]:
# Combine the first ten training batch files (batches 0-9) with a single
# concat; concatenating inside the loop re-copies the accumulated frame on
# every pass. The files are read without index_col, so the saved row numbers
# arrive as an "Unnamed: 0" column that a later cell drops.
df_concat = pd.concat(
    [
        pd.read_csv(f"../../raw_data/Labels_csv/X_train_batch_{i}.csv")
        for i in range(10)
    ],
    axis=0,
)

In [50]:
# Give the combined frame a fresh index; the previous row labels are kept as an
# "index" column, which a later cell drops.
# NOTE(review): this reassigns df_concat in place, so the cell is meant to be run once per build.
df_concat = df_concat.reset_index()

In [53]:
# Discard the two leftover row-number columns so only the image name and labels remain.
leftover_columns = ['index', 'Unnamed: 0']
df_concat = df_concat.drop(leftover_columns, axis=1)

In [54]:
# Preview the first rows of the combined sample.
df_concat.head()

Unnamed: 0,Image_name,Google_labels
0,1588345297.jpg,"['Publication', 'Font', 'Book', 'Terrestrial p..."
1,1404803335.jpg,"['Automotive lighting', 'Font', 'Rectangle', '..."
2,1446276082.jpg,"['Rectangle', 'Book', 'Font', 'Publication', '..."
3,1491522666.jpg,"['Plant', 'Font', 'Publication', 'Book', 'Rect..."
4,0970096410.jpg,"['Cloud', 'Sky', 'World', 'Natural landscape',..."


In [55]:
# Check the number of rows and columns in the combined sample.
df_concat.shape

(5130, 2)

In [56]:
# Persist the combined sample of the first ten training batches as a CSV.
sample_csv = "../../raw_data/Labels_csv/X_train_batch_10%.csv"
df_concat.to_csv(sample_csv)