In [1]:
!pip list

Package                Version
---------------------- --------------
async-generator        1.10
attrs                  20.3.0
backcall               0.2.0
bleach                 3.2.1
certifi                2020.11.8
chardet                3.0.4
cycler                 0.10.0
decorator              4.4.2
defusedxml             0.6.0
dill                   0.3.3
entrypoints            0.3
idna                   2.10
imageio                2.9.0
importlib-metadata     3.3.0
ipykernel              5.4.2
ipython                7.19.0
ipython-genutils       0.2.0
ipywidgets             7.5.1
jedi                   0.17.2
Jinja2                 2.11.2
joblib                 0.17.0
jsonschema             3.2.0
jupyter-client         6.1.7
jupyter-core           4.7.0
jupyterlab-pygments    0.1.2
kiwisolver             1.3.1
MarkupSafe             1.1.1
matplotlib             3.3.3
mistune                0.8.4
nbclient               0.5.1
nbconvert              6.0.7
nbformat               5.0

In [2]:
!python -m pip install -U scikit-image

Requirement already up-to-date: scikit-image in /opt/venv/lib/python3.7/site-packages (0.18.1)
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
#Setup

#Import statements
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from skimage.io import imread

#Plt
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later removed the old names, so use the legacy name only when this
# Matplotlib still ships it.
grid_style = 'seaborn-whitegrid'
if grid_style not in plt.style.available:
    grid_style = 'seaborn-v0_8-whitegrid'
plt.style.use(grid_style)

# Set Matplotlib defaults
plt.rc('figure', autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=18, titlepad=10)
plt.rc('animation', html='html5')

In [4]:
#File paths and names of data
!ls /datasets/ebay-data

mlchallenge_set_2021.tsv  mlchallenge_set_validation.tsv


In [5]:
# Column names are supplied explicitly because the file is read with
# header=None (no header row is taken from the TSV).
cNames = [
    "Category",
    "Primary Image",
    "Other Images",
    "Attributes",
    "Index",
]
dataset = pd.read_csv(
    '/datasets/ebay-data/mlchallenge_set_2021.tsv',
    sep='\t',
    header=None,
    names=cNames,
)

In [6]:
dataset.head(5)

Unnamed: 0,Category,Primary Image,Other Images,Attributes,Index
0,2,https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/iYYA...,https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/iYYA...,"(Brand:Shimano,US Shoe Size (Men's):4.5,Modifi...",0
1,2,https://i.ebayimg.com/00/s/MTA1OFgxMTM0/z/KPIA...,https://i.ebayimg.com/00/s/MTA1OFgxMTM0/z/KPIA...,"(Color:Gray/White,Country/Region of Manufactur...",1
2,2,https://i.ebayimg.com/00/s/MTIwMFgxNjAw/z/flIA...,https://i.ebayimg.com/00/s/MTIwMFgxNjAw/z/flIA...,"(Style:Cleats,Color:White Orange,US Shoe Size ...",2
3,2,http://i.ebayimg.com/00/s/ODAwWDEwNjc=/z/XHcAA...,http://i.ebayimg.com/00/s/ODAwWDEwNjc=/z/XHcAA...,"(Width:Medium (D, M),US Size:9,Brand:VANS,Colo...",3
4,2,https://i.ebayimg.com/00/s/MTA2N1gxNjAw/z/scsA...,https://i.ebayimg.com/00/s/MTA2N1gxNjAw/z/scsA...,"(US Shoe Size (Men's):10.5,Material:Enter item...",4


In [7]:
# Promote the 'Index' column to the row index. The guard keeps this cell safe
# to re-run: once 'Index' has been moved it is no longer a column, and an
# unconditional set_index would raise KeyError.
if 'Index' in dataset.columns:
    dataset = dataset.set_index('Index')
dataset.head(5)

Unnamed: 0_level_0,Category,Primary Image,Other Images,Attributes
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2,https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/iYYA...,https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/iYYA...,"(Brand:Shimano,US Shoe Size (Men's):4.5,Modifi..."
1,2,https://i.ebayimg.com/00/s/MTA1OFgxMTM0/z/KPIA...,https://i.ebayimg.com/00/s/MTA1OFgxMTM0/z/KPIA...,"(Color:Gray/White,Country/Region of Manufactur..."
2,2,https://i.ebayimg.com/00/s/MTIwMFgxNjAw/z/flIA...,https://i.ebayimg.com/00/s/MTIwMFgxNjAw/z/flIA...,"(Style:Cleats,Color:White Orange,US Shoe Size ..."
3,2,http://i.ebayimg.com/00/s/ODAwWDEwNjc=/z/XHcAA...,http://i.ebayimg.com/00/s/ODAwWDEwNjc=/z/XHcAA...,"(Width:Medium (D, M),US Size:9,Brand:VANS,Colo..."
4,2,https://i.ebayimg.com/00/s/MTA2N1gxNjAw/z/scsA...,https://i.ebayimg.com/00/s/MTA2N1gxNjAw/z/scsA...,"(US Shoe Size (Men's):10.5,Material:Enter item..."


In [8]:
cols = dataset.columns.tolist()
cols 

['Category', 'Primary Image', 'Other Images', 'Attributes']

In [9]:
# Move 'Category' (the label) to the end. Deriving the order from the frame
# itself, rather than rotating the previous `cols`, gives the same result no
# matter how many times this cell is executed.
cols = [name for name in dataset.columns if name != 'Category'] + ['Category']
cols

['Primary Image', 'Other Images', 'Attributes', 'Category']

In [10]:
dataset = dataset[cols]
dataset.head(5)

Unnamed: 0_level_0,Primary Image,Other Images,Attributes,Category
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/iYYA...,https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/iYYA...,"(Brand:Shimano,US Shoe Size (Men's):4.5,Modifi...",2
1,https://i.ebayimg.com/00/s/MTA1OFgxMTM0/z/KPIA...,https://i.ebayimg.com/00/s/MTA1OFgxMTM0/z/KPIA...,"(Color:Gray/White,Country/Region of Manufactur...",2
2,https://i.ebayimg.com/00/s/MTIwMFgxNjAw/z/flIA...,https://i.ebayimg.com/00/s/MTIwMFgxNjAw/z/flIA...,"(Style:Cleats,Color:White Orange,US Shoe Size ...",2
3,http://i.ebayimg.com/00/s/ODAwWDEwNjc=/z/XHcAA...,http://i.ebayimg.com/00/s/ODAwWDEwNjc=/z/XHcAA...,"(Width:Medium (D, M),US Size:9,Brand:VANS,Colo...",2
4,https://i.ebayimg.com/00/s/MTA2N1gxNjAw/z/scsA...,https://i.ebayimg.com/00/s/MTA2N1gxNjAw/z/scsA...,"(US Shoe Size (Men's):10.5,Material:Enter item...",2


In [11]:
#Function to parse attributes, to be tested

import re

def parse_attributes(attrib):
    """Parse an attribute string like ``(Key:Value,Key 2:Value 2)`` into a dict.

    Parentheses are stripped everywhere (so ``US Shoe Size (Men's)`` becomes
    ``US Shoe Size Men's``), the text is split on runs of colons, and within
    every segment except the last the text after the final comma is taken as
    the next key while everything before it is the previous key's value.
    Values may therefore contain commas (``Medium D, M``), including the value
    of the final attribute.

    Parameters
    ----------
    attrib : str
        Raw attribute text. Non-string input (e.g. ``None`` or a NaN from a
        missing cell) is treated as "no attributes".

    Returns
    -------
    dict
        Mapping of attribute name to attribute value, both as strings with
        their original whitespace. A later duplicate key overwrites an
        earlier one.

    Notes
    -----
    A value that itself contains a colon cannot be told apart from a key
    boundary; such input yields misaligned pairs.
    """
    # Missing attributes arrive as None/NaN rather than text; re.sub would
    # raise TypeError on them.
    if not isinstance(attrib, str):
        return {}

    #Remove parentheses
    attrib = re.sub('[()]', '', attrib)

    #Split by colon(s)
    segments = re.split(':+', attrib)
    # e.g. ["Colors", "blue,white,Special Note", "very nice,Style", "Modern"]

    keys = []
    values = []
    last = len(segments) - 1
    for position, segment in enumerate(segments):
        if position == last:
            # Nothing follows the final colon, so the whole segment is the
            # last value -- commas included.
            values.append(segment)
        else:
            # "<previous value>,<next key>"; a segment without a comma (such
            # as the very first one) is a bare key.
            value, comma, key = segment.rpartition(',')
            keys.append(key)
            if comma:
                values.append(value)

    #Pair each key with its value
    return dict(zip(keys, values))
    

In [12]:
#Testing attribute parsing

# Build the one-column frame in a single step; the parser is handed to
# apply() directly instead of being wrapped in a lambda.
parsed_attributes = dataset['Attributes'].apply(parse_attributes)
mod_dataset = pd.DataFrame({'Attributes': parsed_attributes})
mod_dataset.head(20)
# mod_dataset.iloc[0]

Unnamed: 0_level_0,Attributes
Index,Unnamed: 1_level_1
0,"{'Brand': 'Shimano', 'US Shoe Size Men's': '4...."
1,"{'Color': 'Gray/White', 'Country/Region of Man..."
2,"{'Style': 'Cleats', 'Color': 'White Orange', '..."
3,"{'Width': 'Medium D, M', 'US Size': '9', 'Bran..."
4,"{'US Shoe Size Men's': '10.5', 'Material': 'En..."
5,"{'US Shoe Size Men's': '7', 'Brand': 'AMA BRAN..."
6,"{'Style': 'Athletic Sneakers', 'US Shoe Size M..."
7,"{'Style': 'Skateboarding', 'Brand': 'Nike', 'U..."
8,"{'Brand': 'Brooks', 'Material': 'Mesh / Leathe..."
9,"{'US Shoe Size Men's': '11', 'Brand': 'Ralph L..."


In [13]:
# untruncated dataframe

# None means "no column-width limit"; the former -1 sentinel is deprecated
# and triggers a FutureWarning.
pd.set_option('display.max_colwidth', None)
mod_dataset.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,Attributes
Index,Unnamed: 1_level_1
0,"{'Brand': 'Shimano', 'US Shoe Size Men's': '4.5', 'Modified Item': 'No', 'Style': 'Cycling'}"
1,"{'Color': 'Gray/White', 'Country/Region of Manufacture': 'Indonesia', 'US Shoe Size Men's': '8', 'Style': 'Athletic Sneakers', 'Brand': 'Giorgio Armani', 'Material': 'Leather Suede'}"
2,"{'Style': 'Cleats', 'Color': 'White Orange', 'US Shoe Size Men's': '15', 'Brand': 'Nike'}"
3,"{'Width': 'Medium D, M', 'US Size': '9', 'Brand': 'VANS', 'Color': 'Green', 'Material': 'Suede & Denim', 'Style': 'Skateboarding', 'Type': 'Athletic', 'Type 2': 'Skateboarding', 'US Shoe Size Men's': '9'}"
4,"{'US Shoe Size Men's': '10.5', 'Material': 'Enter item specific valuePLEASE READ BEFORE BIDDIN', 'Brand': 'Nike', 'Style': 'Basketball Shoes', 'Color': 'Multi-Color'}"


In [14]:
# import re
# #Experimenting

# sample_attributes = '(Colors:blue, white,Special Note::very nice,Style: Modern)'
# sample_attributes = re.sub('[()]', '', sample_attributes)
# print("parenthesis:",sample_attributes)

# sample_attributes = re.split(':+', sample_attributes)
# print("colon split:",sample_attributes)

# lsa = len(sample_attributes)
# for i in range(len(sample_attributes)):
#     sample_attributes[i] = sample_attributes[i].split(',') 
# print("Inner comma split:", sample_attributes)

# #Lists to hold terms
# nonLastTerms = []
# lastTerms = []
    
# #Split into terms

# for i in range(lsa):
#     if len(sample_attributes[i]) == 1 and i < lsa - 1:
#         lastTerms.append(sample_attributes[i][0])
#     elif len(sample_attributes[i]) == 1 and i == lsa - 1:
#         nonLastTerms.append(sample_attributes[i][-1])
#     elif len(sample_attributes[i]) > 1:
#         lastTerms.append(sample_attributes[i][-1])
#         nonLastTerms.append(sample_attributes[i][:-1])


# print("lastTerms:", lastTerms)
# print("nonLastTerms:", nonLastTerms)
# for i in range(len(nonLastTerms)):
#     nonLastTerms[i] = (",".join(nonLastTerms[i]) if type(nonLastTerms[i]) is list else nonLastTerms[i])

# print("nonLastTerms:", nonLastTerms)

# sample_attributes = dict(zip(lastTerms, nonLastTerms))

# print("Combining terms:", sample_attributes)






In [15]:
#Function to turn images to arrays

# def im2Array(url):
#     pass


In [16]:
#Experimentation

sample_url = 'https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/iYYAAOxydgZTJwYc/$_1.JPG?set_id=880000500F'

# imArray = imread(sample_url)
# print(imArray.shape) #400 x 300 image in RGB (3) == (400, 300, 3), This is 3D
# imArray2 = np.reshape(imArray, (1,-1))
# print(imArray2.shape) #1 x 360000 reshaped image, This is 2D
# imgplot = plt.imshow(imArray)


Image to array

In [17]:
# imports to turn image to array
from PIL import Image
import requests
from io import BytesIO
from skimage.transform import resize

# imports for imagenet models
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications.imagenet_utils import decode_predictions

from tensorflow.keras.applications import (
        vgg16,
        resnet50,
        mobilenet,
        inception_v3,
        nasnet,
        xception
)

In [18]:
# Function to turn images to array - Bryan
# def imgToArray(url):
#     # get image from url
#     response = requests.get(url)
#     original = Image.open(BytesIO(response.content))

#     # convert from PIL to numpy array
#     numpy_image = img_to_array(original)

#     # resize image to (224, 224, 3) -> models expect this input
#     numpy_image = resize(numpy_image, (224, 224))

#     # add another dimensions to the image -> models expects 4th batch dimension
#     image_batch = np.expand_dims(numpy_image, axis=0)

#     return image_batch

In [19]:
# sample_url = 'https://i.ebayimg.com/00/s/MTYwMFgxMjAw/z/iYYAAOxydgZTJwYc/$_1.JPG?set_id=880000500F'

# print(imgToArray(sample_url))

Image-net models

In [20]:
# init the models
# Every network is constructed with weights='imagenet'. The input shapes noted
# below are the ones listed in this notebook's "Model Dimensions Sheet".
# VGG16: (None, 224, 224, 3)
vgg_model = vgg16.VGG16(weights='imagenet')
# InceptionV3: (None, 299, 299, 3)
inception_model = inception_v3.InceptionV3(weights='imagenet')
# ResNet50: (None, 224, 224, 3)
resnet_model = resnet50.ResNet50(weights='imagenet')
# MobileNet: (None, 224, 224, 3)
mobilenet_model = mobilenet.MobileNet(weights='imagenet')
# NASNetLarge: (None, 331, 331, 3)
nasnetlarge_model = nasnet.NASNetLarge(weights='imagenet')
# Xception: (None, 299, 299, 3)
xception_model = xception.Xception(weights='imagenet')


## Model Dimensions Sheet

**_VGG:_ (None, 224, 224, 3)**  
**_Inception:_ (None, 299, 299, 3)**  
**_Resnet:_ (None, 224, 224, 3)**  
**_Mobilenet:_ (None, 224, 224, 3)**  
**_NASNetLarge:_ (None, 331, 331, 3)**  
**_Xception:_ (None, 299, 299, 3)**  


In [21]:
#Function to predict features of pictures from URL and then return the predicted features in a list of arrays
def predict_image_label(url, timeout=30):
    """Download an image and run it through each of the six ImageNet models.

    Parameters
    ----------
    url : str
        Address of the image to download.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up, so a stalled
        download cannot hang the notebook indefinitely.

    Returns
    -------
    list of numpy.ndarray
        One flattened prediction array per model, in this fixed order:
        VGG16, InceptionV3, ResNet50, MobileNet, NASNetLarge, Xception.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    #get image from url
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the decoder.
    response.raise_for_status()

    # Force three channels: PNGs may be RGBA, palette or greyscale, but every
    # model below expects (height, width, 3).
    original = Image.open(BytesIO(response.content)).convert('RGB')

    # convert from PIL to numpy array
    numpy_image = img_to_array(original)

    # (model, module providing preprocess_input, input side length), listed in
    # the order the predictions are returned.
    model_specs = [
        (vgg_model, vgg16, 224),
        (inception_model, inception_v3, 299),
        (resnet_model, resnet50, 224),
        (mobilenet_model, mobilenet, 224),
        (nasnetlarge_model, nasnet, 331),
        (xception_model, xception, 299),
    ]

    # Resize once per distinct input size and add the leading batch dimension
    # the models expect.
    batches = {}

    #Hold predictions of each model
    predictions = []
    for model, module, side in model_specs:
        if side not in batches:
            resized = resize(numpy_image, (side, side))
            batches[side] = np.expand_dims(resized, axis=0)

        # preprocess_input receives a copy so the shared batch stays intact
        # for the other models that use the same size.
        processed_image = module.preprocess_input(batches[side].copy())
        predictions.append(model.predict(processed_image).flatten())

    return predictions

In [22]:
# 'bedsheet' image
# predict_image_label(imgToArray('https://i.ebayimg.com/00/s/Nzc2WDgzMg==/z/jyQAAOSwBydbno6x/$_57.JPG?set_id=8800005007'))
# predict_image_label('https://i.ebayimg.com/00/s/MTIwMFgxNjAw/z/GSQAAOSw9m5b9cT6/$_57.JPG?set_id=8800005007')
# predict_image_label(imgToArray('https://i.ebayimg.com/00/s/NjUwWDY1MA==/z/eR0AAOSwp91ckSjP/$_1.JPG?set_id=880000500F'))

In [23]:
#Function to add predicted features of original dataset images to mod_dataset
def addPredictedFeatures(pdDataframe, predictor=None):
    """Return a copy of the dataframe with predicted image features added.

    For every row, the 'Primary Image' URL and each URL in the
    semicolon-separated 'Other Images' field are passed to ``predictor``.
    The results are stored in two new columns, 'Primary_Features' and
    'OtherI_Features'; the latter is then exploded so that each additional
    image gets its own row (the row's other values are repeated).

    Parameters
    ----------
    pdDataframe : pandas.DataFrame
        Frame with 'Primary Image' and 'Other Images' columns. It is not
        modified.
    predictor : callable, optional
        Function mapping one image URL to its features. Defaults to
        ``predict_image_label``.

    Returns
    -------
    pandas.DataFrame
        The augmented, exploded copy. A missing primary image yields
        ``[np.nan]`` in 'Primary_Features'; missing other images yield NaN in
        'OtherI_Features'.
    """
    if predictor is None:
        predictor = predict_image_label

    # Work on a copy so the caller's frame (which may be a slice such as
    # .head()) is left untouched.
    featured = pdDataframe.copy()

    primary_features = []
    otherI_features = []
    for _, row in featured.iterrows():
        #Primary image: a single URL
        primary_url = row['Primary Image']
        if pd.isnull(primary_url):
            primary_features.append([np.nan])
        else:
            primary_features.append(predictor(primary_url))

        #Other images: several URLs joined with ';' in one string
        other_urls = row['Other Images']
        if pd.isnull(other_urls):
            otherI_features.append([np.nan])
        else:
            otherI_features.append(
                [predictor(single_url) for single_url in other_urls.split(';') if single_url]
            )

    featured['Primary_Features'] = primary_features
    featured['OtherI_Features'] = otherI_features

    #Explode the column with the other image features
    return featured.explode('OtherI_Features')



In [25]:
# copy of the dataset to add features to

f_dataframe = dataset.copy() # f stands for features
# Cast only the non-missing URLs: a blanket astype(str) would turn NaN into
# the literal string 'nan', which pd.isnull() no longer reports as missing.
primary_urls = f_dataframe['Primary Image']
f_dataframe['Primary Image'] = primary_urls.where(primary_urls.isnull(), primary_urls.astype(str))

# addPredictedFeatures(f_dataframe.head())
# Preview a few rows only; printing every row floods the notebook output.
for other_urls in f_dataframe['Other Images'].head():
    print(other_urls)



https://i.ebayimg.com/00/s/MTYwMFgxMjIw/z/E7sAAOSwsV1dMOrI/$_57.JPG?set_id=8800005007;https://i.ebayimg.com/00/s/MTYwMFgxMjIw/z/9GoAAOSwyqddMOrG/$_57.JPG?set_id=8800005007;https://i.ebayimg.com/00/s/MTYwMFgxMjIw/z/MxkAAOSwi6tdMOrF/$_57.JPG?set_id=8800005007;https://i.ebayimg.com/00/s/MTYwMFgxNjAw/z/-v0AAOSwr6hdMWmH/$_12.JPG?set_id=880000500F;https://i.ebayimg.com/00/s/MTYwMFgxNjAw/z/K~8AAOSwMXZdMWno/$_12.JPG?set_id=880000500F;https://i.ebayimg.com/00/s/MTYwMFgxNjAw/z/-TkAAOSwW8ZdMWnm/$_12.JPG?set_id=880000500F;https://i.ebayimg.com/00/s/MTYwMFgxNjAw/z/oXoAAOSwLo9dMWmZ/$_12.JPG?set_id=880000500F;https://i.ebayimg.com/00/s/MTYwMFgxNjAw/z/N-wAAOSw3LxdMWnB/$_12.JPG?set_id=880000500F
https://i.ebayimg.com/00/s/NTMzWDU0OQ==/z/OWIAAOSwPl1dMapj/$_57.PNG?set_id=8800005007;https://i.ebayimg.com/00/s/NTM0WDU5MA==/z/6k0AAOSwP6hdMapg/$_57.PNG?set_id=8800005007;https://i.ebayimg.com/00/s/NTMzWDYwMA==/z/4hgAAOSwgYpdMapc/$_57.PNG?set_id=8800005007
https://i.ebayimg.com/00/s/MTYwMFgxNjAw/z/9N0AAOSwORdc

```python
df = pd.DataFrame(columns=['Name', 'Age'])
 
df.loc[1, 'Name'] = 'Rocky'
df.loc[1, 'Age'] = 23
```
---

```python
features_df = pd.DataFrame(columns=['Features'])

# key & features_pred variable come from the function
features_df.loc[key, 'Features'] = features_pred

# some pandas merging df function here --> features_df merged onto mod_dataset
```

In [None]:
# from sklearn import cluster

# # X =
# agglo = cluster.FeatureAgglomeration(n_clusters=5, affinity='manhattan')

In [None]:
#Turning Other Images into list of arrays
#Same as what we do for the primary image, but applied to every image in 'Other Images'


In [None]:
#Full custom transformer; should include all preprocessing steps as class methods

In [None]:
#Code skeleton for evaluation 

#Some function that takes in the predicted labels and ground truth labels
# def whatever (predictions, groundTruth)
    
    #Get the precision score with sklearn.metrics.precision_score

    #Get the recall score with sklearn.metrics.recall_score

    #Get the f1 score with sklearn.metrics.f1_score

    #Return the f1 score
