<a href="https://colab.research.google.com/github/HarshiniKomali/SharpEye/blob/main/item_to_item/item_MF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Item to Item MF for outfit recommendation
The workflow will be as follows
1. Convert all items into a scene.
2. Pass the scene into a MF factorization that is inverted to output an item
3. return top 10 items

## Update the item to recommend

In [1]:
#test shirt = d4ad28d585788409ab2774b3780b6ce9
source_item = "d4ad28d585788409ab2774b3780b6ce9"


## Prerequisite
This is to create the dictionary that will convert a item into a scene

In [2]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 5.2 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633995 sha256=3144186ffb698f6413628910d1981ab7d09599f32fa5c7b885552d6b3adf0905
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [3]:
#function to convert ID to URL (stolen from data source)
def convert_to_url(signature):
    prefix = 'http://i.pinimg.com/400x/%s/%s/%s/%s.jpg'
    return prefix % (signature[0:2], signature[2:4], signature[4:6], signature)

In [4]:
import csv
import copy

with open('fashion.csv', 'r') as f:
    
    csv_reader = csv.DictReader(f)
    list_of_items = list(csv_reader)

scene_list = {}

for item in list_of_items:
    productID = item.get('ProductID') 
    
    #if the product key exist
    if productID in scene_list:
        buffer = scene_list[productID]
        buffer.append(item.get('SceneID').strip())

    #if the product key is missing
    else:
        scene_list[productID] = [item.get('SceneID').strip()]

master_scene_list = copy.deepcopy(scene_list)


In [5]:
#this is for the items dictionary
import csv
with open('working_category.csv', mode='r', encoding='utf-8-sig') as f:
    
    category_reader = csv.DictReader(f)
    category_of_items = list(category_reader)

category_list = {}

for item in category_of_items:
    category = item.get('Scene') 
    
    #if the product key exist
    if category in category_list:
        buffer = category_list[category]
        buffer.append(item.get('ProductID').strip())

    #if the product key is missing
    else:
        category_list[category] = [item.get('ProductID').strip()]

In [6]:
#to create a translation table for item to item type
item_to_category_dictionary = {}
for row in category_of_items:
  item_to_category_dictionary[row["ProductID"]] = row['Scene']

In [7]:
import pandas
import surprise
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV
from tables.tests import test_suite
from collections import defaultdict

import numpy

## Loading the dataset into memory

In [8]:
#import the data
raw_core = pandas.read_csv('fashion.csv')

#cleaning the data of whitespace
raw_core["ProductID"] = raw_core["ProductID"].str.strip()
raw_core["SceneID"] = raw_core["SceneID"].str.strip()

In [9]:
#adding dummy rating for sorting
raw_core["Rating"] = 1

## Adding Obfuscation Information
need to add random zero values to prevent the matrix from just zeroing out.

In [10]:
# query
obfuscation_percentage = 0.30
obfuscation_cycles = 3

# need to define a function that will add 
def obfuscate(sample_frame, percentage, cycles):
  for i in range(0, cycles):
    #np arrays
    unique_products = sample_frame["ProductID"].unique()
    unique_scenes = raw_core["SceneID"].unique()

    #panda dataframes
    df_uproducts = pandas.DataFrame(unique_products)
    df_uscenes = pandas.DataFrame(unique_scenes)

    #getting only a subset of the dataframes to work with
    product_subset = df_uproducts.sample(frac = obfuscation_percentage)

    #print(product_subset)
    for item in product_subset[0]:
      #getting a random scene as a DF
      random_scene_df = df_uscenes.sample()

      # converting to string
      random_scene = random_scene_df.iloc[0][0]

      #Getting the list from the dictionary to check against
      buffer_scenes = scene_list[item]

      #Check if item is there, if add it
      if random_scene in buffer_scenes:
        #this case is only if the item exist as a pair already
        print("Collision PID:" + item + ": SID:" + random_scene + ":")

      else:
        #print("before update " + str(scene_list[item]))
        #add scene to dictionary
        scene_list[item].append(random_scene.strip())

        #print("after update " + str(scene_list[item]))
        #add scene to dataframe as zero
        raw_core.loc[len(raw_core.index)] = [item, random_scene, 0]

In [11]:
#this will generate the new array based upon the item name.
#1. get the item class
item_class = item_to_category_dictionary[source_item]

#2. create the dataframe with the sample subset
list_of_values = category_list[item_class]
subset_df = raw_core[raw_core['ProductID'].isin(list_of_values)]


obfuscate(subset_df, obfuscation_percentage, obfuscation_cycles)

## Loading the subsets into memory

In [12]:
reader = Reader(rating_scale=(0,1))
surprise_subset = Dataset.load_from_df(subset_df,reader)

## Training and Predicting the models


In [14]:
#training the shirt model
subset_model = surprise.SVD(n_factors=10,reg_all=0.05, n_epochs=30)
suprirse_data = surprise_subset.build_full_trainset()
subset_model.fit(suprirse_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f1addc1dd10>

This function is to return the first x items that have a perfect hit for matches.



In [15]:
def first_hit_list(algo, length, ):

  #prereq getting a unique productIDs
  unique_products = subset_df["ProductID"].unique()

  #getting the source item list of scenes
  list_of_scenes = master_scene_list[source_item]

  #buffer to hold the output values
  output_dataframe = pandas.DataFrame(columns = ['ProductID', 'r_value'])

  for scene in list_of_scenes:
    for product in unique_products:
      if product != source_item:
        output = algo.predict(product, scene)

        output_dataframe = output_dataframe.append({'ProductID' : output[0], 'r_value': output[3]}, ignore_index = True)

  output_dataframe = output_dataframe.sort_values(by='r_value', ascending=False)
  top_ten_df = output_dataframe.head(10)
  print(top_ten_df)

  return top_ten_df['ProductID'].tolist()

In [16]:
top_ten = first_hit_list(subset_model, 10)


                             ProductID  r_value
5925  2f01a6600295076a0cf7b81e2c6797e1      1.0
2896  ffbdff62e62baeb77410a25d3e6ed56a      1.0
6759  79340d2048d98e809e0279126cd42b40      1.0
2880  bfd4b50c6a8864a828f998394a91e3f8      1.0
6757  33a0811b752f4abf19b2c2bc91d23c9d      1.0
2883  c6cc34ecc78c88df270b03c0a07074bd      1.0
6756  32b91d24ad941175e70cc0999c8369bb      1.0
2886  cc93803cf90e3903d14c0b3b39cfd1a8      1.0
2887  d4669c61b054328ec87e6a21e79b5954      1.0
6135  da04c6dca6c453293602fd6a156bd470      1.0


## Results

In [17]:
print("orginal item " + convert_to_url(source_item))

for item in top_ten:
  print(convert_to_url(item))

orginal item http://i.pinimg.com/400x/d4/ad/28/d4ad28d585788409ab2774b3780b6ce9.jpg
http://i.pinimg.com/400x/2f/01/a6/2f01a6600295076a0cf7b81e2c6797e1.jpg
http://i.pinimg.com/400x/ff/bd/ff/ffbdff62e62baeb77410a25d3e6ed56a.jpg
http://i.pinimg.com/400x/79/34/0d/79340d2048d98e809e0279126cd42b40.jpg
http://i.pinimg.com/400x/bf/d4/b5/bfd4b50c6a8864a828f998394a91e3f8.jpg
http://i.pinimg.com/400x/33/a0/81/33a0811b752f4abf19b2c2bc91d23c9d.jpg
http://i.pinimg.com/400x/c6/cc/34/c6cc34ecc78c88df270b03c0a07074bd.jpg
http://i.pinimg.com/400x/32/b9/1d/32b91d24ad941175e70cc0999c8369bb.jpg
http://i.pinimg.com/400x/cc/93/80/cc93803cf90e3903d14c0b3b39cfd1a8.jpg
http://i.pinimg.com/400x/d4/66/9c/d4669c61b054328ec87e6a21e79b5954.jpg
http://i.pinimg.com/400x/da/04/c6/da04c6dca6c453293602fd6a156bd470.jpg


In [None]:
#For random ID to picutre
ID_to_convert = 'f1861731387291383b3a1ea65aa2789b'

print(convert_to_url(ID_to_convert))

http://i.pinimg.com/400x/f1/86/17/f1861731387291383b3a1ea65aa2789b.jpg
