# Code and Comments

In [None]:
import cv2
import string
import urllib.request
import numpy as np
import pandas as pd
from tqdm import tqdm


from datasketch import MinHash, MinHashLSH
from keras.applications.vgg19 import VGG19
from keras.applications.vgg19 import preprocess_input
from nltk import FreqDist, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Output files; find_all_duplicates appends one line per processed group to each of them.
PROCESSED_GROUPS = 'Processed groups.txt'
LSH_RESULTS = 'LSH File.txt'
LSH_IMAGE_RESULTS = 'Lsh_Similar_images.txt'

# Cache of image features keyed by image URL (filled by extract_features).
image_feature = dict()

# Translation table that deletes every ASCII punctuation character.
table = str.maketrans('', '', string.punctuation)

# VGG19 with ImageNet weights and no classifier head; used to turn images into features.
model = VGG19(include_top=False, weights='imagenet')

In [None]:
def ngrams(text, n=3):
    """Split ``text`` into its overlapping, consecutive n-grams.

    Each n-gram is the concatenation of ``n`` neighbouring items of ``text``
    (characters when ``text`` is a string). Inputs shorter than ``n`` give ``[]``.
    """
    # text[0:], text[1:], ... text[n-1:]; zipping them lines up every window of n items.
    shifted_views = (text[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted_views)))

In [None]:
def download_image(img_url):
    """Fetch the image at ``img_url`` and prepare it as a 224x224x3 array.

    If ``img_url`` is already a key of the module-level ``image_feature`` cache,
    the cached *features* are returned instead of pixels and nothing is downloaded
    (``extract_features`` returns the same cached value for that URL).

    Otherwise the image is downloaded (30 s timeout), decoded, converted to
    grayscale, replicated to three identical channels and resized to 224x224.

    Any failure (bad URL, network error, undecodable data) yields an all-zero
    (black) 224x224x3 array instead of raising.
    """
    if img_url in image_feature:
        return image_feature[img_url]

    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(img_url, timeout=30) as url_response:
            raw_bytes = url_response.read()
        img_array = np.frombuffer(raw_bytes, dtype=np.uint8)
        # IMREAD_COLOR always yields a 3-channel 8-bit BGR image, so grayscale
        # files no longer fail in cvtColor and fall through to the black image.
        img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
        # OpenCV decodes to BGR channel order, hence BGR2GRAY (not RGB2GRAY).
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        img = np.dstack((img, img, img))
        return cv2.resize(img, (224, 224))
    except Exception:
        # Deliberate best-effort: a missing/broken image is replaced by a black one.
        return np.zeros((224, 224, 3))

def extract_features(img_url, img):
    """Return the VGG19 (Visual Geometry Group 19) features of ``img``.

    Results are memoised in the module-level ``image_feature`` dict under
    ``img_url``; when the URL is already cached, ``img`` is ignored and the
    cached value is returned without running the model.
    """
    if img_url in image_feature:
        return image_feature[img_url]

    # Add a leading axis to form a batch of one, then apply VGG19 preprocessing.
    batch = preprocess_input(np.expand_dims(img, axis=0))
    image_feature[img_url] = model.predict(batch)
    return image_feature[img_url]

def compare_images(image_url, ids, other_urls, threshold=0.45):
    """Return the ids whose image is similar to the image at ``image_url``.

    Parameters
    ----------
    image_url : URL of the reference image.
    ids : identifiers of the candidates; ``ids[i]`` belongs to ``other_urls[i]``.
    other_urls : image URLs of the candidates, in the same order as ``ids``.
    threshold : a candidate matches when the cosine similarity of its features
        with the reference features is strictly greater than this value.

    Returns
    -------
    list
        The matching entries of ``ids`` in their original order (``[]`` when
        there are no candidates).

    Raises
    ------
    ValueError
        If ``ids`` and ``other_urls`` differ in length.

    Images are fetched with ``download_image`` and converted to feature vectors
    with ``extract_features``.
    """
    ids = list(ids)
    other_urls = list(other_urls)
    if len(ids) != len(other_urls):
        raise ValueError('ids and other_urls must have the same length')
    if not other_urls:
        # Nothing to compare against: skip downloading the reference image.
        return []

    main_vec = np.expand_dims(
        extract_features(image_url, download_image(image_url)).flatten(), axis=0)
    other_vecs = [extract_features(url, download_image(url)).flatten()
                  for url in other_urls]

    # cosine_similarity returns shape (1, n_candidates); row 0 holds the scores.
    # A boolean mask on that row selects exactly the matching ids. np.argwhere on
    # the 2-D matrix would also yield the row index 0 and so always add ids[0].
    similarities = cosine_similarity(main_vec, other_vecs)[0]
    return np.asarray(ids)[similarities > threshold].tolist()

In [None]:
def apply_lsh(group, col, threshold=0.9, num_perm=256):
    """Index the text column ``col`` of ``group`` in a MinHash LSH.

    Each row's text is split into character 3-grams (via ``ngrams``) which are
    hashed into a ``MinHash``; the MinHash is inserted into the LSH under the
    row's ``productId``.

    Parameters
    ----------
    group : DataFrame with a ``productId`` column and the text column ``col``.
    col : name of the text column to hash.
    threshold : similarity threshold passed to ``MinHashLSH``.
    num_perm : number of permutations used by both ``MinHash`` and ``MinHashLSH``.

    Returns
    -------
    (lsh, minhashes)
        The populated ``MinHashLSH`` and a dict mapping productId -> MinHash.
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    minhashes = {}
    # Pair ids and texts positionally. This avoids Series.iteritems() (removed in
    # pandas 2.0) and a .loc lookup that misbehaves when the index is not unique.
    for product_id, text in zip(group['productId'], group[col]):
        minhash = MinHash(num_perm=num_perm)
        for gram in ngrams(text, 3):
            minhash.update(gram.encode('utf-8'))
        lsh.insert(key=product_id, minhash=minhash)
        minhashes[product_id] = minhash
    return lsh, minhashes

In [None]:
def word_frequency(column):
    """Count how often each token occurs across all rows of ``column``.

    Every row is tokenised with ``word_tokenize`` and the tokens of all rows are
    tallied together in a single ``FreqDist``.
    """
    return FreqDist(token for row in column for token in word_tokenize(row))

In [None]:
def remove_freq_words(text, frequent_words):
    """Drop every whitespace-separated word of ``text`` found in ``frequent_words``.

    The membership test uses the lower-cased word; the remaining words keep their
    original case and are re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in frequent_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
def find_all_duplicates(group):
    """Find likely duplicate products inside one (productBrand, sub_category1) group.

    Meant to be passed to ``groupby(...).apply``; ``group.name`` is read as the
    group key. Steps:

    1. Lower-case the text columns, strip punctuation and concatenate them.
    2. Remove the 5 most frequent words of the group from the concatenated text.
    3. Index the texts with ``apply_lsh`` and query every product against the LSH
       to get its textual near-duplicates.
    4. For products with textual candidates, keep the candidates whose image is
       also similar according to ``compare_images``.

    Side effects: appends the image-confirmed matches to ``LSH_IMAGE_RESULTS``,
    the textual matches to ``LSH_RESULTS`` and the group key to
    ``PROCESSED_GROUPS``; empties the ``image_feature`` cache; prints progress.
    The input frame is not modified. Returns ``None``.
    """
    text_cols = ['key_specs_text', 'description', 'title']

    # Read the group key before copying, in case the copy does not carry the attribute.
    group_name = group.name
    print(f'{group_name}: {group.shape}')

    # Work on a copy so the frame handed over by groupby is never mutated.
    group = group.copy()

    # Remove punctuation
    for col in text_cols:
        group[col] = group[col].fillna('').str.lower().str.translate(table)

    # Concatenate text
    group['full_text'] = (group['key_specs_text'].astype(str) + ' '
                          + group['description'].astype(str) + ' '
                          + group['title'].astype(str))
    group['imageUrl'] = group['imageUrl'].fillna('')

    # Keep only the words (not the (word, count) pairs), otherwise the membership
    # test in remove_freq_words never matches and nothing is removed.
    word_freq = word_frequency(group['full_text'])
    frequent_words = {word for word, _count in word_freq.most_common(5)}
    group['full_text'] = group['full_text'].apply(
        lambda text: remove_freq_words(text, frequent_words))
    lsh, minhashes = apply_lsh(group, 'full_text')

    # One lookup table instead of scanning the frame for every product.
    url_by_id = dict(zip(group['productId'], group['imageUrl']))

    similar_images = {}
    duplicate_dict = {}
    # For each productId, compare its image with those of the LSH candidates.
    for key, minhash in minhashes.items():
        # De-duplicate while keeping a deterministic order; drop the product itself.
        candidates = [pid for pid in dict.fromkeys(lsh.query(minhash)) if pid != key]
        duplicate_dict[key] = candidates
        if not candidates:
            continue
        # URLs are looked up per candidate so ids and URLs stay aligned.
        candidate_urls = [url_by_id[pid] for pid in candidates]
        matches = compare_images(url_by_id[key], candidates, candidate_urls)
        similar_images[key] = list(dict.fromkeys(matches))

    with open(LSH_IMAGE_RESULTS, 'a+') as f:
        print(similar_images, file=f)
    with open(LSH_RESULTS, 'a+') as f:
        print(duplicate_dict, file=f)
    with open(PROCESSED_GROUPS, 'a+') as p:
        print(group_name, file=p)

    # Reset the shared feature cache between groups so it does not grow without
    # bound (the original assigned to a misspelled local, which had no effect).
    image_feature.clear()
    print('Done')

In [None]:
group_cols = ['productBrand', 'sub_category1']

df = pd.read_csv(r'Final Data.csv')
df = df.drop_duplicates(subset=['productId'])

# Log of the (productBrand, sub_category1) groups handled by earlier runs, one
# tuple per line as appended by find_all_duplicates; used to avoid re-processing.
processed_groups = []
try:
    with open(PROCESSED_GROUPS, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines
            # NOTE(review): eval() executes the log line as Python code. The log
            # is written by find_all_duplicates via print(group.name); if the keys
            # are plain literals, ast.literal_eval would be a safer parser.
            processed_groups.append(eval(line))
except FileNotFoundError:
    # First run: nothing has been processed yet.
    pass

if processed_groups:
    # Match on the exact (brand, category) pair. Testing the two columns
    # independently would also drop unprocessed combinations of a processed
    # brand with a processed category.
    processed_lookup = set(processed_groups)
    already_done = np.array(
        [key in processed_lookup for key in zip(df['productBrand'], df['sub_category1'])],
        dtype=bool)
    df = df[~already_done]

tqdm.pandas()
print(df.shape)

# Groups with a single product cannot contain duplicates.
df = df.groupby(group_cols).filter(lambda group: len(group) > 1)
grouped = df.groupby(group_cols)
grouped.progress_apply(find_all_duplicates)

### Function to compare two products

In [None]:
from datasketch import MinHash, MinHashLSH
import cv2, numpy as np, urllib, string, pandas as pd
from keras.applications.vgg19 import preprocess_input, VGG19
from sklearn.metrics.pairwise import cosine_similarity

# Headless ImageNet VGG19 and punctuation-stripping translation table, built the same
# way as the identically named objects in the setup cell near the top of the notebook.
model = VGG19(include_top=False, weights='imagenet')
table = str.maketrans('', '', string.punctuation)


In [None]:
### Compare two products: text similarity first, then image similarity
def compare_products(product1, product2):
    """Print 'Yes' if the two products look like the same product, else 'No'.

    ``product1`` and ``product2`` are pandas Series or dicts with the keys
    ``productId``, ``imageUrl``, ``key_specs_text``, ``description`` and
    ``title``. Missing values are treated as empty strings.

    Decision procedure:

    1. Identical, non-empty image URLs -> 'Yes'.
    2. Otherwise the three text fields are lower-cased, stripped of punctuation
       and concatenated; their character 3-grams are MinHashed and compared via
       a ``MinHashLSH`` with threshold 0.5. No textual match -> 'No'.
    3. On a textual match both image URLs are printed, the images are fetched
       with ``download_image`` / ``extract_features`` and the answer is 'Yes'
       when the cosine similarity of their features exceeds 0.45, else 'No'.

    The result is only printed; the function returns ``None``.
    """
    # pd.Series(...) lets plain dicts be passed as the docstring promises.
    product1 = pd.Series(product1).fillna('')
    product2 = pd.Series(product2).fillna('')
    url1 = product1['imageUrl']
    url2 = product2['imageUrl']

    # Two missing URLs are both '' after fillna and must not count as "same image".
    if url1 and url1 == url2:
        print('Yes')
        return

    text_cols = ['key_specs_text', 'description', 'title']
    signatures = []
    for product in (product1, product2):
        # Lower-case to match the text preparation in find_all_duplicates.
        full_text = ''.join(
            ' ' + str(product[col]).lower().translate(table) for col in text_cols)
        minhash = MinHash(num_perm=256)
        for gram in ngrams(full_text, 3):
            minhash.update(gram.encode('utf-8'))
        signatures.append(minhash)
    m1, m2 = signatures

    # Index product1 and ask whether product2's signature retrieves it.
    id1 = product1['productId']
    lsh = MinHashLSH(threshold=0.5, num_perm=256)
    lsh.insert(id1, m1)
    if id1 not in lsh.query(m2):
        print('No')
        return

    print(url1)
    print(url2)

    vec1 = np.expand_dims(extract_features(url1, download_image(url1)).flatten(), axis=0)
    vec2 = np.expand_dims(extract_features(url2, download_image(url2)).flatten(), axis=0)
    # cosine_similarity returns a 1x1 matrix here; take the scalar explicitly.
    similarity = cosine_similarity(vec1, vec2)[0, 0]
    print('Yes' if similarity > 0.45 else 'No')



