In [21]:
import ast
import json
import pickle
import string
from io import BytesIO

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from PIL import Image, ImageEnhance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator


In [22]:
# Load the dataset and drop every row that contains a missing value.
df = pd.read_csv("A2_Data.csv")
df = df.dropna()
# Fetch the NLTK resources needed by the text preprocessing defined later in
# this notebook (tokenization, stop-word list, lemmatization).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nalishjain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nalishjain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nalishjain/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
# Peek at the first rows to check the available columns.
df.head()

Unnamed: 0,Id,Image,Review Text
0,3452,['https://images-na.ssl-images-amazon.com/imag...,Loving these vintage springs on my vintage str...
1,1205,['https://images-na.ssl-images-amazon.com/imag...,Works great as a guitar bench mat. Not rugged ...
2,1708,['https://images-na.ssl-images-amazon.com/imag...,We use these for everything from our acoustic ...
3,2078,['https://images-na.ssl-images-amazon.com/imag...,Great price and good quality. It didn't quite...
4,801,['https://images-na.ssl-images-amazon.com/imag...,I bought this bass to split time as my primary...


In [24]:
# Build the lookup table used by every later cell:
#   running position (0, 1, 2, ...) -> [list of image URLs, review text]
# Keys are positions in the cleaned frame, not the values of the `Id` column.
image_text_dict = {}
for entry_id, (urls, text) in enumerate(zip(df['Image'], df['Review Text'])):
    # The Image column holds the textual repr of a Python list of URLs
    # (single-quoted strings). ast.literal_eval parses that form directly,
    # instead of swapping quote characters and hoping the result is valid JSON
    # (which breaks as soon as a URL contains a quote character).
    image_text_dict[entry_id] = [ast.literal_eval(urls), text]
print(len(image_text_dict))

999


In [25]:
def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors.

    The result is the dot product divided by the product of the two
    Euclidean norms. If either vector has zero norm the similarity is
    reported as 0 instead of dividing by zero.
    """
    numerator = np.dot(vector1, vector2)
    norm_a = np.linalg.norm(vector1)
    norm_b = np.linalg.norm(vector2)

    # A zero-length vector has no direction; treat it as "not similar".
    if not (norm_a and norm_b):
        return 0
    return numerator / (norm_a * norm_b)

def find_top_similar_keys(query_vector, vectors_dict, top_n=3):
    """Rank the keys of ``vectors_dict`` by similarity to ``query_vector``.

    Each key maps to an iterable of vectors; a key's score is the largest
    cosine similarity between the query and any of its vectors, starting
    from a floor of 0 (so a key with no vectors scores 0).

    Returns a pair ``(top_keys, top_scores)`` holding the ``top_n`` best keys
    in descending score order and their scores in the same order.
    """
    best_by_key = {}

    for key, candidates in vectors_dict.items():
        best = 0
        for candidate in candidates:
            best = max(cosine_similarity(query_vector, candidate), best)
        best_by_key[key] = best

    # Stable sort: keys with equal scores keep their dictionary order.
    ranked = sorted(best_by_key.items(), key=lambda pair: pair[1], reverse=True)[:top_n]
    top_keys = [key for key, _score in ranked]
    top_scores = [score for _key, score in ranked]

    return top_keys, top_scores

def find_top_similar_keys_2(img_query_vector, text_query_vector, img_dict, text_dict, top_n=3):
    """Rank keys by the average of image similarity and text similarity.

    For every key of ``img_dict`` the text score is the cosine similarity
    between ``text_query_vector`` and ``text_dict[key][0]``. The key's
    composite score is the largest value of ``(image score + text score) / 2``
    over the key's image vectors, starting from a floor of 0.

    Returns ``(top_keys, top_scores)`` for the ``top_n`` best keys, in
    descending composite-score order.
    """
    composite_by_key = {}

    for key, img_vectors in img_dict.items():
        best = 0
        # The text score is the same for every image of this key.
        text_score = cosine_similarity(text_query_vector, text_dict[key][0])
        for img_vector in img_vectors:
            candidate = (cosine_similarity(img_query_vector, img_vector) + text_score)/2
            best = max(candidate, best)
        composite_by_key[key] = best

    ranked_keys = sorted(composite_by_key, key=composite_by_key.get, reverse=True)
    top_keys = ranked_keys[:top_n]

    return top_keys, [composite_by_key[key] for key in top_keys]

In [26]:
def preprocess_image(img_url, factor = 1.2, timeout = 30):
    """Download an image and turn it into a model-ready batch of one.

    Steps: download, force 3-channel RGB, resize to 224x224, scale brightness
    by ``factor``, apply the flip augmentation, then run the VGG16
    ``preprocess_input``.

    Parameters
    ----------
    img_url : str
        URL of the image to download.
    factor : float
        Brightness factor passed to ``ImageEnhance.Brightness.enhance``.
    timeout : float
        Seconds to wait for the HTTP request before giving up, so that one
        unresponsive server cannot hang the whole run.

    Returns
    -------
    The preprocessed array with a leading batch axis of size 1
    (presumably shaped (1, 224, 224, 3) -- confirm against the model input).

    Raises
    ------
    requests.RequestException
        If the download fails or times out.
    PIL.UnidentifiedImageError
        If the downloaded bytes are not a readable image.

    NOTE(review): the horizontal/vertical flips are applied at random on every
    call, so the same URL can yield different arrays (and different features)
    from one call to the next. Confirm this is wanted when extracting features
    for retrieval.
    """
    response = requests.get(img_url, timeout=timeout)
    # Grayscale, palette or RGBA images would otherwise produce arrays with a
    # channel count the 3-channel model cannot accept.
    img = Image.open(BytesIO(response.content)).convert("RGB")

    img = img.resize((224, 224))
    img = ImageEnhance.Brightness(img).enhance(factor)

    datagen = ImageDataGenerator(horizontal_flip=True, vertical_flip=True)

    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    # Use the builtin next(): the iterator's own .next() method is not
    # available in every Keras version.
    img_array = next(datagen.flow(img_array))
    img_array = preprocess_input(img_array)

    return img_array

def extract_image_features(model, img_array):
    """Run ``model.predict`` on ``img_array`` and return the result flattened to 1-D."""
    return model.predict(img_array).flatten()

def preprocess_text(text):
    """Normalize a review into a space-separated string of lemmatized tokens.

    Steps: lowercase, tokenize, drop punctuation-only tokens, drop English
    stop words, lemmatize, and join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned tokens joined by spaces (empty string if nothing is left).
    """
    tokens = word_tokenize(text.lower())

    # Removing punctuation: drop every token made up only of punctuation
    # characters. Testing `token in string.punctuation` would be a substring
    # test against one long string, which lets multi-character tokens such as
    # "...", "``", "''" or "--" slip through into the vocabulary.
    punctuation_chars = set(string.punctuation)
    tokens = [
        token for token in tokens
        if not all(char in punctuation_chars for char in token)
    ]

    # Stop word removal
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

In [27]:
# Keep, for every entry, only the image URLs that can be downloaded and opened
# as an image. Entries are filtered in place; no entry is removed.
invalid_url_ids = []  # ids left without a single loadable image URL
for i, entry in image_text_dict.items():
    valid_urls = []
    for img_url in entry[0]:
        try:
            # The timeout keeps one unresponsive server from stalling the loop.
            response = requests.get(img_url, timeout=30)
            # Opening is enough to check the bytes are an image; close it
            # right away so nothing stays open.
            with Image.open(BytesIO(response.content)):
                pass
            valid_urls.append(img_url)
        except Exception:
            # Network and decoding failures both mean "unusable URL".
            # KeyboardInterrupt is deliberately not caught, so the loop can
            # still be stopped by hand.
            print("Invalid Url", i)
    entry[0] = valid_urls
    if not valid_urls:
        invalid_url_ids.append(i)
print(len(image_text_dict))

Invalid Url 67
Invalid Url 67
Invalid Url 110
Invalid Url 110
Invalid Url 523
Invalid Url 701
Invalid Url 859
Invalid Url 935
999


In [28]:
# Load VGG16 with its ImageNet weights, then build a second model that shares
# the same input but stops at the 'fc2' layer. The output of that layer is
# what the rest of the notebook uses as the image feature vector.
base_model = VGG16(weights='imagenet')
image_model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)


In [29]:
# Extract one feature vector per valid image URL, L2-normalize the vectors,
# group them by entry id and persist the result.
url_ids = list(image_text_dict.keys())

# Parallel lists: img_feature_urls[k] is the entry id owning img_features[k].
img_features = []
img_feature_urls = []

for url_id in url_ids:
    for img_url in image_text_dict[url_id][0]:
        img_array = preprocess_image(img_url)
        img_feature = extract_image_features(image_model, img_array)
        # reshape(4096) doubles as a check on the expected feature length.
        img_features.append(img_feature.reshape(4096))
        img_feature_urls.append(url_id)

# Scale every row (one image) to unit L2 norm.
img_features = normalize(np.array(img_features), norm='l2', axis=1)

# entry id -> list of its normalized feature vectors, in extraction order.
# Ids with no valid image URL never appear as keys.
img_features_dict = {}
for url_id, img_feature in zip(img_feature_urls, img_features):
    img_features_dict.setdefault(url_id, []).append(img_feature)

with open('img_features_dict.pkl', 'wb') as f:
    pickle.dump(img_features_dict, f)



In [30]:
# Number of distinct entry ids that have at least one stored image feature vector.
print(len(set(img_features_dict)))

993


In [31]:
# Entry ids present in image_text_dict but absent from img_features_dict,
# i.e. entries for which no image feature vector was stored.
url_ids = set(image_text_dict.keys())
url_ids.difference(set(img_features_dict.keys()))

{67, 110, 523, 701, 859, 935}

In [32]:
# Build a dense TF-IDF vector for the review of every entry that has image
# features, and persist the vectors.
url_ids = list(img_features_dict.keys())
text_reviews = {}
for url_id in url_ids:
    text_reviews[url_id] = preprocess_text(image_text_dict[url_id][1])

tf = {}            # entry id -> {word: raw count of the word in that review}
idf = {}           # word -> number of reviews containing the word (document frequency)
tf_idf = {}        # entry id -> [dense TF-IDF vector], wrapped in a one-element list
word_id = {}       # word -> column index in the TF-IDF vectors
word_id_iter = 0   # next free column index

# Pass 1: document frequencies, vocabulary indices and per-review term counts.
for url_id, review in text_reviews.items():
    word_list = review.split()

    for word in set(word_list):
        if word in idf:
            idf[word] += 1
        else:
            # First time this word is seen anywhere: give it a column.
            idf[word] = 1
            word_id[word] = word_id_iter
            word_id_iter += 1

    term_counts = {}
    for word in word_list:
        term_counts[word] = term_counts.get(word, 0) + 1
    tf[url_id] = term_counts

# Pass 2: weight = log(N / document frequency) * raw term count.
# NOTE(review): N is len(image_text_dict) (every entry), whereas the document
# frequencies above are counted only over text_reviews (entries that have image
# features). user_input uses the same N for the query vector.
for url_id, term_counts in tf.items():
    vector = np.zeros(shape = (len(idf)))
    for word, term_count in term_counts.items():
        vector[word_id[word]] = np.log(len(image_text_dict)/idf[word])*term_count
    tf_idf[url_id] = [vector]

with open('tf_idf.pkl', 'wb') as f:
    pickle.dump(tf_idf, f)


In [33]:
# Reload the two artifacts written by the earlier cells of this notebook.
# Only these two objects are persisted: `idf` and `word_id`, which user_input
# also reads, are not saved and must still be in memory from the TF-IDF cell.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
file_path = 'img_features_dict.pkl'
with open(file_path, 'rb') as pickle_file:
    img_features_dict = pickle.load(pickle_file)

file_path = 'tf_idf.pkl'
with open(file_path, 'rb') as pickle_file:
    tf_idf = pickle.load(pickle_file)

In [34]:
def _query_text_vector(text):
    """Return the TF-IDF vector of a query text, expressed in the corpus vocabulary."""
    text_vector = np.zeros(shape = (len(idf)))

    freq_words = {}
    for word in preprocess_text(text).split():
        # Words never seen in the corpus have no column and no document
        # frequency; they cannot contribute to the similarity, so skip them
        # instead of failing with a KeyError.
        if word in word_id:
            freq_words[word] = freq_words.get(word, 0) + 1

    for word, freq in freq_words.items():
        text_vector[word_id[word]] = freq*np.log(len(image_text_dict)/idf[word])

    return text_vector


def _best_image_score(img_vector, url_id):
    """Return the highest cosine similarity between the query image and any image of ``url_id``."""
    return max(
        (cosine_similarity(img_vector, vector) for vector in img_features_dict[url_id]),
        default=0,
    )


def _print_results(title, url_ids, img_vector, text_vector, show_composite=False):
    """Print one ranked result section: URLs, review and similarity scores per entry."""
    print(title)
    for rank, url_id in enumerate(url_ids, start=1):
        # Same definition as the ranking functions: best image of the entry,
        # not merely its first image.
        img_score = _best_image_score(img_vector, url_id)
        text_score = cosine_similarity(text_vector, tf_idf[url_id][0])

        print(f"{rank}. Image Url :", image_text_dict[url_id][0])
        print("   Review : ", image_text_dict[url_id][1])
        print("   Cosine score of images : ", img_score)
        print("   Cosine score of text : ", text_score)
        if show_composite:
            print(" Composite score : ", (img_score + text_score)/2)


def user_input(img_url, text):
    """Retrieve and print the best matches for an (image, review) query.

    Three rankings are printed, each listing up to three entries: by image
    similarity, by text similarity, and by the average of both.

    Relies on notebook-level state: ``image_model``, ``img_features_dict``,
    ``tf_idf``, ``idf``, ``word_id`` and ``image_text_dict`` must already be
    defined.

    Parameters
    ----------
    img_url : str
        URL of the query image.
    text : str
        Query review text. Words that do not occur in the corpus are ignored.

    Returns
    -------
    None. The results are printed.
    """
    img_array = preprocess_image(img_url)
    img_vector = extract_image_features(image_model, img_array).reshape(4096)
    text_vector = _query_text_vector(text)

    # Only the ranked ids are kept; scores are recomputed when printing so all
    # three sections report them the same way.
    top_url_ids_img = find_top_similar_keys(img_vector, img_features_dict)[0]
    top_url_ids_text = find_top_similar_keys(text_vector, tf_idf)[0]
    top_url_ids_composite = find_top_similar_keys_2(img_vector, text_vector, img_features_dict, tf_idf)[0]

    _print_results("Using Image Retreival", top_url_ids_img, img_vector, text_vector)

    print()
    _print_results("Using text Retreival", top_url_ids_text, img_vector, text_vector)

    print()
    _print_results("Using Composite Retreival", top_url_ids_composite, img_vector, text_vector, show_composite=True)


In [35]:
# Example query: an image URL and a review text, run through all three
# retrieval modes (image, text, composite).
test_url = 'https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg'
test_text = 'I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break.'
user_input(test_url, test_text)

Using Image Retreival
1. Image Url : ['https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg']
   Review :  I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break.
   Cosine score of images :  0.77935535
   Cosine score of text :  1.0
2. Image Url : ['https://images-na.ssl-images-amazon.com/images/I/719-SDMiOoL._SY88.jpg']
   Review :  These locking tuners look great and keep tune.  Good quality materials and construction.  Excellent upgrade to any guitar.  I had to drill additions holes for installation.  If your neck already comes with pre-drilled holes, then they should drop right in, otherwise you will need to buy a guitar tuner pin drill jig, also available from Amazon.
   Cosine score of images :  0.6798063
   Cosine score of text :  0.1041278203226515
3. Image Url : ['https://images-na.ssl-images-amazon.com/images/I/711kGbkdzEL._SY88.jpg']
   

In [36]:
# example_img_url = image_text_dict[70][0]
# preprocessed_img_array = preprocess_image(example_img_url, factor = 1.2)
# print(preprocessed_img_array.shape)
# # Convert the preprocessed image array back to a PIL Image for visualization
# preprocessed_img = image.array_to_img(preprocessed_img_array[0])

# # Display the original and preprocessed images
# plt.figure(figsize=(8, 4))
# plt.subplot(1, 2, 1)
# plt.title("Original Image")
# plt.imshow(Image.open(BytesIO(requests.get(example_img_url).content)))
# plt.axis("off")

# plt.subplot(1, 2, 2)
# plt.title("Preprocessed Image")
# plt.imshow(preprocessed_img)
# plt.axis("off")
# plt.show()