In [None]:
import pandas as pd
from PIL import Image
import requests

data = pd.read_csv("/content/drive/MyDrive/A2_Data.csv")

# Collect one record per validated image URL and build the frame once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies every accumulated
# row on each iteration.
image_records = []

for i, image_urls in enumerate(data["Image"]):
    try:
        # The "Image" cell holds a stringified list such as "['url1', 'url2']"
        url_lst = image_urls.strip('][').split(', ')
        rows = []
        for url in url_lst:
            url = url.strip("'")
            # Opening the image only validates the URL. The timeout keeps one
            # unresponsive host from stalling the whole run, and both handles
            # are closed as soon as the check is done.
            with requests.get(url, stream=True, timeout=30) as response:
                with Image.open(response.raw):
                    pass
            row = {
                # i is a positional counter, so use positional access throughout
                'ProductID': data.index[i],
                'Image': url,
                'Review Text': data['Review Text'].iloc[i]
            }
            rows.append(row)
        # Reached only when every URL of the product was valid; a failure on any
        # URL jumps to the except branch and drops the whole product.
        image_records.extend(rows)
    except Exception as e:
        print(f"Error index {i}: {e}") #printing and skiping productID having invalid image url

image_dataframe = pd.DataFrame(image_records, columns=["ProductID", "Image", "Review Text"])


Error index 67: cannot identify image file <_io.BytesIO object at 0x7aa038be2cf0>
Error index 110: cannot identify image file <_io.BytesIO object at 0x7aa03d4527a0>
Error index 523: cannot identify image file <_io.BytesIO object at 0x7aa03d503650>
Error index 701: cannot identify image file <_io.BytesIO object at 0x7aa03d503240>
Error index 860: cannot identify image file <_io.BytesIO object at 0x7aa03d5030b0>
Error index 936: cannot identify image file <_io.BytesIO object at 0x7aa0754f13a0>


In [None]:
import pickle
# Persist image_dataframe to disk for later use
with open('image_dataframe(inital).pkl', 'wb') as pickle_file:
    pickle.dump(image_dataframe, pickle_file)

In [None]:
import torch
import numpy as np
import pandas as pd
import requests
from torchvision import models, transforms
from PIL import Image
from io import BytesIO




# Deterministic preprocessing for feature extraction: convert the image to a tensor
# and normalize each channel with the mean/std values below.
# The random augmentations that used to be here (RandomHorizontalFlip, ColorJitter)
# are training-time tools: applied at extraction time they make the same image
# produce a different vector on every call, so an image compared with itself
# scores below 1.0.
# NOTE: feature vectors cached before this change were produced with the random
# augmentations; recompute them if exact consistency with new queries is required.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load pre-trained model (ResNet-18) and put it in evaluation mode
model = models.resnet18(pretrained = True)
model.eval()

# Function to preprocess the image, extract features, and normalize them
def extract_features_from_image(url):
    """Download the image at ``url`` and return its L2-normalized model output.

    Args:
        url: Address of the image to fetch.

    Returns:
        A flattened NumPy array holding the L2-normalized output of ``model`` for
        the image, or ``None`` when downloading, decoding or the forward pass
        fails (the error is printed).
    """
    try:
        # A timeout keeps one unresponsive host from hanging the notebook
        response = requests.get(url, timeout=30)
        # Fail on HTTP errors instead of trying to decode an error page as an image
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert("RGB")
        image_tensor = transform(image).unsqueeze(0)  # add the batch dimension
        with torch.no_grad():
            features = model(image_tensor)  # Extract features using the model
            features = torch.nn.functional.normalize(features, p=2, dim=1)  # L2-normalize each row
            features = features.squeeze().view(-1).numpy()  # Flatten the vector
        return features
    except Exception as e:
        print(f"Error processing image: {e}")
        return None

# Preprocess every dataset image and extract its features (left commented out; uncomment to compute the 'Features' column)
#image_dataframe['Features'] = image_dataframe['Image'].apply(extract_features_from_image)


In [None]:
import pickle
# Persist the updated image_dataframe (intended to carry the normalized image features).
# NOTE(review): the statement that adds the 'Features' column is commented out in the
# previous cell, so on a fresh run this writes the frame without features - confirm.
with open('image_features2.pkl', 'wb') as pickle_file:
    pickle.dump(image_dataframe, pickle_file)

In [None]:
import pickle
# Reload the initial image_dataframe (image and review text columns) from Drive.
# pickle.load executes code embedded in the file, so only load pickles you created yourself.
with open('/content/drive/MyDrive/image_dataframe(inital).pkl', 'rb') as pickle_file:
    image_dataframe_i = pickle.load(pickle_file)
image_dataframe_i

In [None]:
import pandas as pd
import numpy as np
import math
import pickle
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from bs4 import BeautifulSoup

# Fetch the NLTK resources needed for tokenizing, stop-word filtering and lemmatizing
for nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)


# Extract the text content between HTML tags
def remove_tags(html):
    """Return the visible text of ``html`` with <style> and <script> elements removed.

    The remaining text fragments are whitespace-stripped and joined with single spaces.
    """
    parsed = BeautifulSoup(html, "html.parser")
    # Drop elements whose content is not visible text
    for element in parsed.find_all(['style', 'script']):
        element.decompose()
    text_fragments = parsed.stripped_strings
    return ' '.join(text_fragments)

# Module-level NLP helpers: the lemmatizer and the English stop-word set are used by
# preprocess_text; the stemmer is created but not used by the code in this cell.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Function for text preprocessing
def preprocess_text(text):
    """Turn raw review text into a list of lemmatized tokens.

    Steps: strip HTML via ``remove_tags``, lowercase, tokenize, keep purely
    alphabetic tokens, drop English stop words, lemmatize what remains.
    """
    lowered = remove_tags(text).lower()

    lemmas = []
    for token in word_tokenize(lowered):
        if not token.isalpha():
            continue  # punctuation, numbers, mixed tokens
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Review texts of the initial image_dataframe; missing reviews become empty strings
text_data = image_dataframe_i['Review Text'].fillna('').tolist()

# Run every review through the text preprocessing pipeline
tokenized_texts = list(map(preprocess_text, text_data))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  soup = BeautifulSoup(html, "html.parser")


In [None]:
import pickle
# Persist the tokenized reviews
with open('tokenized_texts.pkl', 'wb') as pickle_file:
    pickle.dump(tokenized_texts, pickle_file)

In [None]:
import pandas as pd

# Create a single-column DataFrame whose 'Tokenized_Text' column holds the tokenized reviews
df_tokenized_texts = pd.DataFrame({'Tokenized_Text': tokenized_texts})

# Display the DataFrame (the last expression of a cell is rendered as its output)
df_tokenized_texts


In [None]:
import pandas as pd
import numpy as np
import math

# Create a DataFrame for tokenized texts.
# NOTE: this repeats the construction done in the previous cell; the TF/IDF helpers
# below read the 'Tokenized_Text' column of this frame.
df_tokenized_texts = pd.DataFrame({'Tokenized_Text': tokenized_texts})

# Calculate Term Frequency (TF)
def calculate_tf(df):
    """Compute per-document term frequencies.

    Args:
        df: DataFrame whose 'Tokenized_Text' column holds a list of tokens per row.

    Returns:
        Dict mapping each row index to ``{word: count / total_tokens}``.
        A row with no tokens maps to an empty dict.
    """
    # Local import so the function does not depend on another cell's imports
    from collections import Counter

    tf_scores = {}
    for idx, row in df.iterrows():
        tokens = row['Tokenized_Text']
        # Counter replaces a throw-away pd.Series per document: much cheaper, and
        # it avoids the pandas warning raised when an empty token list is wrapped
        # in a Series.
        word_counts = Counter(tokens)
        total_words = sum(word_counts.values())
        tf_scores[idx] = {word: count / total_words for word, count in word_counts.items()}
    return tf_scores

# Calculate Inverse Document Frequency (IDF)
def calculate_idf(df):
    """Compute a smoothed inverse document frequency for every word in the corpus.

    Args:
        df: DataFrame whose 'Tokenized_Text' column holds a list of tokens per row.

    Returns:
        Dict mapping each word to ``log(N / (document_frequency + 1))`` where N is
        the number of rows in ``df``.
    """
    corpus_size = len(df)

    # Number of documents in which each word occurs at least once
    containing_docs = {}
    for _, record in df.iterrows():
        for term in set(record['Tokenized_Text']):
            if term in containing_docs:
                containing_docs[term] += 1
            else:
                containing_docs[term] = 1

    idf_scores = {}
    for term, doc_count in containing_docs.items():
        idf_scores[term] = math.log(corpus_size / (doc_count + 1))
    return idf_scores

# Calculate TF-IDF scores
def calculate_tf_idf(df, tf_scores, idf_scores):
    """Combine term frequencies and inverse document frequencies per document.

    Args:
        df: DataFrame whose 'Tokenized_Text' column holds a list of tokens per row.
        tf_scores: Dict of row index -> {word: term frequency}.
        idf_scores: Dict of word -> inverse document frequency.

    Returns:
        Dict of row index -> {word: tf * idf}; a word missing from either input
        contributes a factor of 0.
    """
    weighted = {}
    for doc_id, record in df.iterrows():
        weighted[doc_id] = {
            term: tf_scores[doc_id].get(term, 0) * idf_scores.get(term, 0)
            for term in record['Tokenized_Text']
        }
    return weighted

# Per-document term frequencies
tf_scores = calculate_tf(df_tokenized_texts)

# Corpus-wide inverse document frequencies
idf_scores = calculate_idf(df_tokenized_texts)

# Per-document TF-IDF weights
tf_idf_scores = calculate_tf_idf(df_tokenized_texts, tf_scores, idf_scores)

# Build the TF-IDF matrix: the dict-of-dicts yields one column per document, so
# fill the missing words with 0 and transpose to get one row per document.
tf_idf_df = pd.DataFrame(tf_idf_scores).fillna(0).T

  word_counts = pd.Series(row['Tokenized_Text']).value_counts().to_dict()


In [None]:
# Display the TF-IDF matrix built in the previous cell
tf_idf_df

In [None]:
import pickle
# Persist the TF-IDF matrix
with open('tf-idf_feature.pkl', 'wb') as pickle_file:
    pickle.dump(tf_idf_df, pickle_file)

In [None]:
import pickle
# Reload the image feature frame from Drive.
# pickle.load executes code embedded in the file, so only load pickles you created yourself.
with open('/content/drive/MyDrive/image_features2.pkl', 'rb') as pickle_file:
    image_features = pickle.load(pickle_file)
image_features

In [None]:
import pickle
# Reload the TF-IDF matrix from Drive
with open('/content/drive/MyDrive/tf-idf_feature.pkl', 'rb') as pickle_file:
    tf_idf = pickle.load(pickle_file)
#tf_idf

In [None]:
import pandas as pd
import pickle
# Reload the tokenized reviews from Drive
with open('/content/drive/MyDrive/tokenized_texts.pkl', 'rb') as pickle_file:
    token = pickle.load(pickle_file)

# Wrap the token lists in a single-column DataFrame
df_tokenized_texts = pd.DataFrame({'Tokenized_Text': token})

# Display the DataFrame
#df_tokenized_texts

In [None]:
import numpy as np
import pandas as pd
import math
import pickle

#cosine similarity function
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors as a plain float.

    Both inputs are flattened first, so a (1, d) row vector can be compared with
    a (d,) vector. Without flattening, ``np.dot`` of such a pair returns a
    one-element array and the score is printed as ``[0.94]`` instead of ``0.94``.

    Args:
        v1, v2: Array-likes with the same number of elements.

    Returns:
        The cosine of the angle between the vectors, or 0.0 when either has zero
        magnitude.
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()
    magnitude_v1 = np.linalg.norm(a)
    magnitude_v2 = np.linalg.norm(b)

    # Avoid division by zero
    if magnitude_v1 == 0 or magnitude_v2 == 0:
        return 0.0

    return float(np.dot(a, b) / (magnitude_v1 * magnitude_v2))

image_urls = []
url_input = input("Enter image URL: ").strip() # input url
review_input = input("Enter Text Review: ").strip() #input Text review
doc1 = preprocess_text(review_input)

# TF calculation for the query review (an empty token list yields an empty dict)
tf = [{word: doc1.count(word) / len(doc1) for word in doc1}]

# Document frequency of every word in the stored corpus
df = {}
for doc in df_tokenized_texts['Tokenized_Text']:
    for word in set(doc):
        df[word] = df.get(word, 0) + 1

# IDF calculation (inverse document frequency).
# This must be the same formula as calculate_idf - log(N / (freq + 1)) with N the
# number of tokenized documents - otherwise the query vector is weighted on a
# different scale than the corpus vectors it is compared against.
num_documents = len(df_tokenized_texts)
idf = {word: math.log(num_documents / (freq + 1)) for word, freq in df.items()}

#TF-IDF calculation
tf_idf_doc1 = [{word: freq * idf.get(word, 0) for word, freq in doc.items()} for doc in tf]

#TF calculation function
# NOTE: an identical calculate_tf is defined in an earlier cell; this later
# definition shadows it when both cells are run.
def calculate_tf(df):
    """Compute per-document term frequencies.

    Args:
        df: DataFrame whose 'Tokenized_Text' column holds a list of tokens per row.

    Returns:
        Dict mapping each row index to ``{word: count / total_tokens}``.
        A row with no tokens maps to an empty dict.
    """
    # Local import so the function does not depend on another cell's imports
    from collections import Counter

    tf_scores = {}
    for idx, row in df.iterrows():
        tokens = row['Tokenized_Text']
        # Counter replaces a throw-away pd.Series per document: much cheaper, and
        # it avoids the pandas warning raised when an empty token list is wrapped
        # in a Series.
        word_counts = Counter(tokens)
        total_words = sum(word_counts.values())
        tf_scores[idx] = {word: count / total_words for word, count in word_counts.items()}
    return tf_scores

#IDF calculation function
# NOTE: an identical calculate_idf is defined in an earlier cell; this later
# definition shadows it when both cells are run.
def calculate_idf(df):
    """Compute a smoothed inverse document frequency for every word in the corpus.

    Args:
        df: DataFrame whose 'Tokenized_Text' column holds a list of tokens per row.

    Returns:
        Dict mapping each word to ``log(N / (document_frequency + 1))`` where N is
        the number of rows in ``df``.
    """
    corpus_size = len(df)

    # Number of documents in which each word occurs at least once
    containing_docs = {}
    for _, record in df.iterrows():
        for term in set(record['Tokenized_Text']):
            if term in containing_docs:
                containing_docs[term] += 1
            else:
                containing_docs[term] = 1

    idf_scores = {}
    for term, doc_count in containing_docs.items():
        idf_scores[term] = math.log(corpus_size / (doc_count + 1))
    return idf_scores

#TF-IDF calculation
# NOTE: an identical calculate_tf_idf is defined in an earlier cell; this later
# definition shadows it when both cells are run.
def calculate_tf_idf(df, tf_scores, idf_scores):
    """Combine term frequencies and inverse document frequencies per document.

    Args:
        df: DataFrame whose 'Tokenized_Text' column holds a list of tokens per row.
        tf_scores: Dict of row index -> {word: term frequency}.
        idf_scores: Dict of word -> inverse document frequency.

    Returns:
        Dict of row index -> {word: tf * idf}; a word missing from either input
        contributes a factor of 0.
    """
    weighted = {}
    for doc_id, record in df.iterrows():
        weighted[doc_id] = {
            term: tf_scores[doc_id].get(term, 0) * idf_scores.get(term, 0)
            for term in record['Tokenized_Text']
        }
    return weighted

# Align the query's TF-IDF dict with the corpus vocabulary (the columns of tf_idf):
# words unknown to the corpus are dropped, corpus words absent from the query get 0.
max_features = len(tf_idf.columns)
for doc in tf_idf_doc1:
    known = {word: weight for word, weight in doc.items() if word in tf_idf.columns}
    padding = {word: 0 for word in tf_idf.columns if word not in known}
    # Rebuild the same dict object in place: kept words first, then the zero padding
    doc.clear()
    doc.update(known)
    doc.update(padding)

# Accept either a single URL or a bracketed, comma-separated list such as
# "['url1', 'url2']" (the format of the dataset's "Image" column).
if url_input.startswith("[") and url_input.endswith("]"):
    url_input = url_input[1:-1]
    candidate_urls = url_input.split(",")
else:
    candidate_urls = [url_input]

for url in candidate_urls:
    # Remove surrounding whitespace and quotes, and skip blanks so inputs like
    # "[]" or "a,,b" never produce an empty URL that fails to download later.
    url = url.strip().strip("'\"").strip()
    if url:
        image_urls.append(url)

#Images feature extraction for input URL
# extract_features_from_image returns None for a URL it could not process; keep only
# the usable vectors so one bad URL does not crash the scoring loop below.
query_image_vectors = [
    vector
    for vector in (extract_features_from_image(url) for url in image_urls)
    if vector is not None
]
if image_urls and not query_image_vectors:
    print("Warning: no query image could be processed; image scores are set to 0.")
query_review_vector = pd.DataFrame(tf_idf_doc1, columns=tf_idf.columns)


#Cosine similarities for images
# Each entry is (row position, mean image score, review score, composite score).
image_similarities = []
for i, feature in enumerate(image_features['Features']):
    if feature is None or not query_image_vectors:
        # No stored vector for this row, or no usable query image: nothing to compare
        average_similarity = 0.0
    else:
        stored_vector = np.ravel(feature)
        # Compare flat vectors so every score is a scalar rather than a 1-element array
        similarities = [
            float(cosine_similarity(np.ravel(query_image_vector), stored_vector))
            for query_image_vector in query_image_vectors
        ]
        average_similarity = sum(similarities) / len(similarities)

    # tf_idf may have fewer rows than image_features; rows without text get a review score of 0
    if i < len(tf_idf):
        cosine_sim_rv = cosine_similarity(query_review_vector.iloc[0], tf_idf.iloc[i])
    else:
        cosine_sim_rv = 0

    composite_similarity = (cosine_sim_rv + average_similarity) / 2
    image_similarities.append((i, average_similarity, cosine_sim_rv, composite_similarity))

# Three rankings over the same tuples, best first:
# by image score (field 1), by review score (field 2), by composite score (field 3).
image_similarities.sort(key=lambda entry: entry[1], reverse=True)
review_similarities = sorted(image_similarities, key=lambda entry: entry[2], reverse=True)
composite_similarities = sorted(image_similarities, key=lambda entry: entry[3], reverse=True)

def _top_unique_products(ranked, limit=3):
    """Return the first ``limit`` entries of ``ranked`` that belong to distinct products.

    Each ranking gets its own set of seen product IDs. Sharing one set across the
    three rankings would exclude every product already chosen for an earlier
    ranking, so the later lists would no longer hold their true top results.

    Args:
        ranked: Tuples sorted best-first whose first field is the row position in
            ``image_features``.
        limit: Maximum number of entries to return.
    """
    seen_ids = set()
    selected = []
    for entry in ranked:
        # entry[0] is a positional index (it comes from enumerate), hence .iloc
        product_id = image_features['ProductID'].iloc[entry[0]]
        if product_id in seen_ids:
            continue
        seen_ids.add(product_id)
        selected.append(entry)
        if len(selected) == limit:
            break
    return selected


#3 similar images
top_similar_images = _top_unique_products(image_similarities)

# 3 similar reviews Text
top_similar_reviews = _top_unique_products(review_similarities)

# 3 similar composite scores
top_similar_composites = _top_unique_products(composite_similarities)

# Save the three top-3 lists, one pickle file per ranking
for pickle_name, ranking in (
    ('top_similar_images.pkl', top_similar_images),
    ('top_similar_reviews.pkl', top_similar_reviews),
    ('top_similar_composites.pkl', top_similar_composites),
):
    with open(pickle_name, 'wb') as pickle_file:
        pickle.dump(ranking, pickle_file)

# Function to print top similar items
def print_top_similar_items(title, items):
    """Print the URL, review text and the three scores of every entry in ``items``.

    Args:
        title: Label used in the section heading and in each entry heading.
        items: Sequence of entries indexed as (row index into image_features,
            image score, review score, composite score).
    """
    print(f"Top 3 similar {title}:")
    for rank, entry in enumerate(items, start=1):
        row_index = entry[0]
        print(f"{title.capitalize()} {rank}:")
        print(f"Image URL: {image_features['Image'][row_index]}")
        print(f"Text Review: {image_features['Review Text'][row_index]}")
        print(f"Image Score: {entry[1]}")
        print(f"Review Text Score : {entry[2]}")
        print(f"Composite Score: {entry[3]}\n")


# Print the three rankings in turn
for section_title, section_items in (
    ("images", top_similar_images),
    ("reviews", top_similar_reviews),
    ("composite scores", top_similar_composites),
):
    print_top_similar_items(section_title, section_items)


Enter image URL (or press Enter to finish): https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg
REVIEW: I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break.
Top 3 similar images:
Images 1:
Image URL: https://images-na.ssl-images-amazon.com/images/I/71bztfqdg+L._SY88.jpg
Text Review: I have been using Fender locking tuners for about five years on various strats and teles. Definitely helps with tuning stability and way faster to restring if there is a break.
Image Score: [0.94093215]
Review Text Score : 0.9998998814927471
Composite Score: [0.970416]

Images 2:
Image URL: https://images-na.ssl-images-amazon.com/images/I/719-SDMiOoL._SY88.jpg
Text Review: These locking tuners look great and keep tune.  Good quality materials and construction.  Excellent upgrade to any guitar.  I had to drill additions holes for installation.  If your neck already co