<a href="https://colab.research.google.com/github/Karanbir122/CSE508_Winter2024_A2_MT23042/blob/main/Assignment_2_IR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1**

In [None]:
import ast
import os
import pickle
from collections import Counter
from io import BytesIO

import nltk
import numpy as np
import pandas as pd
import requests
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from PIL import Image, UnidentifiedImageError
from torchvision import models, transforms

nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
df = pd.read_csv("A2_Data.csv")

# Each CSV row stores its image links as the string form of a Python list.
# Expand that into one output row per (ID, image link, review text).
expanded_rows = []

for index, row in df.iterrows():
    id_value = row['ID']
    review_text = row['Review Text']

    # ast.literal_eval only parses Python literals, so a crafted cell in the
    # CSV cannot execute code the way eval() would.
    for image_link in ast.literal_eval(row['Image']):
        expanded_rows.append({
            'ID': id_value,
            'Image': image_link,
            'Review Text': review_text,
        })

# Build the expanded DataFrame in one shot from the collected records
expanded_df = pd.DataFrame(expanded_rows)

# Define image preprocessing: resize to a fixed square, convert to a tensor,
# then normalise each of the three channels.
image_transforms = transforms.Compose([
    transforms.Resize((299, 299)),  # Adjust to Inception-v3 input size
    transforms.ToTensor(),
    # NOTE(review): these look like the usual ImageNet channel statistics — confirm.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load a pre-trained Inception-v3 model
# NOTE(review): newer torchvision releases prefer a `weights=` argument over
# `pretrained=True` — confirm against the installed version.
# NOTE(review): the model is used as-is, so the "features" taken from it are
# presumably the outputs of its final classification layer — confirm this is intended.
inception = models.inception_v3(pretrained=True)
inception.eval()  # Set the model to evaluation mode

# Function to extract features from an image using Inception-v3
def extract_image_features(url, timeout=10):
    """Download an image and run it through the Inception-v3 model.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot hang the whole extraction loop.

    Returns:
        A flattened 1-D numpy array with the model output for the image, or
        None when the image could not be downloaded, decoded or processed
        (a message describing the failure is printed in that case).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Report HTTP errors (404, 503, ...) as request failures instead of
        # trying to decode an error page as an image.
        response.raise_for_status()
        # Force three channels: grayscale, palette and RGBA images would
        # otherwise fail in the 3-channel Normalize step and be dropped.
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img_t = image_transforms(img).unsqueeze(0)  # Add batch dimension
        with torch.no_grad():
            features = inception(img_t)
        return features.cpu().numpy().flatten()
    except requests.exceptions.RequestException as e:
        print(f"RequestException for URL {url}: {e}")
    except UnidentifiedImageError:
        print(f"UnidentifiedImageError: cannot identify image file from URL {url}. Removing entry.")
    except Exception as e:
        # Best-effort extraction: any other failure skips this image only.
        print(f"Unexpected error for URL {url}: {e}")
    # Every failure path falls through to here.
    return None

# Index labels of expanded_df rows for which no feature vector was produced
rows_to_remove = []

# Feature vectors, in the same order as the rows that are kept
image_features = []

for index, row in expanded_df.iterrows():
    # A missing URL is treated like a failed download: the row must be marked
    # for removal, otherwise image_features and the cleaned DataFrame end up
    # with different lengths and every later row is paired with the wrong image.
    if pd.notna(row['Image']):
        image_feature = extract_image_features(row['Image'])
    else:
        image_feature = None

    if image_feature is None:
        rows_to_remove.append(index)
    else:
        image_features.append(image_feature)

# The rows listed in rows_to_remove are dropped from expanded_df in the next cell.



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
Downloading: "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth" to /root/.cache/torch/hub/checkpoints/inception_v3_google-0cc3c7bd.pth
100%|██████████| 104M/104M [00:00<00:00, 115MB/s] 


UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/71F3npeHUDL._SY88.jpg. Removing entry.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/71wHUWncMGL._SY88.jpg. Removing entry.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/71B8OOE5N8L._SY88.jpg. Removing entry.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/81SX3oAWbNL._SY88.jpg. Removing entry.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/718niQ1GEwL._SY88.jpg. Removing entry.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/61OboZT-kcL._SY88.jpg. Removing entry.
UnidentifiedImageError: cannot identify image file from URL https://images-na.ssl-images-amazon.com/images/I/710

In [None]:
# Keep only the rows that produced a feature vector and renumber them from 0,
# so row position i lines up with image_features[i].
cleaned_df = expanded_df.drop(index=rows_to_remove).reset_index(drop=True)

# Persist the extracted feature vectors
with open('image_features_inception.pkl', 'wb') as f:
    pickle.dump(image_features, f)

In [None]:
# Sanity check: number of feature vectors collected in image_features.
len(image_features)

1640

In [None]:
# Sanity check: number of rows kept in cleaned_df; expected to match
# len(image_features), since the features are attached as a column afterwards.
len(cleaned_df["Image"])

1640

In [None]:
# Attach each feature vector to its row (one entry of image_features per row of
# cleaned_df), then display the resulting frame as the cell output.
cleaned_df["Image features"] = image_features
cleaned_df

Unnamed: 0,ID,Image,Review Text,Image features
0,3452,https://images-na.ssl-images-amazon.com/images...,Loving these vintage springs on my vintage str...,"[-0.027333215, -0.079651386, -0.35003006, -0.7..."
1,1205,https://images-na.ssl-images-amazon.com/images...,Works great as a guitar bench mat. Not rugged ...,"[-1.3684598, -0.59348595, -1.4640973, -1.30910..."
2,1205,https://images-na.ssl-images-amazon.com/images...,Works great as a guitar bench mat. Not rugged ...,"[-0.7401553, -0.9350542, -0.60569024, -0.74105..."
3,1205,https://images-na.ssl-images-amazon.com/images...,Works great as a guitar bench mat. Not rugged ...,"[-0.8307264, -0.90847826, -0.3561505, -0.40905..."
4,1708,https://images-na.ssl-images-amazon.com/images...,We use these for everything from our acoustic ...,"[-0.34685618, -0.38017753, -0.55535674, -1.372..."
...,...,...,...,...
1635,1882,https://images-na.ssl-images-amazon.com/images...,This is a great stereo reverb with plenty of c...,"[-0.2008809, -0.31048664, -0.31136578, -0.3019..."
1636,1547,https://images-na.ssl-images-amazon.com/images...,I really like the simplicity of this bridge. I...,"[0.27103233, 0.25581858, 0.122879, -0.05130465..."
1637,1547,https://images-na.ssl-images-amazon.com/images...,I really like the simplicity of this bridge. I...,"[-0.2091152, -0.26896644, -0.6505107, -0.78845..."
1638,1004,https://images-na.ssl-images-amazon.com/images...,"Great Product, but there is no warranty in the...","[-0.11801373, -0.2079926, 0.2472797, -0.412385..."


**2**

In [None]:
import pandas as pd
import numpy as np
import math
import pickle
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import re

# Fetch the NLTK packages used by the text pipeline below
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Function for text preprocessing
def preprocess_text(text):
    """Turn a raw review string into a list of normalised tokens.

    Steps: lowercase the text, strip URLs, @mentions and '#' characters,
    tokenize, keep purely alphabetic tokens that are not stop words, then
    lemmatize and stem each remaining token.

    Relies on the module-level `stop_words`, `stemmer` and `lemmatizer`.

    Args:
        text: The review text to process.

    Returns:
        List of processed token strings, in their original order.
    """
    lowered = text.lower()
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', lowered, flags=re.MULTILINE)
    without_tags = re.sub(r'\@\w+|\#', '', without_urls)

    processed = []
    for token in word_tokenize(without_tags):
        # Drop punctuation, numbers and anything else that is not purely alphabetic
        if not token.isalpha():
            continue
        # Stop words are filtered before lemmatizing/stemming
        if token in stop_words:
            continue
        processed.append(stemmer.stem(lemmatizer.lemmatize(token)))
    return processed

# Shared NLP helpers used by preprocess_text
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Review texts from cleaned_df, with missing reviews replaced by empty strings
text_data = cleaned_df['Review Text'].fillna('').tolist()

# One token list per review, in row order
tokenized_texts = list(map(preprocess_text, text_data))

# Manual TF-IDF Calculation
def compute_tf_idf(tokenized_docs):
    """Compute TF-IDF weights for a corpus of tokenized documents.

    TF is the count of a word in a document divided by the document length;
    IDF is log(number of documents / number of documents containing the word).

    Args:
        tokenized_docs: List of documents, each a list of token strings.

    Returns:
        A list with one dict per document mapping each word of that document
        to its TF-IDF weight. Empty documents yield an empty dict.
    """
    # Term frequency: count each document once with Counter instead of calling
    # doc.count(word) for every word, which is quadratic in document length.
    tf = []
    for doc in tokenized_docs:
        doc_length = len(doc)
        tf.append({word: count / doc_length for word, count in Counter(doc).items()})

    # Document frequency: number of documents in which each word occurs
    doc_freq = Counter()
    for doc in tokenized_docs:
        doc_freq.update(set(doc))

    # Inverse document frequency
    total_docs = len(tokenized_docs)
    idf = {word: math.log(total_docs / freq) for word, freq in doc_freq.items()}

    # TF-IDF per document
    return [{word: freq * idf[word] for word, freq in doc_tf.items()} for doc_tf in tf]

tf_idf_scores = compute_tf_idf(tokenized_texts)

# Output locations for the tokenized texts and the TF-IDF scores
tokenized_texts_path = 'tokenized_texts.pkl'
tf_idf_scores_path = 'tf_idf_scores_manual_text.pkl'

# Pickle each artefact to its own file
for target_path, payload in (
    (tokenized_texts_path, tokenized_texts),
    (tf_idf_scores_path, tf_idf_scores),
):
    with open(target_path, 'wb') as f:
        pickle.dump(payload, f)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Read the tokenized reviews back from disk
with open('tokenized_texts.pkl', 'rb') as f:
    tokenized_texts = pickle.load(f)

# Show the header line followed by the full loaded structure
print("Tokenized Texts:", tokenized_texts, sep="\n")

Tokenized Texts:
[['love', 'vintag', 'spring', 'vintag', 'strat', 'good', 'tension', 'great', 'stabil', 'float', 'bridg', 'want', 'spring', 'way', 'go'], ['work', 'great', 'guitar', 'bench', 'mat', 'rug', 'enough', 'abus', 'take', 'care', 'take', 'care', 'make', 'organ', 'workspac', 'much', 'easier', 'screw', 'wo', 'roll', 'around', 'color', 'good'], ['work', 'great', 'guitar', 'bench', 'mat', 'rug', 'enough', 'abus', 'take', 'care', 'take', 'care', 'make', 'organ', 'workspac', 'much', 'easier', 'screw', 'wo', 'roll', 'around', 'color', 'good'], ['work', 'great', 'guitar', 'bench', 'mat', 'rug', 'enough', 'abus', 'take', 'care', 'take', 'care', 'make', 'organ', 'workspac', 'much', 'easier', 'screw', 'wo', 'roll', 'around', 'color', 'good'], ['use', 'everyth', 'acoust', 'bass', 'ukulel', 'know', 'smaller', 'model', 'avail', 'uke', 'violin', 'etc', 'yet', 'order', 'work', 'smaller', 'instrument', 'one', 'extend', 'foot', 'maximum', 'width', 'gentl', 'instrument', 'grippi', 'materi', 'kee

In [None]:
# Read the TF-IDF scores back from disk. They are bound to tf_idf_scores, not
# tokenized_texts: the token lists must stay intact for later cells, and the
# value printed below should be the one that was just loaded.
with open('tf_idf_scores_manual_text.pkl', 'rb') as f:
    tf_idf_scores = pickle.load(f)

# Print the loaded data
print("tf_idf_scores_manual_text:")
print(tf_idf_scores)

tf_idf_scores_manual_text:
[{'love': 0.12952440044493901, 'vintag': 0.5207925279135686, 'spring': 0.7275388495683908, 'strat': 0.20053349107638704, 'good': 0.10090490416569088, 'tension': 0.32783632473534957, 'great': 0.0764467652709918, 'stabil': 0.3336370832013249, 'float': 0.3399910951882799, 'bridg': 0.2239600168655796, 'want': 0.13798218183685834, 'way': 0.17107797425778443, 'go': 0.134530430535277}, {'work': 0.06300905158828086, 'great': 0.049856586046299, 'guitar': 0.05155849651014692, 'bench': 0.26157205042166753, 'mat': 0.26157205042166753, 'rug': 0.19866252942443596, 'enough': 0.10857290184260063, 'abus': 0.23724092920708395, 'take': 0.2373584944657685, 'care': 0.29366085478983434, 'make': 0.08559693553321758, 'organ': 0.23143521648427862, 'workspac': 0.2740799666152232, 'much': 0.0848440464022845, 'easier': 0.16038606322234505, 'screw': 0.11226825892231335, 'wo': 0.14170942584463964, 'roll': 0.20410440520504497, 'around': 0.11226825892231335, 'color': 0.11991567920334224, 'g

In [None]:
# One row per review, one column per vocabulary word; words absent from a
# review get weight 0.
tf_idf_pd = pd.DataFrame(tf_idf_scores).fillna(0)
tf_idf_pd

Unnamed: 0,love,vintag,spring,strat,good,tension,great,stabil,float,bridg,...,yngwie,neoclass,john,mayer,importantli,toneprint,biggi,accord,screenshot,piti
0,0.129524,0.520793,0.727539,0.200533,0.100905,0.327836,0.076447,0.333637,0.339991,0.223960,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.065808,0.000000,0.049857,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.065808,0.000000,0.049857,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.065808,0.000000,0.049857,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1635,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.156368,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.336475,0.000000,0.000000,0.000000,0.000000
1636,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.131741,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.131555,0.000000,0.000000,0.000000
1637,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.131741,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.131555,0.000000,0.000000,0.000000
1638,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.099713,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.321846,0.321846,0.321846


**3**

In [None]:
# NOTE(review): `index` is not defined in this cell; it is whatever loop
# variable of that name was last left in the kernel. This looks like leftover
# debugging and depends on hidden state — candidate for removal.
print(index)

1647


In [None]:
# Example input image URLs
import numpy as np

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two vectors.

    Args:
        v1: First vector (array-like).
        v2: Second vector (array-like), same length as v1.

    Returns:
        dot(v1, v2) / (|v1| * |v2|), or 0 when either vector has zero norm.
    """
    numerator = np.dot(v1, v2)
    norm_first, norm_second = np.linalg.norm(v1), np.linalg.norm(v2)

    # A zero-length vector has no direction; report 0 instead of dividing by zero
    if not (norm_first != 0 and norm_second != 0):
        return 0

    return numerator / (norm_first * norm_second)
image_urls = []
url_input = input("Enter image URL (or press Enter to finish): ").strip()
review_input = input("REVIEW: ").strip()

# Term frequencies of the query review
doc1 = preprocess_text(review_input)
tf = [{word: doc1.count(word) / len(doc1) for word in doc1}]

# Document frequency of every corpus word. Stored as doc_freq so the dataset
# DataFrame `df` loaded at the top of the notebook is not overwritten.
doc_freq = {}
for doc in tokenized_texts:
    for word in set(doc):
        doc_freq[word] = doc_freq.get(word, 0) + 1

# Calculate IDF (inverse document frequency)
idf = {word: math.log(len(tokenized_texts) / freq) for word, freq in doc_freq.items()}

# TF-IDF of the query: weight the QUERY's term frequencies by the corpus IDF.
# (Iterating over the corpus document frequencies here would give a vector that
# does not depend on the typed review at all.) Words never seen in the corpus
# have no IDF and are skipped.
tf_idf_doc1 = [{word: freq * idf[word] for word, freq in tf[0].items() if word in idf}]

if url_input.startswith("[") and url_input.endswith("]"):
    # Several URLs given as "[url1, url2, ...]"; ignore empty pieces such as "[]"
    url_input = url_input[1:-1]
    image_urls.extend([url.strip() for url in url_input.split(",") if url.strip()])
elif url_input:
    # If a single URL is provided without square brackets
    image_urls.append(url_input)

# Extract features from the input images, dropping any that could not be
# downloaded or decoded (extract_image_features returns None for those).
query_image_vectors = [
    vector
    for vector in (extract_image_features(url) for url in image_urls)
    if vector is not None
]

# Lay the query weights out over exactly the columns of tf_idf_pd so the cosine
# similarity compares the same word at the same position in both vectors.
query_weights = pd.Series(0.0, index=tf_idf_pd.columns)
for word, weight in tf_idf_doc1[0].items():
    if word in query_weights.index:
        query_weights[word] = weight
query_review_vector = query_weights.to_frame().T

# Query images whose features could not be extracted are represented by None;
# leave them out instead of crashing inside the dot product.
valid_query_vectors = [vector for vector in query_image_vectors if vector is not None]
query_text_vector = query_review_vector.iloc[0]

# For every stored entry: (row position, image score, text score, composite score)
image_similarities = []
for i, feature in enumerate(image_features):
    feature = feature.flatten()
    similarities = [cosine_similarity(vector, feature) for vector in valid_query_vectors]

    cosine_sim_rv = cosine_similarity(query_text_vector, tf_idf_pd.iloc[i])
    # With no usable query image the image score is 0 rather than a division by zero
    average_similarity = sum(similarities) / len(similarities) if similarities else 0.0
    composite_similarity = (cosine_sim_rv + average_similarity) / 2
    image_similarities.append((i, average_similarity, cosine_sim_rv, composite_similarity))

# Three rankings of the same entries, each in descending order of one score.
# The text and composite rankings both start from the image-sorted order, so
# ties are broken by image score.
image_similarities.sort(key=lambda x: x[1], reverse=True)
review_similarities = sorted(image_similarities, key=lambda x: x[2], reverse=True)
composite_similarities = sorted(image_similarities, key=lambda x: x[3], reverse=True)




def _top_k_distinct_ids(ranked_similarities, k=3):
    """Take the best k entries of a ranked list that have distinct review IDs.

    Walks the ranking from the top and skips any entry whose review ID (looked
    up in cleaned_df["ID"]) was already selected. Stops at the end of the list,
    so fewer than k entries are returned when the ranking holds fewer than k
    distinct IDs instead of raising IndexError.

    Args:
        ranked_similarities: List of tuples whose first element is a row
            position in cleaned_df, sorted best-first.
        k: Maximum number of entries to select.

    Returns:
        (entries, indexes, ids): the selected tuples, their row positions and
        their review IDs, all in ranking order.
    """
    entries, indexes, seen_ids = [], [], []
    for entry in ranked_similarities:
        if len(entries) >= k:
            break
        row_index = entry[0]
        review_id = cleaned_df["ID"][row_index]
        if review_id in seen_ids:
            continue
        seen_ids.append(review_id)
        indexes.append(row_index)
        entries.append(entry)
    return entries, indexes, seen_ids


# Top 3 by text similarity
top_3_similar_reviews, review_indexes, visited_array_rv = _top_k_distinct_ids(review_similarities)

# Top 3 by image similarity
top_3_similar_images, image_indexes, visited_array_img = _top_k_distinct_ids(image_similarities)

# Top 3 by composite similarity
top_3_similar_composites, compo_indexes, visited_array_comp = _top_k_distinct_ids(composite_similarities)












# Persist the three top-3 result lists, one pickle file each
for target_path, payload in (
    ('top_3_images.pkl', top_3_similar_images),
    ('top_3_txt.pkl', top_3_similar_reviews),
    ('top_3_composite.pkl', top_3_similar_composites),
):
    with open(target_path, 'wb') as f:
        pickle.dump(payload, f)

def _print_top_entries(title, entries):
    """Print one ranked result section.

    Args:
        title: Heading line printed under the separator.
        entries: List of (row position, image score, text score, composite
            score) tuples; the row position is looked up in cleaned_df.
    """
    print("----------------------------------------------------")
    print(title)
    # The per-entry composite score gets its own name so the ranked list
    # `composite_similarities` is not overwritten by a float.
    for rank, (i, img_similarity, txt_similarity, composite_similarity) in enumerate(entries, start=1):
        print(f"{rank}. Image URL: {cleaned_df['Image'][i]}")
        print(f"Review: {cleaned_df['Review Text'][i]}")
        print(f"Cosine score image: {img_similarity}")
        print(f"Cosine score text: {txt_similarity}")
        print(f"Cosine score composite: {composite_similarity}")
        print("\n")


_print_top_entries("Top 3 similar entries using IMAGE RETRIEVAL:", top_3_similar_images)
_print_top_entries("Top 3 similar entries using TEXT RETRIEVAL:", top_3_similar_reviews)
_print_top_entries("Top 3 similar entries using COMBINED RETRIEVAL:", top_3_similar_composites)

Enter image URL (or press Enter to finish): https://images-na.ssl-images-amazon.com/images/I/81SaYA3ifDL._SY88.jpg
REVIEW: This is a wonderful gig bag! I've included pictures of the more popular styles of guitar so you can see how well they fit. The purple Les Paul style fits snug length wise with the extra cushion pads, and fits snug side to side. The black Stratocaster style is ALMOST to long, but still fits fine. The grey SG style is a small guitar, so there is extra room top to bottom and side to side, but with the neck strap, it doesn't slide around too much, and I would still feel comfortable using this bag with the SG. The padding is aproximently an inch think all around and top and bottom, and the zippers are of good quality.
----------------------------------------------------
Top 3 similar entries using IMAGE RETRIEVAL:
1. Image URL: https://images-na.ssl-images-amazon.com/images/I/81SaYA3ifDL._SY88.jpg
Review: This is a wonderful gig bag! I've included pictures of the more p