# Libraries

In [168]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from wordcloud import WordCloud
import time
import re
from transformers import BertTokenizer, TFBertForSequenceClassification,  BertModel
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import torch
from spellchecker import SpellChecker
import nltk
from nltk.tokenize import word_tokenize
import emoji, contractions
import string

input_path = 'inputs\\data\\amazon'
output_path = 'outputs\\data\\amazon'

# Data overview

In [53]:
# Load data 
df = pd.read_csv(f'{output_path}\\amazon-product.csv')

In [54]:
df.head(3)

Unnamed: 0,product_id,product_name,about_product,category_1,category_2,discounted_price,actual_price,discount_percentage,rating,rating_count,img_link,product_link
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,High Compatibility : Compatible With iPhone 12...,Computers & Accessories,Accessories & Peripherals,399.0,1099.0,0.64,4.2,24269.0,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,"Compatible with all Type C enabled devices, be...",Computers & Accessories,Accessories & Peripherals,199.0,349.0,0.43,4.0,43994.0,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Ambrane-Unbreakable-Char...
2,B096MSW6CT,Sounce Fast Phone Charging Cable & Data Sync U...,„Äê Fast Charger& Data Sync„Äë-With built-in safet...,Computers & Accessories,Accessories & Peripherals,199.0,1899.0,0.9,3.9,7928.0,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Sounce-iPhone-Charging-C...


In [199]:
# https://www.analyticsvidhya.com/blog/2022/01/text-cleaning-methods-in-nlp/
text = "She'd like to drink 1 coffee, OMG|üéÅ„Äêsas (2.3 samsung)!!... üî•üî•"

def text_cleaning(text):
    # Lowercase the text
    text = text.lower()
    
    # Expand contractions
    text = contractions.fix(text)

    # Remove numbers, special characters, and patterns
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = ''.join([char for char in text if char not in string.punctuation])  # Remove punctuation
    
    # Split words containing non-letter characters and keep only letter parts
    words = text.split()
    cleaned_words = []
    for word in words:
        parts = re.findall(r'\b[a-zA-Z]+\b', word)
        cleaned_words.extend(parts)

    # Combine cleaned words into a single string
    text = " ".join(cleaned_words)
    
    # Remove emoji and emoticons
    text = ''.join(c for c in text if c not in emoji.EMOJI_DATA)
    emoticon_pattern = r'(:-?\))|(:\))|(:-\()|(:\()|(:-?D)|(:D)|(:-?])|(:])|(:-?\[)|(:\[)|(:-?p)|(:p)|(:-?[|/\\])|(:[|/\\])'
    text = re.sub(emoticon_pattern, '', text)
    
    # Remove extra spaces
    text = " ".join(text.split())
    
    return text

cleaned_text = text_cleaning(text)
print(cleaned_text)

she would like to drink coffee omg sas samsung


In [200]:
def descriptions(df):
    description_list = df['product_name'].str.cat([df['about_product'],
                                                        df['category_1'], df['category_2']], sep=' ')
    
    description_list = list(description_list)

    for i in range(len(description_list)):
        description_list[i] = text_cleaning(description_list[i])

    
    return description_list

In [224]:
description_list = descriptions(df)
description_df = pd.DataFrame(description_list)
description_df = description_df.rename(columns={0: 'description'})
description_df

Unnamed: 0,description
0,wayona nylon braided usb to lightning fast cha...
1,ambrane unbreakable w a fast charging m braide...
2,sounce fast phone charging cable data sync usb...
3,boat deuce usb in typec micro usb stress resis...
4,portronics konnect l m fast charging a pin usb...
...,...
1323,noir aqua pcs pp spun filter spanner for all t...
1324,prestige delight prwo electric rice cooker l w...
1325,bajaj majesty rx watts heat convector room hea...
1326,havells ventil air dsp mm exhaust fan pista gr...


In [225]:
# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the product data to create TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(description_df['description'])

In [227]:
def create_wordbank(description_list, save=False, path='outputs\\data\\amazon'):
    # initialize NLTK's tokenizer
    # nltk.download('punkt')

    # create a set to store the unique words
    word_bank = set()

    # tokenize the text data and add unique words to the word bank
    for text in description_list:
        words = word_tokenize(text)
        word_bank.update(words)

    # save the word bank to a text file with UTF-8 encoding
    if save:
        with open(f'{path}\\word_bank.txt', 'w', encoding='utf-8') as file:
            for word in word_bank:
                file.write(word + '\n')

    return word_bank

In [228]:
def call_workbank(path='outputs\\data\\amazon'):
    
    with open(f'{path}\\word_bank.txt', 'r', encoding='utf-8') as file:
        word_bank = file.read()

    return word_bank

In [229]:
word_bank = create_wordbank(description_list, save=True)

In [230]:
def correct_spellings(text):
    spell = SpellChecker()

    corrected_text = []
    words = text.split()
    for word in words:
        if word not in word_bank:
            corrected_word = spell.correction(word)
        else:  corrected_word = word

        corrected_text.append(corrected_word)
    if None in corrected_text:
        return str()

    return ' '.join(corrected_text)

# Apply spell checking to the search query
search_query = 'samsung'
corrected_query = correct_spellings(search_query)
corrected_query

'samsung'

In [19]:
def search_query(query, top=None):

    corrected_query = correct_spellings(query)

    # Preprocess and vectorize the search query
    query_vector = tfidf_vectorizer.transform([corrected_query])

    # Calculate cosine similarity between the query and all products
    cosine_similarities = linear_kernel(query_vector, tfidf_matrix).flatten() 
    cosine_indices = cosine_similarities.argsort()[::-1]

    if top is None: top = len(cosine_indices)

    top_indices = cosine_indices[:top]
    top_similarities = [cosine_similarities[i] for i in top_indices]

    # Retrieve and display the top N similar products
    top_products = [df['product_id'][i] for i in top_indices]

    return top_products, top_similarities


In [5]:
def search_df(top_products):
    df_top_products = df.loc[df.product_id.isin(top_products)]

    # Create a dictionary that maps each product ID to its position in the top_products list
    order_dict = {product_id: index for index, product_id in enumerate(top_products)}

    # Create a new column in the df_top_products DataFrame that contains the position of each product ID
    df_top_products['order'] = df_top_products.product_id.map(order_dict)

    # Sort the df_top_products DataFrame by the 'order' column
    df_top_products = df_top_products.iloc[np.argsort(df_top_products.order)]

    # Drop the 'order' column from the resulting DataFrame
    df_top_products = df_top_products.drop('order', axis=1)

    return df_top_products

In [21]:
query = 'bok'
top_products, _ = search_query(query, top=10)
df_top = search_df(top_products)
df_top.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top_products['order'] = df_top_products.product_id.map(order_dict)


Unnamed: 0,product_id,product_name,about_product,category_1,category_2,discounted_price,actual_price,discount_percentage,rating,rating_count,img_link,product_link
1327,B01486F4G6,Borosil Jumbo 1000-Watt Grill Sandwich Maker (...,"Brand-Borosil, Specification √¢‚Ç¨‚Äú 23V ~ 5Hz;1 W...",Home & Kitchen,Kitchen & Home Appliances,2863.0,3690.0,0.22,4.3,6987.0,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Borosil-Jumbo-1000-Watt-...
435,B0B53QFZPY,PTron Newly Launched Force X10 Bluetooth Calli...,Bluetooth Calling Smartwatch: Make & receive a...,Electronics,Wearable Technology,1299.0,5999.0,0.78,3.3,4415.0,https://m.media-amazon.com/images/I/41k-VlGbYn...,https://www.amazon.in/PTron-Force-Bluetooth-Sm...
437,B01F25X6RQ,Samsung Ehs64 Ehs64Avfwecinu Hands-Free Wired ...,In Ear Volume Control|Design: Canal phone|Head...,Electronics,"Headphones, Earbuds & Accessories",499.0,499.0,0.0,4.2,31539.0,https://m.media-amazon.com/images/I/31FzYVC62w...,https://www.amazon.in/Samsung-Original-EHS64AV...
438,B0B244R4KB,Spigen EZ Fit Tempered Glass Screen Protector ...,Compatibility ; Glass Screen Protector Compati...,Electronics,Mobiles & Accessories,999.0,2899.0,0.66,4.6,6129.0,https://m.media-amazon.com/images/I/51R2kfyMW5...,https://www.amazon.in/Spigen-Tempered-Screen-P...
439,B0BMGG6NKT,"Samsung Galaxy M04 Dark Blue, 4GB RAM, 128GB S...",Powerful MediaTek Helio P35 Octa Core 2.3GHz w...,Electronics,Mobiles & Accessories,10499.0,13499.0,0.22,4.2,284.0,https://m.media-amazon.com/images/I/41kg-+XWox...,https://www.amazon.in/Samsung-Galaxy-Storage-M...
440,B092JHPL72,"SWAPKART Flexible Mobile Tabletop Stand, Metal...","Cell Phone Holder, Lazy Bracket, The Clip-On H...",Electronics,Mobiles & Accessories,251.0,999.0,0.75,3.7,3234.0,https://m.media-amazon.com/images/I/41pQWwAzVy...,https://www.amazon.in/SWAPKART-Flexible-Deskto...
441,B09GFM8CGS,"Redmi 9A Sport (Carbon Black, 2GB RAM, 32GB St...",Processor: MediaTek Helio G25 Octa-core; Up to...,Electronics,Mobiles & Accessories,6499.0,7999.0,0.19,4.1,313832.0,https://m.media-amazon.com/images/I/41P4Al+S3z...,https://www.amazon.in/Redmi-9A-Sport-Octa-core...
442,B0B3MWYCHQ,Fire-Boltt Ring 3 Smart Watch 1.8 Biggest Disp...,Fire-Boltt is India' No 1 Wearable Watch Brand...,Electronics,Wearable Technology,2999.0,9999.0,0.7,4.2,20879.0,https://m.media-amazon.com/images/I/41ziJKWj9L...,https://www.amazon.in/Fire-Boltt-Bluetooth-Ass...
443,B09J2MM5C6,Amozo Ultra Hybrid Camera and Drop Protection ...,QUALITY MATERIAL - Hybrid technology that is m...,Electronics,Mobiles & Accessories,279.0,1499.0,0.81,4.2,2646.0,https://m.media-amazon.com/images/I/41cYSMom9T...,https://www.amazon.in/Amozo-Cover-iPhone-Polyc...
444,B07Q4QV1DL,ELV Aluminum Adjustable Mobile Phone Foldable ...,Multi-angle adjustable stand: Free rotation of...,Electronics,Mobiles & Accessories,269.0,1499.0,0.82,4.5,28978.0,https://m.media-amazon.com/images/I/31hDWwY8iW...,https://www.amazon.in/Aluminum-Adjustable-Mobi...


In [45]:
from fuzzywuzzy import fuzz

def fuzzy_match(query, terms, threshold=80):
    matching_terms = []
    for term in terms:
        similarity = fuzz.ratio(query, term)
        if similarity >= threshold:
            matching_terms.append(term)
    return matching_terms

# Example usage
search_query = 'notebok'
matching_terms = fuzzy_match(search_query, product_descriptions)


AttributeError: 'csr_matrix' object has no attribute 'keys'

In [48]:
name_array = tfidf_matrix.toarray()