In [9]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=b9657583f473b58b8d69d500a12e697f307b6936b50ab1f03899795a5cc987e4
  Stored in directory: 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Product table from the amazon-science/esci-data repository (parquet, fetched over HTTPS).
url = "https://github.com/amazon-science/esci-data/raw/main/shopping_queries_dataset/shopping_queries_dataset_products.parquet"
df = pd.read_parquet(url)

# Combine product_title and product_description columns.
# Either column can hold a missing value (None/NaN). Concatenating a string
# with a missing value makes the whole result NaN, which would throw away the
# title too, so missing parts are replaced by '' first. The final strip()
# removes the dangling separator left when one side is empty.
df['combined_text'] = (
    df['product_title'].fillna('') + ' ' + df['product_description'].fillna('')
).str.strip()


print(df.head())


   product_id                                      product_title  \
0  B079VKKJN7  11 Degrees de los Hombres Playera con Logo, Ne...   
1  B079Y9VRKS          Camiseta Eleven Degrees Core TS White (M)   
2  B07DP4LM9H  11 Degrees de los Hombres Core Pull Over Hoodi...   
3  B07G37B9HP          11 Degrees Poli Panel Track Pant XL Black   
4  B07LCTGDHY  11 Degrees Gorra Trucker Negro OSFA (Talla úni...   

                                 product_description  \
0  Esta playera con el logo de la marca Carrier d...   
1                                               None   
2  La sudadera con capucha Core Pull Over de 11 G...   
3                                               None   
4                                               None   

                                product_bullet_point product_brand  \
0  11 Degrees Negro Playera con logo\nA estrenar ...    11 Degrees   
1                                               None    11 Degrees   
2  11 Degrees Azul Core Pull Over Hoodie\nA 

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Fetch the NLTK resources this notebook relies on, one after another
for resource_name in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource_name)

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _lemmatizer():
    """Return one shared WordNetLemmatizer instance."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Turn one text value into a space-joined string of lemmatized tokens.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace, tokenize with NLTK, remove English stopwords, lemmatize.

    Args:
        text: A string, or a missing value (None/NaN).

    Returns:
        The cleaned text; '' when ``text`` is missing.
    """
    # Check for NaN values
    if pd.isnull(text):
        return ''

    text = text.lower()

    # Remove special characters, numbers, and extra whitespaces
    # NOTE(review): the pattern keeps only a-z, so accented and other
    # non-ASCII letters are removed as well — confirm this is intended for
    # non-English product text.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    tokens = nltk.word_tokenize(text)

    # The stopword set and the lemmatizer do not depend on `text`; they are
    # fetched from cached helpers instead of being rebuilt on every call.
    stop_words = _english_stop_words()
    lemmatize = _lemmatizer().lemmatize

    # Remove stopwords, then lemmatize what is left
    return ' '.join(lemmatize(word) for word in tokens if word not in stop_words)

# Clean every combined_text value element by element and store the result
df['processed_text'] = df['combined_text'].map(preprocess_text)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:

# Convert text data into numerical representations using BERT embeddings
model = SentenceTransformer('bert-base-nli-mean-tokens')

# encode() expects strings; a missing value (NaN) in combined_text would make
# it fail, so missing entries are replaced by '' before encoding.
corpus_texts = df['combined_text'].fillna('').astype(str).tolist()
embeddings = model.encode(corpus_texts, convert_to_tensor=True)

# scikit-learn operates on NumPy arrays, and a tensor held on a GPU cannot be
# converted implicitly, so copy the embeddings to host memory once.
embeddings_np = embeddings.cpu().numpy()

# Split the rows and their embedding vectors together so that X_train / X_val /
# X_test stay row-aligned with train_df / val_df / test_df. Passing several
# arrays to train_test_split applies the same shuffle to each of them.
train_df, test_df, X_train, X_test = train_test_split(
    df, embeddings_np, test_size=0.2, random_state=42
)
train_df, val_df, X_train, X_val = train_test_split(
    train_df, X_train, test_size=0.1, random_state=42
)

# Calculate cosine similarity between the validation products and all products.
# The validation vectors are reused from the split instead of being encoded again.
# NOTE(review): the result is a dense (len(val_df), len(df)) matrix — confirm
# it fits in memory for the full product table.
similarity_matrix = cosine_similarity(X_val, embeddings_np)


.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [8]:
# Show the column labels df has at this point
print(df.columns)


Index(['product_id', 'product_title', 'product_description',
       'product_bullet_point', 'product_brand', 'product_color',
       'product_locale', 'combined_text', 'processed_text'],
      dtype='object')


In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Feed-forward regressor: a 256-unit ReLU hidden layer followed by one linear output.
# NOTE(review): X_train / X_val must hold the feature rows matching train_df /
# val_df before this cell runs — confirm where they are built.
# NOTE(review): the column listing printed for df has no 'relevance' column —
# confirm it is added to train_df / val_df before this cell.
# NOTE(review): this rebinds `model`, which earlier referred to the SentenceTransformer.
model = Sequential([
    Dense(256, input_dim=X_train.shape[1], activation='relu'),
    Dense(1, activation='linear'),
])

# Adam optimizer with mean squared error as the training loss
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='mean_squared_error',
)

# Train for 10 epochs in batches of 32, scoring against the validation split each epoch
history = model.fit(
    X_train,
    train_df['relevance'].values,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, val_df['relevance'].values),
)
