In [2]:
import pandas as pd
import glob
import os
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

In [5]:
# Step 2: Load Amazon Sales Dataset
def load_amazon_sales_data():
    folder_path = 'D:/PycharmProjects/pythonProject/archive'  # Change this to your dataset path
    all_files = glob.glob(os.path.join(folder_path, "*.csv"))
    
    df_list = []
    for file in all_files:
        df = pd.read_csv(file)
        df_list.append(df)
    
    return pd.concat(df_list, ignore_index=True)

In [6]:
# Load the Amazon Sales Dataset
amazon_sales_data = load_amazon_sales_data()
print("Amazon Sales Data Loaded:")
print(amazon_sales_data.head())  # Display the first few rows


Amazon Sales Data Loaded:
                                                name main_category  \
0  Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...    appliances   
1  LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C...    appliances   
2  LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop...    appliances   
3  LG 1.5 Ton 3 Star AI DUAL Inverter Split AC (C...    appliances   
4  Carrier 1.5 Ton 3 Star Inverter Split AC (Copp...    appliances   

       sub_category                                              image  \
0  Air Conditioners  https://m.media-amazon.com/images/I/31UISB90sY...   
1  Air Conditioners  https://m.media-amazon.com/images/I/51JFb7FctD...   
2  Air Conditioners  https://m.media-amazon.com/images/I/51JFb7FctD...   
3  Air Conditioners  https://m.media-amazon.com/images/I/51JFb7FctD...   
4  Air Conditioners  https://m.media-amazon.com/images/I/41lrtqXPiW...   

                                                link ratings no_of_ratings  \
0  https://www.amazon.in/Lloyd

In [7]:
# Step 3: Load Amazon-C4 Dataset
amazon_c4 = load_dataset('McAuley-Lab/Amazon-C4')['test']
print("\nAmazon-C4 Data Loaded:")
print(amazon_c4)  # Display dataset info

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating test split: 100%|██████████| 21223/21223 [00:00<00:00, 33719.56 examples/s]


Amazon-C4 Data Loaded:
Dataset({
    features: ['qid', 'query', 'item_id', 'user_id', 'ori_rating', 'ori_review'],
    num_rows: 21223
})





In [24]:
# Step 4: Create Combined Text Column for Sales Data
amazon_sales_data['combined_text'] = amazon_sales_data.apply(lambda x: ' '.join(x.astype(str)), axis=1)


In [25]:
# For Amazon-C4 Dataset
amazon_c4_texts = amazon_c4['query']

In [26]:
# Step 6: Load the Sentence Transformer model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')




In [27]:
# Step 5: Save Chunks of Combined Text
def create_chunks(data, chunk_size):
    for i in range(0, len(data), chunk_size):
        yield data[i:i + chunk_size]

In [28]:
def save_chunks(data, chunk_size, base_filename):
    os.makedirs('chunks', exist_ok=True)  # Create a directory to save chunks
    for i, chunk in enumerate(create_chunks(data, chunk_size)):
        chunk_df = pd.DataFrame(chunk, columns=['combined_text'])  # Specify column name
        chunk_df.to_csv(f'chunks/{base_filename}_chunk_{i + 1}.csv', index=False)
    print(f"Chunks saved for {base_filename}.")

In [29]:
# Save chunks for Amazon Sales Dataset
save_chunks(amazon_sales_data[['combined_text']], chunk_size=100, base_filename='amazon_sales')  # Pass only combined_text column


Chunks saved for amazon_sales.


In [30]:
# Step 7: Load Previously Saved Chunks
def load_chunks(base_filename):
    chunk_files = glob.glob(f'chunks/{base_filename}_chunk_*.csv')
    chunk_dfs = [pd.read_csv(file) for file in chunk_files]
    return pd.concat(chunk_dfs, ignore_index=True)


In [None]:
# Load previously saved chunks
loaded_sales_chunks = load_chunks('amazon_sales')
print("\nLoaded Sales Chunks:")
print(loaded_sales_chunks.head())

In [23]:
# Step 8: Create Embeddings for Loaded Sales Chunks
loaded_sales_embeddings = []
for chunk in loaded_sales_chunks['combined_text'].tolist():
    embedding = embedding_model.encode(chunk)
    loaded_sales_embeddings.append(embedding)
    print("\nEmbeddings Created for Loaded Sales Chunks.")


Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embeddings Created for Loaded Sales Chunks.

Embedding

KeyboardInterrupt: 

In [None]:
# Step 9: Create Embeddings for Amazon-C4 Dataset
amazon_c4_embeddings = embedding_model.encode(amazon_c4['query'])


In [None]:
print("\nEmbeddings Created for Amazon-C4 Dataset.")