In [112]:
from flask import json
import pandas as pd
import gzip

import os
import sys
import random
from google.colab import drive

import re
import nltk
from nltk.corpus import words

import torch
from torch.utils.data import Dataset, DataLoader

In [113]:
# Fetch the NLTK "words" corpus (a no-op message is printed when it is already
# up to date) and keep its entries in a set; clean_text() uses this set for
# membership checks.
nltk.download('words')
english_words = set(words.words())

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [114]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by any other cell visible here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [115]:
# Mount Google Drive in the Colab runtime and make the shared dataset folder
# the working directory, so the bare file names used by later cells resolve
# inside it.
drive.mount('/content/drive')
drive_folder = '/content/drive/Shared drives/CS_685_Project/PantryData'
%cd {drive_folder}

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/Shared drives/CS_685_Project/PantryData


In [116]:
def parse(path):
  """Yield one decoded JSON object per line of a gzip-compressed file.

  Args:
    path: location of a gzip file holding one JSON document per line.

  Yields:
    The object decoded from each non-blank line, in file order.

  Raises:
    OSError: if the file cannot be opened or is not valid gzip data.
    ValueError: if a non-blank line is not valid JSON.
  """
  # The context manager closes the file once the generator is exhausted,
  # explicitly closed, or garbage-collected; previously the handle was
  # never closed.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Skip blank lines (e.g. a trailing newline) instead of failing on them.
      if not l.strip():
        continue
      yield json.loads(l)

def getDF(path):
  """Load every record produced by parse(path) into a DataFrame.

  Each record is keyed by its 0-based position in the file, which becomes
  the row label; the keys of the records become the columns.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the review records and the product metadata. The bare file names are
# resolved against the current working directory.
df_data = getDF('Prime_Pantry.json.gz')
df_meta = getDF('meta_Prime_Pantry.json.gz')

In [117]:
# Show the column labels of the raw review and metadata frames.
print(df_data.columns, df_meta.columns, sep='\n')

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'image', 'style'],
      dtype='object')
Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat',
       'similar_item', 'date', 'price', 'asin', 'imageURL', 'imageURLHighRes'],
      dtype='object')


In [118]:
# Work on copies so the frames loaded from disk stay untouched.
dfd, dfm = df_data.copy(), df_meta.copy()

In [119]:
# Keep only the rows whose 'verified' flag equals True.
dfd = dfd.loc[dfd['verified'].eq(True)]

In [120]:
# Strip the columns the pipeline does not use (for the columns listed when the
# data was loaded, this leaves reviewerID/asin on the reviews and title/asin on
# the metadata).
# drop() now returns a new frame instead of mutating in place, and
# errors='ignore' lets the cell be re-run without a KeyError once the columns
# are already gone.
dfd = dfd.drop(columns=['overall', 'verified', 'reviewTime', 'style', 'vote',
                        'reviewerName', 'reviewText', 'unixReviewTime', 'image', 'summary'],
               errors='ignore')
dfm = dfm.drop(columns=['tech1', 'fit', 'also_buy', 'tech2',
                        'date', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
                        'price', 'imageURL', 'imageURLHighRes', 'details', 'brand', 'category', 'description'],
               errors='ignore')

In [121]:
# Confirm which columns survived the trimming step.
print(dfd.columns, dfm.columns, sep='\n')

Index(['reviewerID', 'asin'], dtype='object')
Index(['title', 'asin'], dtype='object')


In [122]:
# Count rows per reviewer and keep only reviewers with five or more of them.
reviewer_counts = dfd['reviewerID'].value_counts()
reviewer_ids_at_least_5 = reviewer_counts.loc[reviewer_counts.ge(5)].index
dfd = dfd.loc[dfd['reviewerID'].isin(reviewer_ids_at_least_5)]

In [123]:
# Turn the current row labels into a regular 'index' column and renumber the
# rows from 0. The 'index' column is dropped again after the merge below.
dfd.reset_index(inplace=True)

In [124]:
# Attach each product's title to its review rows; reviews whose asin has no
# metadata row are discarded by the inner join.
df = dfd.merge(dfm, how='inner', on='asin')

In [125]:
# Discard the helper 'index' column carried over from reset_index().
dff = df.drop('index', axis=1)

In [126]:
def clean_text(text, vocabulary=None):
    """Reduce free text to the vocabulary words it contains.

    Processing order:
      1. Replace every character that is not a word character, whitespace,
         '.', ',' or '?' with a space, and do the same for single word
         characters that have whitespace on both sides.
      2. Delete tokens that contain a digit or an underscore.
      3. Delete the words "misspelled", "meaningless", "wordss" and
         "symbols" (case-insensitive).
      4. Keep only whitespace-separated tokens whose lowercase form is in
         ``vocabulary``.
      5. Blank out one-character words, collapse runs of whitespace and strip
         the ends.

    Args:
        text: value to clean. ``None`` and float NaN (missing values) give an
            empty string; any other non-string value is converted with str().
        vocabulary: container of accepted lowercase words. Defaults to the
            notebook-level ``english_words`` set.

    Returns:
        The cleaned string; empty when nothing survives.
    """
    # Missing values used to be stringified to "None"/"nan", which could then
    # pass the vocabulary filter and show up as if they were title words.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    if vocabulary is None:
        vocabulary = english_words

    # Remove special characters except for '.', ',', and '?', plus single
    # characters surrounded by whitespace
    cleaned_text = re.sub(r'[^\w\s\.,\?]|(?<=\s)\w(?=\s)', ' ', text)
    # Remove words containing digits or underscores
    cleaned_text = re.sub(r'\b\w*[\d_]+\w*\b', '', cleaned_text)
    # Remove specified meaningless words
    cleaned_text = re.sub(r'\bmisspelled\b|\bmeaningless\b|\bwordss\b|\bsymbols\b', ' ', cleaned_text, flags=re.IGNORECASE)
    # Filter out words that are not in the vocabulary.
    # NOTE(review): tokens are split on whitespace only, so a word with '.',
    # ',' or '?' attached (e.g. "Tea,") is looked up with the punctuation and
    # is dropped unless the vocabulary contains that exact string.
    cleaned_text = ' '.join(word for word in cleaned_text.split() if word.lower() in vocabulary)
    # Remove one-character words
    cleaned_text = re.sub(r'\b\w\b', ' ', cleaned_text)
    # Remove extra whitespaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    return cleaned_text.strip()

# Run clean_text over every listed column of dff, overwriting the raw values.
columns_to_clean = ['title']
for column in columns_to_clean:
    dff[column] = dff[column].apply(clean_text)

In [127]:
# Collapse each reviewer's rows into one nested record of the form
#   {'data': {'total_asin': <row count>, '1': {...}, '2': {...}, ...}}
# where the numbered entries hold every column after the first one for each
# row, numbered from 1 in row order.
grouped_df = (
    dff.groupby('reviewerID')
    .apply(
        lambda purchases: {
            'data': {
                'total_asin': len(purchases),
                **{
                    str(position): record
                    for position, record in enumerate(
                        purchases.iloc[:, 1:].to_dict('records'), start=1
                    )
                },
            }
        }
    )
    .reset_index()
)

print(grouped_df)

                 reviewerID                                                  0
0      A09677047K94TJDE6FN3  {'data': {'total_asin': 6, '1': {'asin': 'B000...
1            A1006B7W9YFPL6  {'data': {'total_asin': 7, '1': {'asin': 'B000...
2            A100RH4M1W1DF0  {'data': {'total_asin': 53, '1': {'asin': 'B00...
3            A100V5QEICGPDA  {'data': {'total_asin': 6, '1': {'asin': 'B00I...
4            A100WO06OQR8BQ  {'data': {'total_asin': 15, '1': {'asin': 'B00...
...                     ...                                                ...
12390         AZY157FF14CSL  {'data': {'total_asin': 23, '1': {'asin': 'B00...
12391         AZYAKYI8M22TW  {'data': {'total_asin': 5, '1': {'asin': 'B000...
12392         AZYYC0U8YLW1Q  {'data': {'total_asin': 8, '1': {'asin': 'B00I...
12393         AZZINCU7WSW0T  {'data': {'total_asin': 6, '1': {'asin': 'B000...
12394         AZZTYBMA9JHTC  {'data': {'total_asin': 6, '1': {'asin': 'B00I...

[12395 rows x 2 columns]


In [128]:
# Give the two columns produced by the grouping step readable labels, then
# show the first row as a sanity check.
new_column_names = ['ID', 'info']
grouped_df = grouped_df.set_axis(new_column_names, axis=1)
print(grouped_df.iloc[0])

ID                                   A09677047K94TJDE6FN3
info    {'data': {'total_asin': 6, '1': {'asin': 'B000...
Name: 0, dtype: object


In [129]:
# Keep reviewers whose record holds entries '4' and '5', i.e. at least five
# purchases; the inner merge with the metadata can leave a reviewer with fewer
# rows than the five-review threshold applied earlier.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not trigger SettingWithCopyWarning on a slice.
grouped_df = grouped_df[grouped_df['info'].apply(lambda x: '4' in x['data'] and '5' in x['data'])].copy()

In [130]:
# Pull the titles of each reviewer's first five items out of the nested record
# into flat columns item_1_title ... item_5_title.
grouped_df = grouped_df.assign(**{
    f'item_{position}_title': grouped_df['info'].apply(
        lambda x, key=str(position): x['data'][key]['title']
    )
    for position in range(1, 6)
})

In [131]:
# Build the question text from the first four purchased titles
grouped_df['prompts'] = (
    "Reviewer has bought "
    + grouped_df['item_1_title'].str.cat(
        [grouped_df[f'item_{position}_title'] for position in (2, 3, 4)],
        sep=", ",
    )
    + ". Considering their purchase history, which product will they buy next from the below options?"
)

# The fifth purchased title is the expected answer
grouped_df['outputs'] = grouped_df['item_5_title']

In [132]:
# Inspect the metadata columns available for the title lookups below.
dfm.columns

Index(['title', 'asin'], dtype='object')

In [133]:
def get_titles(asins):
    """Return the metadata title for each asin, in the order given.

    For every asin the first matching row of the notebook-level ``dfm`` frame
    is used. An asin with no matching row raises IndexError.
    """
    return [dfm.loc[dfm['asin'] == asin, 'title'].iloc[0] for asin in asins]

def create_additional_columns(row):
    """Pick three distractor product titles for one reviewer.

    The distractors are drawn at random from the products in ``dfm`` that do
    not appear anywhere in the reviewer's purchase record.

    Args:
        row: a row of ``grouped_df``; ``row['info']['data']`` must hold
            'total_asin' and the numbered purchase entries '1'..'total_asin',
            each with an 'asin' key.

    Returns:
        pd.Series with the three titles under 'item_1_out', 'item_2_out' and
        'item_3_out'.

    Raises:
        ValueError: if fewer than three unpurchased products are available.
    """
    purchases = row['info']['data']
    total_asin = purchases['total_asin']
    purchased_asins = [purchases[str(i)]['asin'] for i in range(1, total_asin + 1)]

    # unique() so a product that has several rows in dfm cannot be drawn twice.
    candidate_asins = dfm.loc[~dfm['asin'].isin(purchased_asins), 'asin'].unique().tolist()
    # random.sample already picks uniformly without replacement, so the
    # shuffle that used to precede it was redundant.
    random_items = random.sample(candidate_asins, 3)

    # Clean the distractors the same way as the purchased titles. The correct
    # answer comes from the cleaned frame, so raw metadata titles here would
    # make it recognisable by formatting alone.
    titles = [clean_text(title) for title in get_titles(random_items)]
    return pd.Series(titles, index=['item_1_out', 'item_2_out', 'item_3_out'])

# Draw three distractor titles per reviewer and store them as new columns.
# NOTE(review): `random` is never seeded in the cells visible here, so the
# distractors differ from run to run.
grouped_df[['item_1_out', 'item_2_out', 'item_3_out']] = grouped_df.apply(create_additional_columns, axis=1)

In [135]:
def generate_prompt(row):
    """Format the answer and the three distractors as shuffled options A-D.

    Reads 'outputs', 'item_1_out', 'item_2_out' and 'item_3_out' from ``row``,
    shuffles them with the ``random`` module and returns a string of the form
    " A) ..., B) ..., C) ..., D) ...." (with a leading space).
    """
    option_keys = ('outputs', 'item_1_out', 'item_2_out', 'item_3_out')
    items = [row[key] for key in option_keys]
    random.shuffle(items)
    labelled = ", ".join(f"{letter}) {title}" for letter, title in zip("ABCD", items))
    return f" {labelled}."

# Build the shuffled multiple-choice option string for every reviewer.
grouped_df['prompts_part2'] = grouped_df.apply(generate_prompt, axis=1)

In [136]:
# Inspect the columns accumulated on grouped_df so far.
grouped_df.columns

Index(['ID', 'info', 'item_1_title', 'item_2_title', 'item_3_title',
       'item_4_title', 'item_5_title', 'prompts', 'outputs', 'item_1_out',
       'item_2_out', 'item_3_out', 'prompts_part2'],
      dtype='object')

In [137]:
# Append the option list to the question text, separated by a space.
# NOTE(review): prompts_part2 already starts with a space, so the joined text
# contains two; re-running this cell appends the options a second time.
grouped_df['prompts'] = grouped_df['prompts'] + ' ' + grouped_df['prompts_part2']

In [138]:
# Keep just the prompt/answer pair for the final dataset
LLM_data = grouped_df.loc[:, ['prompts', 'outputs']]

# Show the resulting frame
print(LLM_data)

                                                 prompts  \
0      Reviewer has bought Liquid Hand, Band Aid Bran...   
1      Reviewer has bought acute Clean Day Hand Lemon...   
2      Reviewer has bought Pocky Chocolate, Suave in ...   
3      Reviewer has bought Hunt Traditional, Ounce, B...   
4      Reviewer has bought acute Clean Day Hand Lemon...   
...                                                  ...   
12390  Reviewer has bought Original Sloppy Joe, Four,...   
12391  Reviewer has bought Scotch Non Scratch Scrub, ...   
12392  Reviewer has bought Fabric Extra, Always Prote...   
12393  Reviewer has bought Almond Breeze Dairy Free F...   
12394  Reviewer has bought Herbal Hello Hydration Sha...   

                                                 outputs  
0                                         Original Ounce  
1      Big Roll Toilet Paper Pack of Bath Ultra Soft ...  
2                                                 Orange  
3                                          

In [139]:
# Write the prompt/answer pairs to final_dataset.csv in the current working
# directory, without the row labels.
LLM_data.to_csv('final_dataset.csv', index=False)

In [19]:
# Disabled experiment, kept only as a string literal (nothing in it executes).
# It builds one prompt per reviewer from the first 70% of their items (capped
# at 4096 characters) and uses the remaining titles as the target.
# NOTE(review): it reads item_info['description'], but 'description' is among
# the metadata columns dropped earlier in this notebook, so it would need
# updating before being re-enabled.
'''# Create lists to store prompts and outputs
prompts = []
outputs = []

# Iterate over each row in the DataFrame
for index, row in grouped_df.iterrows():
    # Extract information from the 'info' dictionary
    info_dict = row['info']['data']

    # Calculate the number of products to include in the prompt
    num_products_prompt = int(0.7 * info_dict['total_asin'])

    # Initialize variable to keep track of prompt length
    prompt_length = 0

    # Construct prompts for the first 'num_products_prompt' items
    prompt = ""
    for i in range(1, min(num_products_prompt + 1, info_dict['total_asin'] + 1)):
        item_info = info_dict[str(i)]
        item_title = item_info['title']
        item_description = item_info['description']

        # Calculate length of current item information
        item_length = len(f"Reviewer has bought [{item_title}], whose description is [{item_description}]. ")

        # Check if adding current item information exceeds sequence length limit
        if prompt_length + item_length > 4096:
            print(item_length)
            break

        # Add current item information to prompt
        prompt += f"Reviewer has bought [{item_title}], whose description is [{item_description}]. "
        prompt_length += item_length

    prompt += "Considering their purchase history, what product will they buy next?"
    prompts.append(prompt)

    # Construct output string with titles of remaining items
    output = ", ".join([info_dict[str(i)]['title'] for i in range(num_products_prompt + 1, info_dict['total_asin'] + 1)])
    outputs.append(output)

# Create a new DataFrame with prompts and outputs
new_df = pd.DataFrame({'prompts': prompts, 'outputs': outputs})

# Print the first row value of the new DataFrame
print(new_df.iloc[0])'''

55
275
574
520
1145
1869
1532
1710
615
2475
1539
445
1814
442
999
339
847
1961
711
1729
1406
585
1500
1572
644
1311
288
2225
170
332
951
1157
811
1500
3458
1354
1164
1554
1141
669
2475
2546
2126
1411
1710
687
316
2116
3174
1123
1353
1282
1662
377
1826
1218
2927
1463
1783
2350
1699
631
591
1054
232
1532
1290
500
1353
2598
1829
2055
1211
2276
1411
1277
445
586
2462
373
447
1831
722
591
1747
978
915
1711
684
2167
1500
1861
534
2630
416
1798
1030
666
2225
209
1667
1986
1348
505
1263
785
1290
210
956
357
723
1054
914
775
357
2324
811
2274
1454
978
320
633
2475
1273
290
1710
1411
1711
1829
408
546
877
1523
2119
1314
1083
871
243
730
998
758
2167
2598
1783
1543
581
1783
515
1079
1986
978
359
2922
1410
1440
1829
1054
274
767
785
1951
1010
1018
941
1525
598
1017
969
562
487
1986
2122
418
372
1710
1612
1522
189
1265
2194
293
1585
1142
1427
1806
1277
1667
1554
440
1464
239
470
706
1427
1967
278
1539
1710
1158
990
1098
1667
722
2119
2819
2119
138
1353
1715
932
378
709
1841
1523
830
2006
1500
1354


In [21]:
# Disabled experiment, kept only as a string literal (nothing in it executes):
# a Dataset/DataLoader-based loader for the Sports_and_Outdoors files that
# saves the concatenated batches with torch.save.
'''
class CustomDataset(Dataset):
    def __init__(self, path):
        self.path = path
        self.data = self.load_data()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

    def load_data(self):
        data = []
        with gzip.open(self.path, 'rb') as file:
            for line in file:
                data.append(json.loads(line))
        return data

def load_data_in_batches(path, batch_size):
    dataset = CustomDataset(path)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    batches = []
    for batch in data_loader:
        batches.append(batch)
    return torch.cat(batches)

# Define batch size
batch_size = 64

# Load data in batches
tensor_data = load_data_in_batches('Sports_and_Outdoors.json.gz', batch_size)
tensor_meta = load_data_in_batches('meta_Sports_and_Outdoors.json.gz', batch_size)

# Save tensors to disk
torch.save(tensor_data, 'tensor_data.pt')
torch.save(tensor_meta, 'tensor_meta.pt')

# Load tensors from disk
#tensor_data_df = torch.load('tensor_data.pt')
#tensor_meta_df = torch.load('tensor_meta.pt')'''

# Disabled alternative, also only a string literal: read the same two files
# with pd.read_json(..., lines=True).
'''
df_data = pd.read_json('Sports_and_Outdoors.json.gz', lines=True)
df_meta = pd.read_json('meta_Sports_and_Outdoors.json.gz', lines=True)'''

"\ndf_data = pd.read_json('Sports_and_Outdoors.json.gz', lines=True)\ndf_meta = pd.read_json('meta_Sports_and_Outdoors.json.gz', lines=True)"