In [1]:
import os 
from dotenv import load_dotenv
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from datasets import load_dataset

In [2]:
# dataset = load_dataset('ed-donner/pricer-data')
# len(dataset)

In [3]:
from items import Item
import pickle
# Load the pickled training items from train.pkl into memory.
# NOTE: pickle.load can execute arbitrary code embedded in the file — only load
# a train.pkl that you created yourself or otherwise trust.
# NOTE(review): presumably the pickled objects are `Item` instances, which would be
# why `Item` is imported above even though it is never referenced — confirm.
with open('train.pkl','rb')as f:
    train_full = pickle.load(f)

In [4]:
len(train_full)

400000

In [5]:
train = train_full[:100_000]

In [6]:
len(train)

100000

In [7]:
DB = "product_vectorstore"
COLLECTION_NAME = "products"

In [8]:
import chromadb

client = chromadb.PersistentClient(DB)
collection = client.get_or_create_collection(COLLECTION_NAME)

In [None]:
from tqdm import tqdm

In [None]:
from sentence_transformers import SentenceTransformer
encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') 

In [None]:
def description(item):
    r"""Return the product text of ``item.prompt`` without question or price.

    Every occurrence of the question line
    "How much does this cost to the nearest dollar?\n\n" is removed, then the
    text is cut at the first "\n\nPrice is $" marker. A prompt that contains
    neither piece is returned unchanged.
    """
    question = "How much does this cost to the nearest dollar?\n\n"
    price_marker = "\n\nPrice is $"
    without_question = item.prompt.replace(question, "")
    # partition() yields everything before the first marker, or the whole
    # string when the marker is absent.
    product_text, _, _ = without_question.partition(price_marker)
    return product_text

In [None]:
# Embed the training descriptions in fixed-size batches and add them to the
# Chroma collection together with their category/price metadata.
BATCH_SIZE = 500
for i in tqdm(range(0, len(train), BATCH_SIZE)):
    batch = train[i:i + BATCH_SIZE]
    docs = [description(t) for t in batch]
    vectors = encoder.encode(docs).astype(float).tolist()
    metadatas = [{'category': t.category, 'price': t.price} for t in batch]
    # Size the ids from the actual batch: the final slice is shorter whenever
    # len(train) is not a multiple of BATCH_SIZE, and ids must line up 1:1
    # with docs/embeddings/metadatas.
    ids = [f"doc_{j}" for j in range(i, i + len(batch))]
    collection.add(
        ids=ids,
        documents=docs,
        embeddings=vectors,
        metadatas=metadatas
    )

In [9]:
collection.count()

100000

In [None]:
from Frontier_Agent import Frontier_Agent

In [None]:
f_agent = Frontier_Agent(collection)

In [None]:
f_agent.price("Iphone 6")

In [None]:
result = collection.get(include=['embeddings','documents','metadatas'])

In [None]:
vectors = result['embeddings']
prices = [m['price'] for m in result['metadatas']]
len(prices)

In [None]:
rf_model = RandomForestRegressor(n_estimators=50,random_state=42)

In [None]:
X = vectors[:50000]
Y = prices[:50000]

In [None]:
Y[0]

In [None]:
from tqdm import tqdm
# fit() blocks until training has finished and returns the estimator itself, so
# wrapping the call in tqdm() could never report training progress. Call fit()
# directly; for progress output, build the forest with verbose=1 instead.
rf_model.fit(X, Y)

In [None]:
# import pickle
# with open('rf_model.pkl','wb')as f:
#     pickle.dump(rf_model,f)

In [None]:
# Load a previously pickled model from rf_model.pkl and bind it to rf_model.
# NOTE: pickle.load can execute arbitrary code embedded in the file — only load
# an rf_model.pkl that you produced yourself or otherwise trust.
with open('rf_model.pkl','rb')as f:
    rf_model = pickle.load(f)

In [41]:
# Build the demo prompt with the same helper that produced the indexed
# documents, so the agents receive text in the same shape. The previous inline
# split on "Price is $" left the trailing blank line attached to the prompt.
test_prompt = description(train_full[399_999])

In [42]:
train_full[399_999].prompt


'How much does this cost to the nearest dollar?\n\n50 Pack Outlet Wall Plate Insulation Gasket Weatherproof Wall Gasket Replacement EVA Insulation Foam Gasket Sealer Outlet Insulation Pads Insulated Covers Plug Sealer Light Switch Socket Insulator\nFeatures Convenient and lightweight The wall outlet sealer is compact and lightweight without taking up much space, which can be stored at home, office or hotels for replacement needs, providing more convenience for you. Necessary home supplies The foam outlet insulators provide a great way to seal tiny cold or hot air that flows through wall outlets and switches, saving energy for your home. Specifications Material EVA Color white Size 63 x 107 mm/ 2.5 x 4.2 inches Package includes 50 x Light switch insulation gaskets Notes Manual measurement, please allow slight errors on size. The color\n\nPrice is $9.00'

In [43]:
from RandomForest_Agent import RandomForest_Agent 
from Frontier_Agent import Frontier_Agent
from LLAMA_Agent import LLAMA_Agent

In [45]:
rf_model = RandomForest_Agent()
f_agent = Frontier_Agent(collection)
ll_agent = LLAMA_Agent()

In [46]:
f_agent.price(test_prompt)

14.99

In [47]:
rf_model.price(test_prompt)

129.86499999999995

In [48]:
ll_agent.price(test_prompt)

19.0

In [None]:
!modal token new

In [None]:
# SECURITY: a Modal token id/secret pair used to be hard-coded in this cell.
# Treat that pair as leaked: revoke it in the Modal dashboard and create a new
# one. Credentials are now read from the environment and never stored here.
import locale
import os
import subprocess

# Raises KeyError when a variable is missing, rather than calling the CLI with
# an empty credential.
modal_token_id = os.environ["MODAL_TOKEN_ID"]
modal_token_secret = os.environ["MODAL_TOKEN_SECRET"]

# List-form argv: no shell is involved, so the secret is never interpolated
# into a command string. check=True surfaces a failing CLI call as an error.
subprocess.run(
    [
        "modal", "token", "set",
        "--token-id", modal_token_id,
        "--token-secret", modal_token_secret,
    ],
    check=True,
)

# Check the default encoding
print(locale.getpreferredencoding())  # Should print 'UTF-8'

# Ensure UTF-8 encoding
os.environ['PYTHONIOENCODING'] = 'utf-8'


In [None]:
!modal deploy model_upload

In [None]:
import modal
PRICE_PREDICT = modal.Cls.lookup("Llama_Pricer","Price_Predict")
pricer_obj = PRICE_PREDICT()
pricer_obj.price.remote(test_prompt)

In [None]:
test_prompt

In [10]:
from LLAMA_Agent import LLAMA_Agent
from Frontier_Agent import Frontier_Agent
from RandomForest_Agent import RandomForest_Agent

rf_model = RandomForest_Agent()


In [11]:
fr_model = Frontier_Agent(collection)

In [12]:
ll_model = LLAMA_Agent()

In [17]:
llama_predictions = []
frontier_predictions = []
rf_predictions = []
actual_prices = []

In [15]:
len(train_full)

400000

In [16]:
def description(item):
    r"""Return the product text of ``item.prompt`` without question or price.

    The text after the "to the nearest dollar?\n\n" question header is taken
    and cut at the first "\n\nPrice is $" marker. A prompt without the question
    header is used as-is instead of raising ``IndexError``, so plain
    descriptions can be passed through this helper too.

    Args:
        item: any object exposing a ``prompt`` string attribute.

    Returns:
        The description portion of the prompt as a string.
    """
    parts = item.prompt.split("to the nearest dollar?\n\n")
    # Indexing [1] unconditionally crashed on prompts that lack the header;
    # fall back to the whole prompt in that case.
    body = parts[1] if len(parts) > 1 else parts[0]
    return body.split("\n\nPrice is $")[0]

In [22]:
from tqdm import tqdm
# Start from empty accumulators inside this cell so that re-running it cannot
# append a second copy of the predictions to lists left over from an earlier
# run (which would silently duplicate rows in the ensemble training data).
llama_predictions = []
frontier_predictions = []
rf_predictions = []
actual_prices = []
# Query each of the three agents for every item in the slice and record the
# item's true price alongside the predictions.
for item in tqdm(train_full[300_000:300_250]):
    desc = description(item)
    llama_predictions.append(ll_model.price(desc))
    frontier_predictions.append(fr_model.price(desc))
    rf_predictions.append(rf_model.price(desc))
    actual_prices.append(item.price)
    

100%|████████████████████████████████████████████████████████████████████████████████| 250/250 [14:16<00:00,  3.43s/it]


In [23]:
len(actual_prices)

250

In [24]:
# Per-item lowest and highest of the three model predictions; each zipped
# tuple holds one (llama, frontier, random-forest) prediction triple.
Mins = list(map(min, zip(llama_predictions, frontier_predictions, rf_predictions)))
Max = list(map(max, zip(llama_predictions, frontier_predictions, rf_predictions)))

In [25]:
len(Mins)

250

In [28]:
len(Max)


250

In [20]:
import pandas as pd

In [29]:
Y = pd.Series(actual_prices)
X = pd.DataFrame({
    "Llama":llama_predictions,
    "Frontier":frontier_predictions,
    "RandomForest":rf_predictions,
    "Mins":Mins,
    "Max":Max
}
)

In [30]:
lr_model = LinearRegression()

In [31]:
lr_model.fit(X,Y)

In [32]:
features = X.columns.tolist()

In [33]:
features

['Llama', 'Frontier', 'RandomForest', 'Mins', 'Max']

In [36]:
for f,c in zip(features,lr_model.coef_):
    print(f"{f} = {c}")
print(f"Intercept {lr_model.intercept_:.2f}")

Llama = 0.3296148524702103
Frontier = 0.8107245427600348
RandomForest = 0.26537447983215867
Mins = -0.2506674585414073
Max = -0.18906220590611827
Intercept 11.74


In [37]:
import joblib

In [38]:
# joblib.dump(lr_model,"lr_model.pkl")

['lr_model.pkl']

In [39]:
collection.count()

100000

In [49]:
from Ensemble_Agent import Ensemble_Agent

en_agent = Ensemble_Agent(collection)

In [50]:
en_agent.price(test_prompt)

36.31232344049136