In [1]:
# imports

import os
import re
import math
import json
from tqdm import tqdm
import random
from dotenv import load_dotenv
from huggingface_hub import login
import numpy as np
import pickle
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import chromadb
from items import Item
from testing import Tester
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# CONSTANTS

# Question text for pricing prompts; its tail ("to the nearest dollar?\n\n") is the
# separator that description() splits item prompts on.
QUESTION = "How much does this cost to the nearest dollar?\n\n"
# Path handed to chromadb.PersistentClient when the vector store is opened.
DB = "products_vectorstore"

In [4]:
# Shell escape: runs the Modal CLI `setup` command from the notebook.
# The captured output shows it waiting on browser-based token authentication.
!modal setup

⠋ Waiting for authentication in the web browser
The web browser should have opened for you to authenticate and get an API 
token.
If it didn't, please copy this URL into your web browser manually:

⠙ Waiting for authentication in the web browser
⠙ Waiting for authentication in the web browser
https://modal.com/token-flow/tf-yFTqu3poCKutD18mPE2xtl

⠙ Waiting for authentication in the web browser
⠙ Waiting for authentication in the web browser

⠋ Waiting for token flow to complete...
⠙ Waiting for token flow to complete...
⠹ Waiting for token flow to complete...
⠸ Waiting for token flow to complete...
⠼ Waiting for token flow to complete...
⠴ Waiting for token flow to complete...
⠧ Waiting for token flow to complete...
⠇ Waiting for token flow to complete...
⠏ Waiting for token flow to complete...
⠋ Waiting for token flow to complete...
⠙ Waiting for token flow to complete...
⠹ Waiting for token flow to complete...
⠼ Waiting for token flow to complete...
⠴ Waiting for token flow to compl

In [5]:
# environment
# Load variables from a .env file; override=True lets its values replace any already set.
load_dotenv(override=True)

# Fail fast with a readable message when a required key is missing.
# Copying os.getenv(...) straight back into os.environ is a no-op when the
# variable exists and raises an opaque "TypeError: str expected, not NoneType"
# when it does not, so check for presence explicitly instead.
_missing_keys = [key for key in ('OPENAI_API_KEY', 'HF_TOKEN') if not os.getenv(key)]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Add them to your .env file or export them before running this notebook."
    )

In [6]:
# Authenticate with the Hugging Face Hub using the token held in the environment

hf_token = os.environ['HF_TOKEN']
login(token=hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [7]:
# Another import after Logging in to Hugging Face - thank you Trung N.!

from items import Item

In [8]:
# Load in the test pickle file:
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open test.pkl if it comes from a trusted source.

with open('test.pkl', 'rb') as file:
    test = pickle.load(file)  # a later cell slices this object and iterates over the items

In [9]:
# Open the persistent Chroma store located at DB, then get (or create) its 'products' collection
client = chromadb.PersistentClient(path=DB)
collection = client.get_or_create_collection(name='products')

In [10]:
# Fetch records from the collection (no filter given), including embeddings, documents and metadata
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
documents = result['documents']
vectors = np.array(result['embeddings'])
# NOTE(review): `prices` is rebound later in this notebook when the ensemble dataset is collected
prices = list(map(lambda metadata: metadata['price'], result['metadatas']))

In [11]:
# Unpickle the object stored in random_forest_model.pkl (presumably a fitted random forest — confirm).
# NOTE(review): pickle.load can run arbitrary code; only load files you trust.
# NOTE(review): rf_model is not referenced by any other cell shown in this notebook.
with open('random_forest_model.pkl', 'rb') as f:
    rf_model = pickle.load(f)

In [12]:
from agents.specialist_agent import SpecialistAgent
from agents.frontier_agent import FrontierAgent
from agents.random_forest_agent import RandomForestAgent


In [13]:
# Create one instance of each pricing agent; only the frontier agent receives the Chroma collection
specialist, frontier, random_forest = (
    SpecialistAgent(),
    FrontierAgent(collection),
    RandomForestAgent(),
)

In [14]:
def description(item):
    """Return the product text embedded in ``item.prompt``.

    The text is the part of the prompt that follows the
    "to the nearest dollar?" question and precedes the "Price is $" suffix.
    Raises IndexError when the question is not present in the prompt.
    """
    question_tail = "to the nearest dollar?\n\n"
    price_marker = "\n\nPrice is $"
    after_question = item.prompt.split(question_tail)[1]
    body, _, _ = after_question.partition(price_marker)
    return body

In [15]:
def rf(item):
    """Price ``item`` with the random forest agent, using the description extracted from its prompt."""
    text = description(item)
    return random_forest.price(text)

In [16]:
# Sample product text passed to each agent's price() in the spot-check cells
product = 'Samsung Galaxy S20'

In [17]:
# The three agents were already constructed right after the agent imports, so
# reuse those instances instead of building a second set and repeating whatever
# setup their constructors perform.
# Sanity check: each agent must expose the price() method the next cells call.
assert all(hasattr(agent, 'price') for agent in (specialist, frontier, random_forest))

In [18]:
# Spot check: print each agent's estimate for the sample product, one per line
for agent in (specialist, frontier, random_forest):
    print(agent.price(product))

350.0
432.91
203.8251000000001


In [19]:
# Collect every agent's estimate, plus the item's recorded price, for items 1000-1249 of the test set
specialists, frontiers, random_forests, prices = [], [], [], []
held_out_items = test[1000:1250]
for item in tqdm(held_out_items):
    text = description(item)
    # Same call order as before: specialist, then frontier, then random forest
    specialist_estimate = specialist.price(text)
    specialists.append(specialist_estimate)
    frontier_estimate = frontier.price(text)
    frontiers.append(frontier_estimate)
    random_forest_estimate = random_forest.price(text)
    random_forests.append(random_forest_estimate)
    prices.append(item.price)

100%|██████████| 250/250 [14:28<00:00,  3.47s/it]


In [20]:
# Row-wise smallest and largest of the three agents' estimates
estimate_rows = list(zip(specialists, frontiers, random_forests))
mins = [min(row) for row in estimate_rows]
maxes = [max(row) for row in estimate_rows]

# Feature matrix: one column per agent, followed by the row-wise min and max
feature_data = {
    'Specialist': specialists,
    'Frontier': frontiers,
    'RandomForest': random_forests,
    'Min': mins,
    'Max': maxes,
}
X = pd.DataFrame(feature_data)

# Target values: the prices collected alongside the estimates, as a Series
y = pd.Series(prices)

In [21]:
# Fit a linear regression that maps the agent estimates (and their min/max) to the price
np.random.seed(42)

lr = LinearRegression().fit(X, y)

feature_columns = list(X.columns)

# Report the fitted coefficient for each feature column, then the intercept
for feature, coef in zip(feature_columns, lr.coef_):
    print(f"{feature}: {coef:.2f}")
print(f"Intercept={lr.intercept_:.2f}")

Specialist: 0.41
Frontier: 0.15
RandomForest: -0.42
Min: 0.44
Max: 0.40
Intercept=23.39


In [23]:
import joblib

# Write the fitted linear model to ensemble_model.pkl; the returned list of filenames is the cell output
joblib.dump(value=lr, filename='ensemble_model.pkl')

['ensemble_model.pkl']

In [24]:
from agents.ensemble_agent import EnsembleAgent
# Build the ensemble agent, passing it the same Chroma collection that FrontierAgent was given
ensemble = EnsembleAgent(collection)

In [25]:
# Spot check: the ensemble's estimate for the sample product is displayed as the cell output
ensemble.price(product)

408.30262321234045

In [26]:
def ensemble_pricer(item):
    """Return the ensemble agent's estimate for ``item``, never less than zero."""
    estimate = ensemble.price(description(item))
    return max(0, estimate)