In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
import torch
import xgboost as xgb


In [None]:
# Column groups referenced by the cells below.
# FEATURE_COLUMNS: score columns available in both cqi_df and rev_df (after renaming).
# MISSING_COLUMNS: score columns that are missing from rev_df; these are the ones
# that will be predicted.
FEATURE_COLUMNS = [
    "Aroma",
    "Flavor",
    "Aftertaste",
    "Acidity",
    "Body",
]
MISSING_COLUMNS = [
    "Balance",
    "Uniformity",
    "Clean Cup",
    "Sweetness",
]

# Name of the pretrained checkpoint loaded for the text embeddings
MODEL_NAME = "TaylorAI/gte-tiny"

# CQI data (from web scraping)
cqi_df = pd.read_csv("data/cqi_5_23.csv")

# Review data (from database)
rev_df = pd.read_csv("data/rev_5_23.csv")

In [None]:
# Rename the lower-case review columns to the capitalised names used for the CQI data:
# aroma -> Aroma, acid -> Acidity, body -> Body, flavor -> Flavor, aftertaste -> Aftertaste.
# NOTE(review): the source column for Acidity is taken to be 'acid' (the note that used
# to sit here mentioned 'sour') — confirm against the review export.
column_renames = {
    "aroma": "Aroma",
    "acid": "Acidity",
    "body": "Body",
    "flavor": "Flavor",
    "aftertaste": "Aftertaste",
}

# Reassign instead of mutating in place; re-running the cell is still a no-op because
# DataFrame.rename ignores source names that are no longer present.
rev_df = rev_df.rename(columns=column_renames)

# rename() silently skips absent source columns, so verify the targets exist now
# rather than failing later when they are selected for prediction.
absent_columns = [name for name in column_renames.values() if name not in rev_df.columns]
if absent_columns:
    raise KeyError(f"rev_df is missing expected columns after renaming: {absent_columns}")

rev_df.columns

In [None]:
# Learn a mapping from the shared score columns to the columns rev_df lacks,
# using the CQI data where both groups are present.
X = cqi_df[FEATURE_COLUMNS]
y = cqi_df[MISSING_COLUMNS]

# XGBoost multi-output regressor (one output per tree).
# No train-test split here: the model is fitted on every CQI row.
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    multi_strategy="one_output_per_tree",
)
xgb_model.fit(X, y)

In [None]:
# Fill the columns that rev_df lacks with the fitted model's predictions,
# feeding it the same feature columns it was trained on.
rev_features = rev_df[FEATURE_COLUMNS]
rev_df[MISSING_COLUMNS] = xgb_model.predict(rev_features)

In [None]:
# Embedding model

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings.
        attention_mask: Mask with one entry per token; positions with 0 do not
            contribute to the average.

    Returns:
        The mask-weighted sum of the token embeddings along dimension 1, divided
        by the mask total along the same dimension (clamped to at least 1e-9 so
        that a fully masked row does not divide by zero).
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding dimension so it can weight each token vector.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(1)
    token_counts = mask.sum(1).clamp(min=1e-9)
    return weighted_sum / token_counts

# Load the tokenizer and the encoder for the checkpoint named by MODEL_NAME.
# MODEL_NAME is already a string, so it is passed directly instead of being
# wrapped in an f-string.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)



In [None]:
# Embed each free-text description column, concatenate the per-column embeddings
# side by side, then compress them with PCA.
description_columns = ['desc_1', 'desc_2', 'desc_3']
embeddings = []
for desc_col in description_columns:
    # The tokenizer needs strings, so missing descriptions become empty strings.
    rev_df[desc_col] = rev_df[desc_col].fillna('')
    encoded_input = tokenizer(rev_df[desc_col].to_list(), padding=True, truncation=True, return_tensors='pt')
    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings.append(mean_pooling(model_output, encoded_input['attention_mask']))

# Keep the stacked matrix itself (previously only its `.shape` was stored, which
# made PCA receive a torch.Size instead of the data) and hand scikit-learn a
# NumPy array.
stacked_embeddings = torch.hstack(embeddings).detach().cpu().numpy()

# A float n_components in (0, 1) keeps as many components as are needed to
# explain that fraction of the variance (here 67%).
embeddings_reduced = PCA(n_components=0.67).fit_transform(stacked_embeddings)


In [None]:
# Combine all features: the score columns (given and predicted) next to the
# reduced description embeddings.
# NOTE(review): `shared_ls` ends up holding only the shape of the combined matrix,
# not the matrix itself — confirm that this is what later code expects.
score_columns = FEATURE_COLUMNS + MISSING_COLUMNS
combined_features = np.hstack([rev_df[score_columns].to_numpy(), embeddings_reduced])
shared_ls = combined_features.shape

In [None]:
# Predict rating from the score columns (given and predicted) plus the reduced
# description embeddings.
X = np.hstack([rev_df[FEATURE_COLUMNS + MISSING_COLUMNS].to_numpy(), embeddings_reduced])
y = rev_df['rating']

# Train-test split (80/20). The split is seeded so the reported score is
# reproducible across runs, matching the seeded model below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost. Note that this rebinds `xgb_model`, replacing the model that was
# used earlier to fill in the missing score columns.
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate on the held-out rows (for a regressor, `score` reports R^2).
xgb_model.score(X_test, y_test)
