In [1]:
# Import libraries
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

import sys
import os

In [2]:
# Get the current working directory
current_dir = os.getcwd()
# Move up one level from the current directory
parent_dir = os.path.dirname(current_dir)
# Change directory into data directory
data_dir = os.path.join(parent_dir, 'data')

In [3]:
sys.path.append(parent_dir)

# Model

In [4]:
df = pd.read_csv(os.path.join(data_dir, 'products.txt'), delimiter='|', index_col=None)

In [5]:
df.head()

Unnamed: 0,item_desc,corp_item_brand_name,pim_item_class_desc,pim_item_sub_class_desc,state,flavor,pim_tasting_notes
0,-196 CKTL VOD A(DL/G/P) CAN 3/8PK,-196,COCKTAILS,COCKTAILS-OTHER,TX,OTHER,
1,-196 CKTL VOD PEACH 12 CAN 6/4PK,-196,COCKTAILS,COCKTAILS-OTHER,TX,PEACH,
2,-196 CKTL VOD DBL LEM 12 CAN 6/4PK,-196,COCKTAILS,COCKTAILS-OTHER,TX,LEMON,Fresh lemon peel. Tart and light sweet with ch...
3,-196 CKTL VOD GRFRUIT 12 CAN 6/4P,-196,COCKTAILS,COCKTAILS-OTHER,TX,GRAPEFRUIT,
4,10 CANE RUM 80,10 CANE,RUM,GOLD RUM,TX,,


In [6]:
# Fill NaN values in the 'flavor' column with 'not applicable'
df['flavor'] = df['flavor'].fillna('not applicable')

In [17]:
# Add new columns and generate random data
df['alcohol_percentage'] = np.random.uniform(5, 15, size=len(df))  # Alcohol % between 5 and 15
df['sweetness_level'] = np.random.randint(1, 11, size=len(df))     # Sweetness level between 1 and 10
df['bitterness_level'] = np.random.randint(1, 11, size=len(df))    # Bitterness level between 1 and 10
df['acidity_level'] = np.random.randint(1, 11, size=len(df))       # Acidity level between 1 and 10
df['tannin_level'] = np.random.randint(1, 11, size=len(df))        # Tannin level between 1 and 10
df['body'] = np.random.choice(['Light', 'Medium', 'Full'], size=len(df))  # Body type
df['serving_temperature'] = np.random.uniform(40, 65, size=len(df))  # Temperature in °F
df['vintage_year'] = np.random.randint(2000, 2023, size=len(df))     # Random vintage year
df['grape_variety'] = np.random.choice(['Chardonnay', 'Cabernet Sauvignon', 'Merlot'], size=len(df))  # Grape variety
df['region'] = np.random.choice(['Napa Valley', 'Sonoma', 'Rioja', 'Tuscany'], size=len(df))  # Wine region
df['price'] = np.random.uniform(10, 100, size=len(df))  # Price in USD
df['food_pairing'] = np.random.choice(['Cheese', 'Grilled Chicken', 'Pasta'], size=len(df))  # Suggested food pairing
df['aroma'] = np.random.choice(['Fruity', 'Floral', 'Spicy'], size=len(df))  # Aroma profile
df['mouthfeel'] = np.random.choice(['Silky', 'Velvety', 'Rough'], size=len(df))  # Mouthfeel
df['finish'] = np.random.choice(['Short', 'Medium', 'Long'], size=len(df))  # Finish length

In [18]:
df.columns

Index(['item_desc', 'corp_item_brand_name', 'pim_item_class_desc',
       'pim_item_sub_class_desc', 'state', 'flavor', 'pim_tasting_notes',
       'alcohol_percentage', 'sweetness_level', 'bitterness_level',
       'acidity_level', 'tannin_level', 'body', 'serving_temperature',
       'vintage_year', 'grape_variety', 'region', 'price', 'food_pairing',
       'aroma', 'mouthfeel', 'finish'],
      dtype='object')

In [19]:
df.dtypes

item_desc                   object
corp_item_brand_name        object
pim_item_class_desc         object
pim_item_sub_class_desc     object
state                       object
flavor                      object
pim_tasting_notes           object
alcohol_percentage         float64
sweetness_level              int32
bitterness_level             int32
acidity_level                int32
tannin_level                 int32
body                        object
serving_temperature        float64
vintage_year                 int32
grape_variety               object
region                      object
price                      float64
food_pairing                object
aroma                       object
mouthfeel                   object
finish                      object
dtype: object

In [20]:
df.head()

Unnamed: 0,item_desc,corp_item_brand_name,pim_item_class_desc,pim_item_sub_class_desc,state,flavor,pim_tasting_notes,alcohol_percentage,sweetness_level,bitterness_level,...,body,serving_temperature,vintage_year,grape_variety,region,price,food_pairing,aroma,mouthfeel,finish
0,-196 CKTL VOD A(DL/G/P) CAN 3/8PK,-196,COCKTAILS,COCKTAILS-OTHER,TX,OTHER,,13.41717,10,3,...,Full,42.93,2008,Chardonnay,Tuscany,53.103866,Pasta,Fruity,Rough,Long
1,-196 CKTL VOD PEACH 12 CAN 6/4PK,-196,COCKTAILS,COCKTAILS-OTHER,TX,PEACH,,8.623515,9,3,...,Medium,59.049843,2018,Cabernet Sauvignon,Sonoma,94.475113,Pasta,Floral,Silky,Long
2,-196 CKTL VOD DBL LEM 12 CAN 6/4PK,-196,COCKTAILS,COCKTAILS-OTHER,TX,LEMON,Fresh lemon peel. Tart and light sweet with ch...,10.249378,2,8,...,Medium,49.068042,2003,Cabernet Sauvignon,Sonoma,85.019099,Cheese,Spicy,Silky,Medium
3,-196 CKTL VOD GRFRUIT 12 CAN 6/4P,-196,COCKTAILS,COCKTAILS-OTHER,TX,GRAPEFRUIT,,6.012468,5,4,...,Medium,51.566561,2013,Cabernet Sauvignon,Rioja,87.519005,Cheese,Spicy,Silky,Short
4,10 CANE RUM 80,10 CANE,RUM,GOLD RUM,TX,,,10.071682,5,8,...,Full,61.513052,2016,Merlot,Napa Valley,89.105149,Pasta,Fruity,Rough,Short


In [21]:
def recommend_wines(df, item_desc):
    # Define the columns to be used for similarity calculation
    features = ['alcohol_percentage', 'sweetness_level', 'bitterness_level', 'acidity_level', 'tannin_level', 'serving_temperature', 'price']
    
    # Handle missing values by filling with the mean of each column
    df[features] = df[features].fillna(df[features].mean())
    
    # Standardize the features
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df[features])
    
    # Get the index of the item_desc
    idx = df.index[df['item_desc'] == item_desc].tolist()
    
    if not idx:
        return "Item description not found."
    
    idx = idx[0]
    
    # Calculate cosine similarity
    cosine_sim = cosine_similarity(df_scaled)
    
    # Get similarity scores for the selected item
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the wines by similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices of the top 3 similar wines
    top_indices = [i[0] for i in sim_scores[1:4]]
    
    # Return the top 3 similar wines
    recommendations = df.iloc[top_indices]
    
    return recommendations[['item_desc', 'corp_item_brand_name', 'pim_item_class_desc', 'pim_item_sub_class_desc', 'flavor', 'price']]

In [27]:
# Assuming df is your DataFrame and data_dir is your directory path
df.to_csv(os.path.join(data_dir, 'products.csv'), index=False)


In [22]:
result = recommend_wines(df, '-196 CKTL VOD DBL LEM 12 CAN 6/4PK')
print(result)

                               item_desc corp_item_brand_name  \
9950   LARCENY BBN VERY SP SB(6696086)92              LARCENY   
11557  NIKKA WHISKY SMALT YOICHI(2019)96                NIKKA   
8536   IRONROOT BBN HARBINGER(#508)121.8    IRONROOT REPUBLIC   

      pim_item_class_desc pim_item_sub_class_desc flavor      price  
9950     WHISKEY-AMERICAN        STRAIGHT BOURBON    NaN  79.907668  
11557       WHISKEY-OTHER      SINGLE MALT WHISKY    NaN  84.752384  
8536     WHISKEY-AMERICAN                 BOURBON    NaN  97.047430  
