In [3]:
from google.colab import files
import io
import pandas as pd

uploaded = files.upload()
for fn in uploaded:
    df = pd.read_csv(io.BytesIO(uploaded[fn]), nrows=20000)
    break

print(df.columns)
print(df.head())


Saving final_extended_dataset_filtered.csv to final_extended_dataset_filtered (1).csv
Index(['Category', 'Subcategory', 'Type', 'Item_Name', 'Item_Variant',
       'Sales_Type', 'Source', 'Sales_Count', 'Revenue'],
      dtype='object')
  Category Subcategory          Type        Item_Name     Item_Variant  \
0    Sales   BEVERAGES  COFFEE BASED  Americano (Hot)  Americano (Hot)   
1    Sales   BEVERAGES  COFFEE BASED  Americano (Hot)  Americano (Hot)   
2    Sales   BEVERAGES  COFFEE BASED  Americano (Hot)  Americano (Hot)   
3    Sales   BEVERAGES  COFFEE BASED       Cappuccino       Cappuccino   
4    Sales   BEVERAGES  COFFEE BASED       Cappuccino       Cappuccino   

  Sales_Type  Source  Sales_Count  Revenue  
0  Ala Carte  EZO QS           18    22000  
1  Ala Carte     POS          428    22000  
2  Ala Carte    GRAB            1    24000  
3  Ala Carte  EZO QS           14    25000  
4  Ala Carte     POS          162    25000  


In [4]:
df = df.drop_duplicates().dropna()
df.columns = df.columns.str.strip().str.lower()
text_cols = ['item_name','item_variant','category','subcategory']
df['text'] = (
    df[text_cols]
      .astype(str)
      .agg(' '.join, axis=1)
      .str.lower()
)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
# X will be (n_items × n_features)
X = vectorizer.fit_transform(df['text']).toarray()


In [6]:
import tensorflow as tf
from tensorflow.keras import layers, Model
n_features    = X.shape[1]
encoding_dim  = 64

inp = layers.Input(shape=(n_features,))
h1  = layers.Dense(256, activation='relu')(inp)
bottleneck = layers.Dense(encoding_dim, activation='relu')(h1)

h2  = layers.Dense(256, activation='relu')(bottleneck)
out = layers.Dense(n_features, activation='sigmoid')(h2)

autoencoder = Model(inp, out)
encoder     = Model(inp, bottleneck)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X, X,
                epochs=20,
                batch_size=128,
                shuffle=True,
                verbose=1)

Epoch 1/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 0.1351
Epoch 2/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.0032
Epoch 3/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.0032
Epoch 4/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.0032
Epoch 5/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.0032
Epoch 6/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.0032
Epoch 7/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.0032
Epoch 8/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.0032
Epoch 9/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.0032
Epoch 10/20
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.0032
Epoch 11/

<keras.src.callbacks.history.History at 0x7aee9b9b8a10>

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

item_embeds = encoder.predict(X)

sim_matrix = cosine_similarity(item_embeds)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


In [13]:
def recommend(item_index, top_n=5):
    sims = sim_matrix[item_index].copy()
    sims[item_index] = -1
    nearest = sims.argsort()[::-1][:top_n]
    return df.iloc[nearest][text_cols]

print("Query item:", df.iloc[0][text_cols].to_list())
print("\nTop 5 similar:")
print(recommend(0, 5).to_string(index=False))

Query item: ['Americano (Hot)', 'Americano (Hot)', 'Sales', 'BEVERAGES']

Top 5 similar:
      item_name    item_variant category subcategory
Americano (Hot) Americano (Hot)    Sales   BEVERAGES
Americano (Hot) Americano (Hot)    Sales   BEVERAGES
Americano (Hot) Americano (Hot)    Sales   BEVERAGES
Americano (Hot) Americano (Hot)    Sales   BEVERAGES
Americano (Hot) Americano (Hot)    Sales   BEVERAGES


In [12]:
def recommend_from_different_categories(item_index, top_n=5):
    item_category = df.iloc[item_index]['category']

    sims = sim_matrix[item_index].copy()

    sims[item_index] = -1
    nearest = sims.argsort()[::-1][:top_n * 2]

    nearest_df = df.iloc[nearest]

    recommended_items = nearest_df[nearest_df['category'] != item_category]

    if len(recommended_items) < top_n:
        same_cat_items = nearest_df[nearest_df['category'] == item_category]
        recommended_items = pd.concat([recommended_items, same_cat_items], axis=0).head(top_n)
    else:
        recommended_items = recommended_items.head(top_n)

    return recommended_items[text_cols]

print("Query item:", df.iloc[0][text_cols].to_list())
print("\nTop 5 similar (from different categories):")
print(recommend_from_different_categories(0, 5).to_string(index=False))


Query item: ['Americano (Hot)', 'Americano (Hot)', 'Sales', 'BEVERAGES']

Top 5 similar (from different categories):
      item_name    item_variant category subcategory
Americano (Hot) Americano (Hot)    Sales   BEVERAGES
Americano (Hot) Americano (Hot)    Sales   BEVERAGES
Americano (Hot) Americano (Hot)    Sales   BEVERAGES
Americano (Hot) Americano (Hot)    Sales   BEVERAGES
Americano (Hot) Americano (Hot)    Sales   BEVERAGES
