In [21]:
# GloVe 6B pre-trained word vectors (the archive that provides glove.6B.50d.txt, loaded below):
# https://nlp.stanford.edu/data/glove.6B.zip

In [22]:
import os
import joblib
import numpy as np
from sklearn.cluster import KMeans

In [23]:
# Seed phrases used to fit the KMeans model, grouped by the category they are meant to represent.
# train_model() flattens the value lists and ignores the keys, so the grouping here is
# informational only: the clustering itself is unsupervised.
# NOTE(review): the key spellings 'enternaiment' / 'investiments' match the `categories` list;
# they are runtime strings, so rename them everywhere at once if they are ever corrected.
initial_data = {
    "food": [
        "Pizza", "Burger", "Sushi", "Pasta", "Salad", "Steak", "Tacos", "Ramen", "Sandwich", "Ice Cream",
        "Chocolate", "Cheese", "Fried Chicken", "Seafood", "Hot Dog", "Bread", "Soup", "Rice", "Fruit",
        "Vegetables", "Pancakes", "Waffles", "Cereal", "Yogurt", "Smoothie", "Cookies", "Brownies",
        "Doughnuts", "Chips", "Popcorn", "Noodles", "Quinoa", "Lentils", "Beans", "Avocado", "Nuggets",
        "BBQ Ribs", "Omelette", "Sausage", "Fish", "Shrimp", "Lobster", "Curry", "Tortilla", "Muffins",
        "Bagels", "Granola", "Pies", "Cakes"
    ],

    "enternaiment": [
        "Movies", "Series", "Video Games", "Board Games", "Podcasts", "Music", "Concerts", "Theater",
        "Live Shows", "Stand-up Comedy", "Streaming", "YouTube", "Sports Events", "Hiking", "Camping",
        "Traveling", "Reading", "Dancing", "Karaoke", "Bowling", "Arcade Games", "Escape Rooms",
        "Theme Parks", "Museums", "Art Galleries", "Photography", "Painting", "Drawing", "Writing",
        "Fishing", "Swimming", "Cycling", "Running", "Gym", "Yoga", "Meditation", "Cooking", "Baking",
        "Crafting", "Watching Anime", "Listening to Audiobooks", "VR Gaming", "Magic Shows",
        "Trivia Nights", "Festivals", "Shopping", "Skating", "Surfing", "Concert Streaming"
    ],

    "investiments": [
        "Stocks", "Bonds", "ETFs", "Index Funds", "Real Estate", "REITs", "Cryptocurrency", "Gold",
        "Silver", "Mutual Funds", "Commodities", "Treasury Bills", "High-Yield Savings", "Options",
        "Futures", "Startups", "Angel Investing", "Private Equity", "Venture Capital", "Foreign Exchange",
        "Certificates of Deposit", "Annuities", "Retirement Funds", "Pension Plans", "401k", "IRA",
        "Roth IRA", "Precious Metals", "Art Investment", "Wine Investment", "Collectibles", "Crowdfunding",
        "Peer-to-Peer Lending", "Green Bonds", "Hedge Funds", "Agriculture Funds", "Infrastructure Funds",
        "Dividend Stocks", "Growth Stocks", "Blue Chip Stocks", "Index Tracking", "Emerging Markets",
        "Small Cap Stocks", "Mid Cap Stocks", "Large Cap Stocks", "Biotech Stocks", "Energy Funds",
        "Tech Companies", "Credit Funds"
    ]
}


In [24]:
# Human-readable label for each KMeans cluster id: test_cluster_assigments() prints
# categories[cluster], so position in this list IS the cluster id.
# NOTE(review): this order differs from the key order of initial_data (food first there) and
# looks hand-matched to the ids the fitted model happened to produce. KMeans ids carry no
# inherent meaning, so re-check this mapping after any refit or change to the seed data.
categories = ['enternaiment', 'investiments', 'food']

In [25]:
# Length of one word vector. generate_embeddings() uses it to size the all-zeros vector it
# returns when no word of a sentence is found, so it should match the vectors file that is
# loaded (glove.6B.50d.txt in this notebook).
embedding_dim = 50

In [26]:
def load_glove_embeddings(glove_file):
    """Load GloVe word vectors from a plain-text file into a dictionary.

    Every non-blank line is split on whitespace: the first field is the token,
    the remaining fields are parsed as the components of its vector.

    Args:
        glove_file: Path to the GloVe text file (e.g. ``'glove.6B.50d.txt'``).

    Returns:
        dict mapping each token (str) to a 1-D ``np.float32`` array.

    Raises:
        FileNotFoundError: If ``glove_file`` does not exist.
    """
    if not os.path.exists(glove_file):
        # User-facing message is Portuguese for "File not found"; the accented
        # character is written correctly here (it was previously mojibake).
        raise FileNotFoundError(f'Arquivo não encontrado: {glove_file}')

    glove_embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank / whitespace-only lines (e.g. a trailing newline at the
                # end of the file) carry no vector; skipping them avoids an
                # IndexError on parts[0].
                continue
            word, *values = parts
            glove_embeddings[word] = np.array(values, dtype=np.float32)

    print(f'Carregado {len(glove_embeddings)} vetores de palavras')

    return glove_embeddings

In [27]:
# Read the whole vectors file into an in-memory dict (token -> vector); the recorded run
# below reports 400000 tokens loaded.
glove_embeddings = load_glove_embeddings('glove.6B.50d.txt')

Carregado 400000 vetores de palavras


In [28]:
# Spot-check the loaded table by displaying the raw vector stored for the token 'test'.
glove_embeddings['test']

array([ 0.13175 , -0.25517 , -0.067915,  0.26193 , -0.26155 ,  0.23569 ,
        0.13077 , -0.011801,  1.7659  ,  0.20781 ,  0.26198 , -0.16428 ,
       -0.84642 ,  0.020094,  0.070176,  0.39778 ,  0.15278 , -0.20213 ,
       -1.6184  , -0.54327 , -0.17856 ,  0.53894 ,  0.49868 , -0.10171 ,
        0.66265 , -1.7051  ,  0.057193, -0.32405 , -0.66835 ,  0.26654 ,
        2.842   ,  0.26844 , -0.59537 , -0.5004  ,  1.5199  ,  0.039641,
        1.6659  ,  0.99758 , -0.5597  , -0.70493 , -0.0309  , -0.28302 ,
       -0.13564 ,  0.6429  ,  0.41491 ,  1.2362  ,  0.76587 ,  0.97798 ,
        0.58507 , -0.30176 ], dtype=float32)

In [None]:
def generate_embeddings(sentence, embeddings=None, dim=None):
    """Embed a sentence as the mean of the vectors of its known words.

    The sentence is lower-cased and split on whitespace. Each token is looked
    up as-is first; only if that lookup misses is the token retried with
    leading/trailing punctuation removed (so ``"pizza!"`` still matches
    ``"pizza"``). Tokens that miss both lookups are ignored.

    Args:
        sentence: Text to embed.
        embeddings: Optional token -> vector mapping. Defaults to the
            notebook-level ``glove_embeddings`` table.
        dim: Optional length of the all-zeros fallback vector. Defaults to the
            notebook-level ``embedding_dim``.

    Returns:
        1-D ``np.float64`` array: the mean of the matched word vectors, or an
        all-zeros vector of length ``dim`` when no token matched.
    """
    if embeddings is None:
        embeddings = glove_embeddings

    # Punctuation trimmed from token edges on the fallback lookup only.
    edge_punctuation = '.,;:!?"\'()[]{}'

    valid_vectors = []
    for token in sentence.lower().split():
        if token in embeddings:
            # Exact hit: identical to the previous behaviour.
            valid_vectors.append(embeddings[token])
            continue
        trimmed = token.strip(edge_punctuation)
        if trimmed and trimmed in embeddings:
            valid_vectors.append(embeddings[trimmed])

    if valid_vectors:
        return np.mean(valid_vectors, axis=0).astype(np.float64)

    # Nothing matched: fall back to a zero vector of the configured length.
    return np.zeros(embedding_dim if dim is None else dim, dtype=np.float64)

In [30]:
# Smoke test: embed a short mixed-case sentence (generate_embeddings lower-cases the text
# before looking words up) and print the resulting vector.
sentence = 'I Like'
print(generate_embeddings(sentence))

[ 0.24349499  0.180445   -0.15263149 -0.3475785   0.480075   -0.104065
 -0.54067999 -0.12663999 -0.66784501  0.25340879 -0.19722     0.77772498
 -0.70659    -0.06953     0.88318002  0.49352497  0.28656     0.47338998
 -0.0644935  -0.83103001 -0.50853497  0.534365    0.55148     0.32932502
  0.73118502 -2.16694999 -1.30735004  0.38081998  0.78483003 -1.10350001
  3.35450006  0.63190502 -0.35066849  0.14355101 -0.170057   -0.079615
  0.01633999  0.39409     0.387045   -0.172795   -0.14114714  0.38961101
 -0.1396786   0.50398499  0.46076301  0.31224    -0.17144001 -0.62373996
 -0.3319      0.58652002]


In [31]:
def train_model(categories):
    """Fit a KMeans model on the embedded seed phrases from ``initial_data``.

    Args:
        categories: Sequence of category labels. Only its length is used, as
            the number of clusters to fit.

    Returns:
        The fitted ``KMeans`` estimator.
    """
    n_clusters = len(categories)

    # Flatten every example list into one list of phrases; the dict keys are not used.
    phrases = [phrase for group in initial_data.values() for phrase in group]

    # One embedding row per phrase.
    features = np.array([generate_embeddings(phrase) for phrase in phrases])

    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return KMeans(n_clusters=n_clusters, random_state=42).fit(features)

In [32]:
# Fit the clustering model on the seed phrases, with one cluster per entry in `categories`.
kmeans = train_model(categories)

In [33]:
# Probe the model with a word that is not among the seed phrases.
# NOTE(review): this uses the raw float32 vector straight from the table (a KeyError for an
# unknown word), whereas training rows come from generate_embeddings(), which returns float64.
test = glove_embeddings['banana']
print(kmeans.predict([test]))

[2]


In [34]:
def test_cluster_assigments(model, descriptions):
    """Print the cluster predicted for each description, with its category label.

    For every description this prints the raw prediction array followed by a
    line of the form ``<description> no Cluster: <id> - <label>``, where the
    label is read from the notebook-level ``categories`` list by cluster id.

    Args:
        model: Fitted estimator exposing ``predict`` (here, the KMeans model).
        descriptions: Iterable of text descriptions to classify.
    """
    print('Cluster Assigments:')
    for description in descriptions:
        embedding = generate_embeddings(description)

        # Predict once and reuse the result for both the raw print and the label
        # lookup (previously the same input was sent through predict() twice).
        prediction = model.predict([embedding])
        cluster = prediction[0]
        print(prediction)

        print(f'{description} no Cluster: {cluster} - {categories[cluster]}')

In [35]:
# Display the label list for reference: index in this list = cluster id.
categories

['enternaiment', 'investiments', 'food']

In [36]:
# Inspect the raw vector stored for the token 'pizza'.
glove_embeddings['pizza']

array([ 0.62143  , -0.49645  , -0.69599  ,  0.19473  ,  1.0616   ,
        0.14227  , -0.79524  ,  0.19404  ,  0.0071687,  0.14646  ,
       -0.61261  ,  0.0037311,  0.41935  ,  1.0381   ,  0.16911  ,
       -0.53342  , -0.52508  ,  0.79629  , -0.029128 , -0.44912  ,
        1.0138   , -0.59214  ,  0.17643  ,  1.5506   , -0.96916  ,
       -0.42896  , -0.92664  ,  0.43301  ,  1.2915   , -0.80836  ,
        1.336    ,  0.24572  , -0.11799  ,  2.0015   , -0.27431  ,
        0.17803  , -0.31508  ,  0.84582  ,  0.77419  ,  0.45243  ,
        0.73485  ,  0.44473  , -0.77466  ,  0.43471  ,  0.53486  ,
        0.98216  , -0.70731  , -0.48557  ,  0.16453  ,  0.65013  ],
      dtype=float32)

In [37]:
# Sanity check: for a one-word sentence, generate_embeddings() averages a single vector, so
# its result should equal the raw table entry apart from the float32 -> float64 cast.
# Note: `test` is re-bound here (it previously held the 'banana' vector).
test = glove_embeddings['pizza']

generated = generate_embeddings('pizza')
print(test, '\n', generated)
# Element-wise difference between the raw vector and the generated one.
print(test - generated)

# print(kmeans.predict([test]))

[ 0.62143   -0.49645   -0.69599    0.19473    1.0616     0.14227
 -0.79524    0.19404    0.0071687  0.14646   -0.61261    0.0037311
  0.41935    1.0381     0.16911   -0.53342   -0.52508    0.79629
 -0.029128  -0.44912    1.0138    -0.59214    0.17643    1.5506
 -0.96916   -0.42896   -0.92664    0.43301    1.2915    -0.80836
  1.336      0.24572   -0.11799    2.0015    -0.27431    0.17803
 -0.31508    0.84582    0.77419    0.45243    0.73485    0.44473
 -0.77466    0.43471    0.53486    0.98216   -0.70731   -0.48557
  0.16453    0.65013  ] 
 [ 0.62142998 -0.49645001 -0.69599003  0.19473     1.06159997  0.14227
 -0.79523998  0.19404     0.0071687   0.14646    -0.61260998  0.0037311
  0.41935     1.0381      0.16911    -0.53342003 -0.52508003  0.79628998
 -0.029128   -0.44911999  1.01380002 -0.59214002  0.17643     1.55060005
 -0.96916002 -0.42896    -0.92663997  0.43301001  1.29149997 -0.80835998
  1.33599997  0.24572    -0.11799     2.00149989 -0.27430999  0.17803
 -0.31507999  0.845820

In [38]:
# Display the label list again before checking predictions: index = cluster id.
categories

['enternaiment', 'investiments', 'food']

In [43]:
# Single-word probes. They used to be assigned to `test_descriptions` and then overwritten
# on the next statement without being used; they now live under their own name so the
# list is not silently discarded.
single_word_descriptions = [
    'Pizza', 'banana', 'concert', 'savings'
]

# Full-sentence probes: these are the ones actually sent to the model below.
test_descriptions = [
    'I eat Pizza', 'i play in concert', 'savings for my payroll'
]

test_cluster_assigments(kmeans, test_descriptions)

Cluster Assigments:
[2]
I eat Pizza no Cluster: 2 - food
[0]
i play in concert no Cluster: 0 - enternaiment
[1]
savings for my payroll no Cluster: 1 - investiments


In [44]:
# Persist the fitted KMeans estimator to disk.
# Only the estimator is saved: turning a predicted cluster id back into a label still needs
# the `categories` list, and embedding new text still needs the GloVe table.
joblib.dump(kmeans, 'kmeans_model.pkl')

['kmeans_model.pkl']

In [60]:
import pandas as pd

# Small hand-written sample of labelled descriptions, one (description, value, category)
# tuple per row, used by the groupby examples that follow.
df = pd.DataFrame(
    [
        ("i go to music", 1.0, "enternaiment"),
        ("i invest my payroll", 1.0, "investments"),
        ("i eat banana", 1.0, "food"),
        ("i eat apple", 1.0, "food"),
    ],
    columns=["Description", "Value", "Category"],
)

# Displayed only: the result is not assigned back, so `df` itself is unchanged.
df.reset_index(drop=True)


Unnamed: 0,Description,Value,Category
0,i go to music,1.0,enternaiment
1,i invest my payroll,1.0,investments
2,i eat banana,1.0,food
3,i eat apple,1.0,food


In [70]:
# Collect, for every category, the list of descriptions that carry that label.
df.groupby('Category')['Description'].agg(list)

Category
enternaiment                [i go to music]
food            [i eat banana, i eat apple]
investments           [i invest my payroll]
Name: Description, dtype: object

In [68]:
# Number of non-null descriptions per category, as a one-column DataFrame.
df.groupby('Category')[['Description']].count()

Unnamed: 0_level_0,Description
Category,Unnamed: 1_level_1
enternaiment,1
food,2
investments,1
