In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [2]:
# Toy English corpus: four sentences about cats/kittens and four about
# Google products, used to demonstrate two-cluster text clustering.
documents = [
    "This little kitty came to play when I was eating at a restaurant.",
    "Merley has the best squooshy kitten belly.",
    "Google Translate app is incredible.",
    "If you open 100 tab in google you get a smiley face.",
    "Best cat photo I've ever taken.",
    "Climbing ninja cat.",
    "Impressed with google map feedback.",
    "Key promoter extension for Google Chrome.",
]

In [3]:
# Learn the vocabulary and IDF weights from the corpus (dropping English stop
# words) and encode every document as one row of the TF-IDF matrix X.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(raw_documents=documents)

In [4]:
# Number of clusters to look for in the corpus.
true_k = 2

# k-means++ seeding is randomised and n_init=1 means a single draw decides the
# outcome, so pin random_state: otherwise the cluster ids (and the top-term
# listings derived from them) can change between kernel restarts.
model = KMeans(
    n_clusters=true_k,
    init='k-means++',
    max_iter=100,
    n_init=1,
    random_state=42,
)
model.fit(X)

In [5]:
# Heading printed ahead of the per-cluster term listings produced by later cells.
print("Top terms per cluster:")

Top terms per cluster:


In [6]:
# For each centroid (one row per cluster), rank the feature indices from the
# largest centroid weight to the smallest: argsort sorts ascending along the
# feature axis, so every row is then reversed.
order_centroids = model.cluster_centers_.argsort(axis=1)[:, ::-1]
order_centroids

array([[12, 11, 19,  1, 13, 14, 31, 15,  9,  6, 25, 22, 10, 27, 29,  0,
         7,  5,  8,  4,  3,  2, 32, 17, 18, 20, 21, 23, 24, 26, 28, 30,
        16],
       [ 5,  3,  7, 21, 32, 23, 30,  2, 20, 16, 28, 18, 17,  4, 26, 24,
         8, 22,  9,  1, 29, 27,  6, 11, 10, 12, 13, 14, 15, 31, 19, 25,
         0]])

In [10]:
# Vocabulary array; its positions are the feature indices stored in
# order_centroids.
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
    # One print call per cluster: the heading first, then the ten
    # highest-weighted terms, each on its own line behind a single space.
    print(f"Cluster {i}:", *terms[order_centroids[i, :10]], sep="\n ")

terms

Cluster 0:
 google
 feedback
 map
 app
 impressed
 incredible
 translate
 key
 extension
 chrome
Cluster 1:
 cat
 best
 climbing
 ninja
 ve
 photo
 taken
 belly
 merley
 kitten


array(['100', 'app', 'belly', 'best', 'came', 'cat', 'chrome', 'climbing',
       'eating', 'extension', 'face', 'feedback', 'google', 'impressed',
       'incredible', 'key', 'kitten', 'kitty', 'little', 'map', 'merley',
       'ninja', 'open', 'photo', 'play', 'promoter', 'restaurant',
       'smiley', 'squooshy', 'tab', 'taken', 'translate', 've'],
      dtype=object)

In [11]:
# List the ten highest-weighted terms of every cluster, followed by a rule.
# The trailing commas that used to follow each print() call were Python 2
# newline-suppression leftovers; under Python 3 they only built and discarded
# one-element tuples, so they are removed here.
for i in range(true_k):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:
        print(f" {terms[ind]}")
    print('------------------------------------------')

Cluster 0:
 google
 feedback
 map
 app
 impressed
 incredible
 translate
 key
 extension
 chrome
------------------------------------------
Cluster 1:
 cat
 best
 climbing
 ninja
 ve
 photo
 taken
 belly
 merley
 kitten
------------------------------------------


In [12]:
# Encode an unseen sentence with the already-fitted vectorizer, then ask the
# fitted model which cluster it belongs to.
Y = vectorizer.transform(raw_documents=["chrome browser to open."])
prediction = model.predict(Y)
print(prediction)

[0]


In [13]:
# Second unseen sentence, this one mentioning a cat: encode it with the fitted
# vectorizer and print the cluster id the model assigns.
Y = vectorizer.transform(raw_documents=["My cat is hungry."])
prediction = model.predict(Y)
print(prediction)


[1]


In [14]:
# Toy Arabic corpus: three sentences mentioning the restaurant ("المطعم") and
# three mentioning the internet ("الانترنت").
# NOTE: this rebinds `documents`, replacing the English corpus defined earlier.
documents = [
    "ذهبت الي المطعم البارحة",
    "انقطع عني اتصال الانترنت",
    "المطعم يقدم وجبات سيئة",
    "الانترنت اصبح لغة العصر",
    "وجبات المطعم مستواها ردئ",
    "اتصال الانترنت لازال منقطع",
]

In [15]:
# Fresh vectorizer with default settings (no stop-word list): learn the
# vocabulary and IDF weights from the current corpus and encode it as X.
# NOTE: rebinds `vectorizer` and `X` from the earlier English example.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=documents)

In [16]:
# Number of clusters to look for in the corpus.
true_k = 2

# k-means++ seeding is randomised and n_init=1 means a single draw decides the
# outcome, so pin random_state: otherwise the cluster ids (and the top-term
# listings derived from them) can change between kernel restarts.
model = KMeans(
    n_clusters=true_k,
    init='k-means++',
    max_iter=100,
    n_init=1,
    random_state=42,
)
model.fit(X)

In [None]:
# Heading printed ahead of the per-cluster term listings produced by later cells.
print("Top terms per cluster:")

Top terms per cluster:


In [18]:
# For each centroid (one row per cluster), rank the feature indices from the
# largest centroid weight to the smallest: argsort sorts ascending along the
# feature axis, so every row is then reversed.
order_centroids = model.cluster_centers_.argsort(axis=1)[:, ::-1]
order_centroids

array([[ 5, 16, 17,  9, 14, 10,  3,  6,  8, 11,  7, 12, 13,  4, 15,  2,
         1,  0],
       [ 2,  0, 15, 12, 11,  7,  1, 13,  4,  5,  3, 17,  6, 16,  9, 10,
        14,  8]])

In [23]:
# Vocabulary of the fitted vectorizer; positions in this array are the feature
# indices that order_centroids refers to. The bare name on the last line
# displays the array as the cell output.
terms = vectorizer.get_feature_names_out()
terms

array(['اتصال', 'اصبح', 'الانترنت', 'البارحة', 'العصر', 'المطعم', 'الي',
       'انقطع', 'ذهبت', 'ردئ', 'سيئة', 'عني', 'لازال', 'لغة', 'مستواها',
       'منقطع', 'وجبات', 'يقدم'], dtype=object)

In [24]:
# List the ten highest-weighted terms of every cluster, followed by a rule.
# The trailing commas that used to follow each print() call were Python 2
# newline-suppression leftovers; under Python 3 they only built and discarded
# one-element tuples, so they are removed here.
for i in range(true_k):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:
        print(f" {terms[ind]}")
    print('------------------------------------------')

# Blank gap (print adds its own newline after the "\n"), then the heading for
# the prediction cells that follow.
print("\n")
print("Prediction")

Cluster 0:
 المطعم
 وجبات
 يقدم
 ردئ
 مستواها
 سيئة
 البارحة
 الي
 ذهبت
 عني
------------------------------------------
Cluster 1:
 الانترنت
 اتصال
 منقطع
 لازال
 عني
 انقطع
 اصبح
 لغة
 العصر
 المطعم
------------------------------------------


Prediction


In [25]:
# Unseen sentence about restaurant meals. The word for "restaurant" previously
# had two letters transposed, so it did not match the vocabulary term learned
# from the corpus and was dropped as out-of-vocabulary by transform(); with
# the spelling fixed, both corpus words in the query contribute to the vector.
Y = vectorizer.transform(["وجبات المطعم صارت افضل"])
prediction = model.predict(Y)
print(prediction)

[0]


In [26]:
# Unseen sentence whose last word is the corpus term for "internet": encode it
# with the fitted vectorizer and print the cluster id the model assigns.
Y = vectorizer.transform(raw_documents=["ابن عمي ليس لديه اشتراك في الانترنت"])
prediction = model.predict(Y)
print(prediction)


[1]
