In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from tqdm import tqdm
import pandas as pd
import numpy as np
import joblib as joblib
import csv
from sklearn.metrics import silhouette_score
from services.database import ClusterFeatureDatabaseService

In [4]:
def dataLoader(path):
    """Read the CSV file at ``path`` into a DataFrame, decoding it as ISO-8859-1."""
    return pd.read_csv(path, encoding="ISO-8859-1")

In [5]:
def describe_cluster(words_list, model):
    """Print each cluster label followed by the items assigned to it.

    Args:
        words_list: The clustered items, in the same order as ``model.labels_``.
        model: Any fitted object exposing a ``labels_`` sequence.

    Returns:
        None. The report is written to stdout.
    """
    labels = model.labels_
    clusters = pd.DataFrame(list(zip(words_list, labels)), columns=['title', 'cluster'])
    # Raise the row limit only while printing, so the global pandas display
    # configuration is left untouched for the rest of the notebook.
    with pd.option_context('display.max_rows', 4000):
        for label in np.unique(labels).tolist():
            print(label)
            print(clusters.loc[clusters['cluster'] == label])
            print('\n')

In [6]:
def cluster_text(text, k, random_state=None, model_path='model.pkl',
                 vectorizer_path='vectorizer.pkl'):
    """Cluster documents with TF-IDF features and KMeans, and persist the result.

    Args:
        text: Iterable of strings to cluster.
        k: Number of clusters to fit.
        random_state: Optional seed forwarded to KMeans for reproducible runs.
        model_path: Where the fitted KMeans model is pickled.
        vectorizer_path: Where the fitted TfidfVectorizer is pickled.

    Returns:
        The fitted KMeans model. The silhouette score is printed to stdout.
    """
    # The string 'english' selects scikit-learn's built-in stop-word list;
    # the set {'english'} does not.
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(text)

    model = KMeans(n_clusters=k, init='k-means++', max_iter=200, n_init=10,
                   random_state=random_state)
    model.fit(X)

    joblib.dump(model, model_path)
    # The model is only meaningful together with the vocabulary and IDF weights
    # it was trained on, so the fitted vectorizer is saved as well.
    joblib.dump(vectorizer, vectorizer_path)

    score = silhouette_score(X, model.labels_, metric='euclidean')
    print("Silhouette score: {:.2f}".format(score))

    return model
    

In [7]:
# Distinct feature strings in first-seen order; filled by groupDataSet.
arrayOfWords = []


def groupDataSet(df, k=1132):
    """Collect the distinct values of ``df['feature']`` and cluster them.

    New values are appended to the module-level ``arrayOfWords`` list in
    first-seen order, so the labels of the returned model line up with
    ``arrayOfWords`` by position. Matching is exact (case-sensitive), and
    missing values are treated as the empty string.

    Args:
        df: DataFrame with a ``feature`` column. It is not modified.
        k: Number of clusters passed to ``cluster_text``.

    Returns:
        Whatever ``cluster_text(arrayOfWords, k)`` returns.
    """
    # Work on a filled copy of the column so the caller's frame is untouched.
    features = df['feature'].fillna('')

    # A set makes the "already added?" check O(1) instead of a list scan per row.
    seen = set(arrayOfWords)
    for feature in tqdm(features, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}',
                        total=len(features)):
        if feature not in seen:
            seen.add(feature)
            arrayOfWords.append(feature)

    return cluster_text(arrayOfWords, k)

In [8]:
def predictCluster(text:str, model_path='model.pkl', vectorizer_path='vectorizer.pkl'):
    """Predict the cluster of a single piece of text with the saved model.

    The text must be vectorized with the vectorizer that was fitted at training
    time. Fitting a fresh TfidfVectorizer on the query alone would produce a
    different vocabulary, so the features would not match what the model expects.

    Args:
        text: The text to classify.
        model_path: Path of the pickled clustering model.
        vectorizer_path: Path of the pickled, already fitted vectorizer.

    Returns:
        The result of ``model.predict`` for the one-row feature matrix.

    Raises:
        FileNotFoundError: If no fitted vectorizer exists at ``vectorizer_path``.
    """
    # NOTE(security): joblib.load unpickles its input; only load trusted files.
    model = joblib.load(model_path)
    try:
        vectorizer = joblib.load(vectorizer_path)
    except FileNotFoundError as exc:
        raise FileNotFoundError(
            "No fitted vectorizer found at {!r}; prediction needs the vectorizer "
            "that was fitted together with the model.".format(vectorizer_path)
        ) from exc

    # transform (not fit_transform) reuses the training vocabulary and IDF weights.
    X = vectorizer.transform([text])
    return model.predict(X)
    



In [9]:
# Load the feature table and fit the clustering model on its distinct features.
data = dataLoader("data/final_features.csv")
model = groupDataSet(data)

100%|██████████| 95281/95281 [00:03<00:00, 29360.75it/s]


Silhouette score: 0.10


In [10]:
def write_to_csv(datas, path, encoding=None):
    """Write one CSV row per entry of ``datas``: its index and its comma-joined items.

    Args:
        datas: Sequence of string sequences; entry ``i`` becomes the row
            ``[i, ','.join(entry)]``.
        path: Destination file, overwritten if it already exists.
        encoding: Optional text encoding for the file (platform default if None).
    """
    # newline='' is required by the csv module: without it the writer's '\r\n'
    # terminator is translated again and yields blank lines on Windows.
    with open(path, 'w', newline='', encoding=encoding) as f:
        writer = csv.writer(f)
        writer.writerows([index, ','.join(items)] for index, items in enumerate(datas))

In [11]:
# Group the clustered features by label: out[c] holds every feature whose
# position in arrayOfWords received label c from the model.
out = [[] for _ in range(max(model.labels_) + 1)]
for label, word in zip(model.labels_, arrayOfWords):
    out[label].append(word)

# Store every (cluster, feature) pair, reusing one service object instead of
# constructing a new one for each word.
service = ClusterFeatureDatabaseService()
for cluster, words in enumerate(out):
    for word in words:
        service.put({
            "cluster": cluster,
            "feature": word,
        })

print(out)

Connected to Database Service
[['hushpuppy'], ['addis ababa bole international airport', 'addis ababa', 'adiss ababa'], ['chicken breast', 'chicken stampi', 'chicken cordon bleu', 'chicken wraps', 'chicken', 'chicken wings', 'chicken legs', 'chicken overload', 'chicken arosto', 'chicken breyani', 'chicken fajita', 'chilli chicken', 'chicken junkie', 'chicken bites', 'butter chicken'], ['staple food'], ['pasta', 'pasta pomodoro', 'meat ball pasta', 'pasta al tuno', 'pasta sampler'], ['coffee cup', 'cup'], ['cheese cake', 'chocolate cake', 'chocolate cheese cake', 'cheese cake baby', 'chocolate chiffon cheese cake'], ['addisabeba', 'ahun addisabeba'], ['icecream', 'icecream ethiopian'], ['cowboy burger'], ['sunset astonishing view', 'sunset view'], ['hawassa myvibr myvibeschallenge', 'myvibeschallenge', 'myvibe myvibeschallenge homemade'], ['highball glass'], ['birthday gift', 'gift'], ['forest', 'green forest'], ['lion', 'masai lion', 'lion of judah'], ['gonder', 'ras gayent gonder', 'g

In [12]:
# Read back everything the service returns and echo each record twice:
# once as-is, once as "<cluster> <feature>".
cfs = ClusterFeatureDatabaseService().get_all()
for record in cfs:
    print(record)
    cluster_id, feature_name = record['cluster'], record['feature']
    print(cluster_id, feature_name)

{'cluster': 166, 'feature': 'hikingadventures greathikersethiopia'}
166 hikingadventures greathikersethiopia
{'cluster': 37, 'feature': 'awhile'}
37 awhile
{'cluster': 330, 'feature': 'ahun myvibe'}
330 ahun myvibe
{'cluster': 580, 'feature': 'nb'}
580 nb
{'cluster': 653, 'feature': 'trianon'}
653 trianon
{'cluster': 365, 'feature': 'test'}
365 test
{'cluster': 37, 'feature': 'falafel'}
37 falafel
{'cluster': 442, 'feature': 'wire fencing'}
442 wire fencing
{'cluster': 464, 'feature': 'clothes hanger'}
464 clothes hanger
{'cluster': 37, 'feature': 'cool'}
37 cool
{'cluster': 649, 'feature': 'video'}
649 video
{'cluster': 37, 'feature': 'keakibrom'}
37 keakibrom
{'cluster': 776, 'feature': 'commercial building'}
776 commercial building
{'cluster': 90, 'feature': 'ani cafe cake'}
90 ani cafe cake
{'cluster': 274, 'feature': 'tuna half veggie'}
274 tuna half veggie
{'cluster': 433, 'feature': 'sun rise'}
433 sun rise
{'cluster': 37, 'feature': 'niko'}
37 niko
{'cluster': 1108, 'feature': 

In [13]:
# Display the first record returned by get_all() as a quick look at its shape.
cfs[0]

{'cluster': 166, 'feature': 'hikingadventures greathikersethiopia'}