In [50]:
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [51]:
import pickle
import fasttext
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

## load data

In [52]:
train = pd.read_csv('train.csv', index_col=False)
test = pd.read_csv('test.csv', index_col=False)
name_books = pd.read_csv('books.csv', index_col=False)
medlib_df = pd.read_csv('medlib.csv')

###Fasttext train

In [53]:
fasttext_filename_train = 'unsupervised_fasttext_train.txt'

In [54]:
# Remove users with only 1 book

count_books = medlib_df.groupby('id').count()
delete_users = count_books[count_books.id_book <= 1].index.tolist()
medlib_df = medlib_df[~medlib_df.id.isin(delete_users)]

In [55]:
name_books.fillna('', inplace=True)
name_books = name_books[name_books.id_book != '']
name_books = name_books[name_books.name_book != '']
name_books.name_book = name_books.name_book.apply(lambda x: x.lower())

In [56]:
books = []
for user_id in tqdm(medlib_df.id.unique()):
    book = ''
    for book_id in medlib_df[medlib_df.id == user_id].id_book.values:
        book_name = name_books[name_books.id_book == book_id].name_book.values
        if len(book_name) > 0:
            book += book_name[0].replace(' ', '').replace('[текст]', '').replace(':', '')
            book += ' '
    books.append(book)

100%|██████████| 6919/6919 [02:23<00:00, 48.27it/s]


In [57]:
with open(fasttext_filename_train, 'w', encoding="utf-8") as f:
    for line in tqdm(books):
        f.write(line)
        f.write('\n')

100%|██████████| 6919/6919 [00:00<00:00, 1028471.82it/s]


In [58]:
model = fasttext.train_unsupervised(fasttext_filename_train,
                                    dim=20)

In [59]:
model.save_model('ftext_unsupervised_20.bin')

## Preprocessing

In [60]:
name_books['preprocessed'] = name_books.name_book.apply(lambda x: ''.join(filter(str.isalpha, x.replace('[текст]', '').split(':')[0].split('.')[-1].strip())))

In [61]:
test = test.sort_values(by=['id', 'date'])

In [62]:
true_test = test.groupby('id').last().reset_index()
history_test = test[~test.apply(tuple,1).isin(true_test.apply(tuple,1))]

In [63]:
test = test.groupby('id').last().reset_index()

In [64]:
%%time
emb_train = []
users = train.id.unique()
for user in users:
    emb_train.append(pd.merge(train[train.id == user], 
                              name_books[['id_book', 'preprocessed']], 
                              left_on='id_book', right_on='id_book', 
                              how='inner')['preprocessed'].unique().tolist())

CPU times: user 1min 22s, sys: 223 ms, total: 1min 22s
Wall time: 1min 28s


In [65]:
%%time
emb_test = []
users = test.id.unique()
for user in users:
    emb_test.append(pd.merge(test[test.id == user], 
                             name_books[['id_book', 'preprocessed']], 
                              left_on='id_book', right_on='id_book', 
                             how='inner')['preprocessed'].unique().tolist())

CPU times: user 4.4 s, sys: 21.9 ms, total: 4.42 s
Wall time: 4.43 s


In [66]:
%%time
emb_hist = []
users = test.id.unique()
for user in users:
    emb_hist.append(pd.merge(history_test[history_test.id == user], 
                             name_books[['id_book', 'preprocessed']], 
                             left_on='id_book', right_on='id_book', 
                             how='inner')['preprocessed'].unique().tolist())

CPU times: user 4 s, sys: 15.8 ms, total: 4.02 s
Wall time: 4 s


In [67]:
model = fasttext.load_model('ftext_unsupervised_20.bin')



In [68]:
%%time
X_train = [model.get_sentence_vector(' '.join(x)) for x in emb_train]
X_test = [model.get_sentence_vector(' '.join(x)) for x in emb_test]
X_hist = [model.get_sentence_vector(' '.join(x)) for x in emb_hist]

CPU times: user 1.11 s, sys: 75.7 ms, total: 1.19 s
Wall time: 1.11 s


## Clustering on embedding fasttext

In [70]:
def apk(actual, predicted, k=10):
    if len(predicted)>k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
    if not actual:
        return 0.0
    return score

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [71]:
def map_cluster(kmeans_df, df, test, history, topn:int=10):
    users = kmeans_df.category_id.unique()
    score = 0
    actuals = []
    preds = []
    for user in users:
        cluster = kmeans_df[kmeans_df.category_id == user]['cluster'].unique()[0]
        similar_users = kmeans_df[kmeans_df.cluster == cluster]['category_id'].unique()
        hist_book = history[history.id == user]['id_book'].unique()
        
        preds.append(df[(df.id.isin(similar_users)) & (df.id != user) & ~(
            df.id_book.isin(hist_book))]['id_book'].value_counts()[:topn].index.tolist())
        actuals.append(test[test.id == user]['id_book'].values.tolist())

    return actuals, preds

In [78]:
def hit_rate_kmean(kmeans_df, df, test, history, topn:int=10):
    users = kmeans_df.category_id.unique()
    score = 0
    for user in users:
        cluster = kmeans_df[kmeans_df.category_id == user]['cluster'].unique()[0]
        similar_users = kmeans_df[kmeans_df.cluster == cluster]['category_id'].unique()
        hist_book = history[history.id == user]['id_book'].unique()
        rec_books = df[(df.id.isin(similar_users)) & (df.id != user) & ~(
            df.id_book.isin(hist_book))]['id_book'].value_counts()[:topn].index.tolist()
        true_book = test[test.id == user]['id_book'].values[0]
        if true_book in rec_books:
            score += 1
    hite_rate = score / users.shape[0]
    return hite_rate

In [73]:
def find_best_k(X_train):
    '''
    обучаем kmeans с разным числом кластеров возвраем с наибольшим значение коэфициента силуэта
    '''
    metrics = {}
    for i in range(2, 20):
        kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
        kmeans.fit(X_train)
        cluster_labels = kmeans.fit_predict(X_train)
        silhouette_avg = silhouette_score(X_train, cluster_labels)
        metrics[i] = silhouette_avg
        print(f'For {i} clusters: metric: {silhouette_avg:.3f}')
    best_k = max(metrics, key=metrics.get)
    print(f'\n\tBest number of clusters:{best_k}\n\tDone')
    return best_k

In [74]:
%%time
clusters = find_best_k(X_train)

For 2 clusters: metric: 0.257
For 3 clusters: metric: 0.245
For 4 clusters: metric: 0.128
For 5 clusters: metric: 0.134
For 6 clusters: metric: 0.106
For 7 clusters: metric: 0.111
For 8 clusters: metric: 0.111
For 9 clusters: metric: 0.097
For 10 clusters: metric: 0.052
For 11 clusters: metric: 0.054
For 12 clusters: metric: 0.097
For 13 clusters: metric: 0.089
For 14 clusters: metric: 0.053
For 15 clusters: metric: 0.082
For 16 clusters: metric: 0.052
For 17 clusters: metric: 0.053
For 18 clusters: metric: 0.058
For 19 clusters: metric: 0.056

	Best number of clusters:2
	Done
CPU times: user 1min 18s, sys: 26.4 s, total: 1min 44s
Wall time: 1min 4s


---

## Save models

In [80]:
with open("kmeans.pkl", "wb") as f:
    pickle.dump(y_km, f)

---