In [23]:
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
import pickle
import fasttext
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

## Load data and preprocessing

In [25]:
train = pd.read_csv('train.csv', index_col=False)
test = pd.read_csv('test.csv', index_col=False)
name_books = pd.read_csv('books.csv', index_col=False)
medlib_df = pd.read_csv('medlib.csv')

###Fasttext train

In [26]:
fasttext_filename_train = 'unsupervised_fasttext_train.txt'

In [27]:
# Remove users with only 1 book

count_books = medlib_df.groupby('id').count()
delete_users = count_books[count_books.id_book <= 1].index.tolist()
medlib_df = medlib_df[~medlib_df.id.isin(delete_users)]

In [28]:
name_books.fillna('', inplace=True)
name_books = name_books[name_books.id_book != '']
name_books = name_books[name_books.name_book != '']
name_books.name_book = name_books.name_book.apply(lambda x: x.lower())

In [29]:
books = []
for user_id in tqdm(medlib_df.id.unique()):
    book = ''

    for book_id in medlib_df[medlib_df.id == user_id].id_book.values:
        book_name = name_books[name_books.id_book == book_id].name_book.values
        if len(book_name) > 0:
            book += book_name[0].replace(' ', '').replace('[текст]', '').replace(':', '')
            book += ' '
            
    books.append(book)

100%|██████████| 6919/6919 [02:08<00:00, 53.71it/s]


In [30]:
with open(fasttext_filename_train, 'w', encoding="utf-8") as f:
    for line in tqdm(books):
        f.write(line)
        f.write('\n')

100%|██████████| 6919/6919 [00:00<00:00, 57741.91it/s]


In [31]:
model = fasttext.train_unsupervised(fasttext_filename_train,
                                    dim=20)

In [32]:
model.save_model('ftext_unsupervised_20.bin')

## Preprocessing data

In [33]:
name_books['preprocessed'] = name_books.name_book.apply(lambda x: ''.join(filter(str.isalpha, x.replace('[текст]', '').split(':')[0].split('.')[-1].strip())))

In [34]:
test = test.sort_values(by=['id', 'date'])

In [35]:
true_test = test.groupby('id').last().reset_index()
history_test = test[~test.apply(tuple,1).isin(true_test.apply(tuple,1))]

In [36]:
test = test.groupby('id').last().reset_index()

In [37]:
emb_train = []
emb_test = []
users_train = train.id.unique()
for user in tqdm(users_train):
    emb_train.append(pd.merge(train[train.id == user], 
                              name_books[['id_book', 'preprocessed']], 
                              left_on='id_book', right_on='id_book', 
                              how='inner')['preprocessed'].unique().tolist())
  
users_test = test.id.unique()
for user in tqdm(users_test):
    emb_test.append(pd.merge(train[train.id == user], 
                              name_books[['id_book', 'preprocessed']], 
                              left_on='id_book', right_on='id_book', 
                              how='inner')['preprocessed'].unique().tolist())

100%|██████████| 6712/6712 [01:16<00:00, 87.68it/s]
100%|██████████| 1342/1342 [00:15<00:00, 87.98it/s]


In [38]:
model = fasttext.load_model('ftext_unsupervised_20.bin')



In [39]:
%%time
X_train = [model.get_sentence_vector(' '.join(x)) for x in emb_train]
X_test = [model.get_sentence_vector(' '.join(x)) for x in emb_test]

CPU times: user 1.29 s, sys: 75.9 ms, total: 1.36 s
Wall time: 1.24 s


## Clustering on embedding fasttext

In [40]:
def apk(actual, predicted, k=10):
    """
    Average precison at k
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [41]:
def map_cluster(kmeans_df, df, test, history, topn:int=10):
    """
    Mean average precison for cluster
    """
    users = kmeans_df.category_id.unique()
    score = 0

    actuals = []
    preds = []

    for user in users:
        cluster = kmeans_df[kmeans_df.category_id == user]['cluster'].unique()[0]
        similar_users = kmeans_df[kmeans_df.cluster == cluster]['category_id'].unique()
        hist_book = history[history.id == user]['id_book'].unique()
        
        preds.append(df[(df.id.isin(similar_users)) & (df.id != user) & ~(
            df.id_book.isin(hist_book))]['id_book'].value_counts()[:topn].index.tolist())
        actuals.append(test[test.id == user]['id_book'].values.tolist())

    return actuals, preds

In [42]:
def hit_rate_kmean(kmeans_df, df, test, history, topn:int=10):
    """
    Hit rate for K-Means
    """
    users = kmeans_df.category_id.unique()
    score = 0

    for user in users:
        cluster = kmeans_df[kmeans_df.category_id == user]['cluster'].unique()[0]
        similar_users = kmeans_df[kmeans_df.cluster == cluster]['category_id'].unique()
        hist_book = history[history.id == user]['id_book'].unique()

        rec_books = df[(df.id.isin(similar_users)) & (df.id != user) & ~(
            df.id_book.isin(hist_book))]['id_book'].value_counts()[:topn].index.tolist()
        true_book = test[test.id == user]['id_book'].values[0]

        if true_book in rec_books:
            score += 1

    hite_rate = score / users.shape[0]
    return hite_rate

In [43]:
def find_best_k(X_train):
    '''
    Train K-Means with several clusters
    Returns one with best silhouette coefficient
    '''
    metrics = {}
    for i in range(2, 20):
        kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
        kmeans.fit(X_train)

        cluster_labels = kmeans.fit_predict(X_train)
        silhouette_avg = silhouette_score(X_train, cluster_labels)
        
        metrics[i] = silhouette_avg
        print(f'For {i} clusters: metric: {silhouette_avg:.3f}')
        
    best_k = max(metrics, key=metrics.get)
    print(f'\n\tBest number of clusters:{best_k}\n\tDone')
    return best_k

In [44]:
clusters = find_best_k(X_train)

For 2 clusters: metric: 0.485
For 3 clusters: metric: 0.364
For 4 clusters: metric: 0.398
For 5 clusters: metric: 0.376
For 6 clusters: metric: 0.402
For 7 clusters: metric: 0.366
For 8 clusters: metric: 0.299
For 9 clusters: metric: 0.269
For 10 clusters: metric: 0.272
For 11 clusters: metric: 0.239
For 12 clusters: metric: 0.247
For 13 clusters: metric: 0.246
For 14 clusters: metric: 0.247
For 15 clusters: metric: 0.246
For 16 clusters: metric: 0.251
For 17 clusters: metric: 0.250
For 18 clusters: metric: 0.239
For 19 clusters: metric: 0.239

	Best number of clusters:2
	Done


## Save models

In [45]:
with open("clusters.pkl", "wb") as f:
    pickle.dump(clusters, f)