In [7]:
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(device)


cuda


In [8]:
import json
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer
import stanza
from dataclasses import dataclass
from typing import List, Dict, Tuple
from interests_embeddings import ie
# Expose the imported embedding table under a descriptive module-level name;
# calculate_interest_similarity reads it through this alias.
# NOTE(review): presumably maps interest-id strings to embedding vectors, judging by
# how get_user_embeddings indexes it — confirm against interests_embeddings.
interest_embeddings = ie

# Russian stanza pipeline limited to the tokenizer and the named-entity recognizer.
nlp = stanza.Pipeline('ru', processors='tokenize,ner')

2024-12-24 18:13:51 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 13.6MB/s]                    
2024-12-24 18:13:52 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| ner       | wikiner   |

2024-12-24 18:13:52 INFO: Using device: cuda
2024-12-24 18:13:52 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-12-24 18:13:52 INFO: Loading: ner
2024-12-24 18:13:54 INFO: Done loading processors!


In [15]:
# Text encoder used by extract_entities_and_embeddings to embed entity mentions.
# NOTE(review): "DeepPavlov/rubert-base-cased" looks like a plain BERT checkpoint rather
# than a sentence-embedding model — confirm the pooling SentenceTransformer applies
# to it is what is intended.
model = SentenceTransformer("DeepPavlov/rubert-base-cased")
# Alternative checkpoint, currently disabled:
#model = SentenceTransformer('sberbank-ai/sbert_large_nlu_ru')


Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s][A
Downloading:   0%|          | 111k/714M [00:00<10:58, 1.08MB/s][A
Downloading:   0%|          | 520k/714M [00:00<04:12, 2.83MB/s][A
Downloading:   0%|          | 1.25M/714M [00:00<02:42, 4.40MB/s][A
Downloading:   0%|          | 2.43M/714M [00:00<02:04, 5.72MB/s][A
Downloading:   1%|          | 4.81M/714M [00:00<01:04, 11.1MB/s][A
Downloading:   1%|          | 6.00M/714M [00:00<01:06, 10.7MB/s][A
Downloading:   1%|          | 7.31M/714M [00:00<01:02, 11.4MB/s][A
Downloading:   1%|          | 8.50M/714M [00:00<01:01, 11.5MB/s][A
Downloading:   1%|▏         | 9.68M/714M [00:01<01:09, 10.1MB/s][A
Downloading:   2%|▏         | 11.3M/714M [00:01<00:59, 11.9MB/s][A
Downloading:   2%|▏         | 12.6M/714M [00:01<00:59, 11.8MB/s][A
Downloading:   2%|▏         | 13.8M/714M [00:01<01:08, 10.3MB/s][A
Downloading:   2%|▏         | 15.5M/714M [00:01<00:58, 12.0MB/s][A
Downloading:   2%|▏         | 16.8M/714M [00:01<00:58, 11.

In [16]:
@dataclass
class User:
    """Profile of a single user that can be compared with other profiles."""
    # Identifier; compare_user_with_group uses it to skip comparing a user with itself.
    user_id: int
    # Free-text self-description; calculate_description_similarity runs NER over it.
    about_me: str
    # Ids of the chosen interests; get_user_embeddings converts each with str() before lookup.
    selected_interests: List[int]

def get_user_embeddings(user_selected_interests: List[int], embeddings: Dict[str, List[float]]) -> np.ndarray:
    """Collect the embedding vectors of the given interests into one array.

    Args:
        user_selected_interests: Interest ids; each is converted with ``str()``
            to form the lookup key.
        embeddings: Mapping from interest-id string to its embedding vector.

    Returns:
        ``np.array`` built from the looked-up vectors, in input order.

    Raises:
        KeyError: If an interest id has no entry in ``embeddings``.
    """
    vectors = []
    for interest_id in user_selected_interests:
        lookup_key = str(interest_id)
        vectors.append(embeddings[lookup_key])
    return np.array(vectors)

def calculate_interest_similarity(user1_selected_interests: List[int], user2_selected_interests: List[int]) -> float:
    """Average cosine similarity over every cross pair of the two users' interests.

    Each interest id is resolved to its vector in the module-level
    ``interest_embeddings`` table; the score is the mean of the cosine
    similarity between every interest of user 1 and every interest of user 2.

    Args:
        user1_selected_interests: Interest ids of the first user.
        user2_selected_interests: Interest ids of the second user.

    Returns:
        The mean pairwise cosine similarity as a plain ``float``, or ``0.0``
        when either user has no interests (there are no pairs to average).

    Raises:
        KeyError: If an interest id is missing from ``interest_embeddings``.
    """
    # An empty interest list would otherwise produce a 1-D empty array that the
    # similarity routine rejects; treat "no pairs" as zero similarity, matching
    # what calculate_description_similarity does when it finds no pairs.
    if len(user1_selected_interests) == 0 or len(user2_selected_interests) == 0:
        return 0.0

    user1_embeddings = get_user_embeddings(user1_selected_interests, interest_embeddings)
    user2_embeddings = get_user_embeddings(user2_selected_interests, interest_embeddings)

    # cosine_similarity L2-normalizes the rows itself and returns the whole
    # (n_user1, n_user2) matrix in one call, so neither a separate normalize()
    # pass nor a Python-level loop over pairs is needed.
    pairwise = cosine_similarity(user1_embeddings, user2_embeddings)
    return float(pairwise.mean())

def extract_entities_and_embeddings(text: str) -> Tuple[Dict[str, List[str]], Dict[str, List[np.ndarray]]]:
    """Find named entities in ``text`` and embed each one.

    The text is processed with the module-level ``nlp`` pipeline. Every entity
    whose type is one of the tracked labels is recorded together with the
    vector returned by the module-level ``model`` for its surface text.

    Args:
        text: Text to analyse.

    Returns:
        A pair ``(entities, embeddings)`` of dicts keyed by entity label. For
        each label, ``entities[label]`` holds the entity texts and
        ``embeddings[label]`` the matching vectors, index-aligned. Every
        tracked label is present as a key even when nothing was found.
    """
    tracked_labels = (
        "PER", "ORG", "LOC", "MISC", "GPE", "FAC",
        "NORP", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE",
    )
    parsed = nlp(text)
    entities = {label: [] for label in tracked_labels}
    embeddings = {label: [] for label in tracked_labels}

    for sent in parsed.sentences:
        for mention in sent.ents:
            # Entities with a label outside the tracked set are ignored.
            if mention.type not in entities:
                continue
            entities[mention.type].append(mention.text)
            # encode() takes a batch; pass a one-item batch and unwrap it.
            vector = model.encode([mention.text])[0]
            embeddings[mention.type].append(vector)

    return entities, embeddings

def calculate_description_similarity(user1_about_me: str, user2_about_me: str) -> float:
    """Average cosine similarity between same-type entities of two descriptions.

    Named entities are extracted from both texts with
    ``extract_entities_and_embeddings``. Within each entity label, every entity
    of the first text is compared with every entity of the second; the result
    is the mean over all such pairs across all labels.

    Args:
        user1_about_me: Free-text description of the first user.
        user2_about_me: Free-text description of the second user.

    Returns:
        The mean pairwise cosine similarity as a plain ``float``, or ``0.0``
        when no label has entities in both texts.
    """
    # Only the vectors are needed here; the entity texts are discarded.
    _, embeddings1 = extract_entities_and_embeddings(user1_about_me)
    _, embeddings2 = extract_entities_and_embeddings(user2_about_me)

    per_category_scores = []
    for category, vectors1 in embeddings1.items():
        vectors2 = embeddings2.get(category, [])
        if len(vectors1) == 0 or len(vectors2) == 0:
            continue
        # One call yields the full (len(vectors1), len(vectors2)) similarity
        # matrix for this label; flatten it so all pairs weigh equally.
        pairwise = cosine_similarity(np.asarray(vectors1), np.asarray(vectors2))
        per_category_scores.append(pairwise.ravel())

    if not per_category_scores:
        return 0.0
    return float(np.concatenate(per_category_scores).mean())

def calculate_total_relevance(
    interest_similarity: float,
    description_similarity: float,
    interest_weight: float = 0.8,
    description_weight: float = 0.2,
) -> float:
    """Blend the two similarity scores into one relevance value.

    Args:
        interest_similarity: Similarity of the users' selected interests.
        description_similarity: Similarity of the users' free-text descriptions.
        interest_weight: Multiplier applied to ``interest_similarity``.
        description_weight: Multiplier applied to ``description_similarity``.

    Returns:
        ``interest_weight * interest_similarity
        + description_weight * description_similarity``. With the default
        weights this is ``0.8 * interest + 0.2 * description``.
    """
    return interest_weight * interest_similarity + description_weight * description_similarity

def compare_users(user1: User, user2: User) -> float:
    """Compute the overall relevance score between two users.

    The interest similarity is computed first, then the description
    similarity; both are combined by ``calculate_total_relevance``.

    Args:
        user1: First user.
        user2: Second user.

    Returns:
        The combined relevance score.
    """
    return calculate_total_relevance(
        calculate_interest_similarity(user1.selected_interests, user2.selected_interests),
        calculate_description_similarity(user1.about_me, user2.about_me),
    )

def compare_user_with_group(user: User, users: List[User]) -> List[Tuple[int, float]]:
    """Rank the members of ``users`` by their relevance to ``user``.

    Args:
        user: The user every candidate is compared with.
        users: Candidates; any entry whose ``user_id`` equals ``user.user_id``
            is left out.

    Returns:
        ``(user_id, relevance)`` tuples ordered from most to least relevant.
    """
    ranked = [
        (candidate.user_id, compare_users(user, candidate))
        for candidate in users
        if user.user_id != candidate.user_id
    ]
    # Highest score first; the sort is stable, so ties keep their input order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked


In [17]:
# Sample data. Interest ids are ints, matching User.selected_interests: List[int];
# get_user_embeddings converts them with str() before the table lookup.
user1 = User(
    user_id=1,
    about_me="РОССИЯ ЗОВ ГОЙДА",
    selected_interests=[1, 2]
)

user2 = User(
    user_id=2,
    about_me="ВОДКА БАЛАЛАЙКА",
    selected_interests=[2, 3]
)

user3 = User(
    user_id=3,
    about_me="Африка негры",
    selected_interests=[1, 4]
)

# Sample list of users
users = [user1, user2, user3]

# Example: similarity of the selected interests of two users
interest_similarity = calculate_interest_similarity(user1.selected_interests, user2.selected_interests)
print(f"Interest Similarity between User 1 and User 2: {interest_similarity:.4f}")

# Example: similarity of the descriptions of two users
description_similarity = calculate_description_similarity(user1.about_me, user2.about_me)
print(f"Description Similarity between User 1 and User 2: {description_similarity:.4f}")

# Example: compare one user with a group
relevance_scores = compare_user_with_group(user1, users)
print("\nRelevance Scores for User 1 with others:")
for user_id, score in relevance_scores:
    print(f"User {user_id}: {score:.4f}")

# Example: total relevance for two users, combined from the two similarities
# computed above instead of re-running the NER and encoder for the same pair.
total_relevance = calculate_total_relevance(interest_similarity, description_similarity)
print(f"\nTotal Relevance between User 1 and User 2: {total_relevance:.4f}")


Interest Similarity between User 1 and User 2: 0.7114
Description Similarity between User 1 and User 2: 0.6511

Relevance Scores for User 1 with others:
User 3: 0.7037
User 2: 0.6994

Total Relevance between User 1 and User 2: 0.6994
