In [1]:
%pip install sentence_transformers
%pip install pyngrok
%pip install flask-babel

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence_transformers)
 

In [2]:
import pandas as pd
import torch
import numpy as np
from flask import Flask, request, jsonify, render_template, send_from_directory
from flask_babel import Babel, _
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import (BertTokenizer, BertModel,
                          RobertaTokenizer, RobertaModel,
                          DistilBertTokenizer, DistilBertModel,
                          AlbertTokenizer, AlbertModel,
                          XLNetTokenizer, XLNetModel)
from sentence_transformers import SentenceTransformer
from pyngrok import ngrok
from google.colab import userdata
from google.colab import drive
import re
import matplotlib.pyplot as plt
import seaborn as sns
import time
import tracemalloc

token = userdata.get('auth_token')
!ngrok config add-authtoken $token

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [3]:
drive.mount('/content/drive')
with open('/content/drive/My Drive/bp/themes.json', 'r', encoding='utf-8') as f:
    data = pd.read_json(f)

def strip_html(text):
    if isinstance(text, str):
        clean = re.compile('<.*?>')
        return re.sub(clean, '', text)
    return ""

data['Combined_CZ'] = data['Name_CZ'].fillna('').apply(strip_html) + " " + data['Targets_CZ'].fillna('').apply(strip_html)
data['Combined_EN'] = data['Name_EN'].fillna('').apply(strip_html) + " " + data['Targets_EN'].fillna('').apply(strip_html)

Mounted at /content/drive


In [4]:
# Both English and Czech are supported but for English version there is too much sparsity
# And for Czech not all models have their Czech version yet, but if it will be in future, it can be easily added
def load_bert(language='CZ'):
    if language == 'CZ':
        tokenizer = BertTokenizer.from_pretrained('fav-kky/FERNET-C5')
        model = BertModel.from_pretrained('fav-kky/FERNET-C5')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')
    return tokenizer, model

def load_sbert():
    return SentenceTransformer('all-mpnet-base-v2')

def load_tfidf(language='CZ'):
    vectorizer = TfidfVectorizer()
    column_name = 'Combined_CZ' if language == 'CZ' else 'Combined_EN'
    vectorizer.fit(data[column_name])
    return vectorizer

def load_roberta(language='CZ'):
    if language == 'CZ':
        tokenizer = RobertaTokenizer.from_pretrained('fav-kky/FERNET-C5-RoBERTa')
        model = RobertaModel.from_pretrained('fav-kky/FERNET-C5-RoBERTa')
    else:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        model = RobertaModel.from_pretrained('roberta-base')
    return tokenizer, model

def load_distilbert(language='CZ'):
    if language == 'CZ':
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
        model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
    else:
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        model = DistilBertModel.from_pretrained('distilbert-base-uncased')
    return tokenizer, model

def load_albert(language='CZ'):
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    model = AlbertModel.from_pretrained('albert-base-v2')
    return tokenizer, model

def load_xlnet(language='CZ'):
    if language == 'CZ':
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        model = XLNetModel.from_pretrained('xlnet-base-cased')
    else:
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        model = XLNetModel.from_pretrained('xlnet-base-cased')
    return tokenizer, model

def get_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def get_sbert_embedding(text, model):
    return model.encode(text)

def get_tfidf_embedding(text, vectorizer):
    return vectorizer.transform([text]).toarray()[0]

def get_roberta_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def get_distilbert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def get_albert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def get_xlnet_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def precompute_embeddings(model_name, language):
    column_name = 'Combined_CZ' if language == 'CZ' else 'Combined_EN'
    embeddings = []
    if model_name == 'bert':
        tokenizer, model = load_bert()
        for theme in data[column_name]:
            embedding = get_bert_embedding(theme, tokenizer, model)
            embeddings.append(torch.tensor(embedding))
    elif model_name == 'sbert':
        model = load_sbert()
        for theme in data[column_name]:
            embedding = get_sbert_embedding(theme, model)
            embeddings.append(torch.tensor(embedding))
    elif model_name == 'tfidf':
        vectorizer = load_tfidf()
        vectorizer.fit(data[column_name])
        for theme in data[column_name]:
            embedding = get_tfidf_embedding(theme, vectorizer)
            embeddings.append(torch.tensor(embedding))
    elif model_name == 'roberta':
        tokenizer, model = load_roberta()
        for theme in data[column_name]:
            embedding = get_roberta_embedding(theme, tokenizer, model)
            embeddings.append(torch.tensor(embedding))
    elif model_name == 'distilbert':
        tokenizer, model = load_distilbert()
        for theme in data[column_name]:
            embedding = get_distilbert_embedding(theme, tokenizer, model)
            embeddings.append(torch.tensor(embedding))
    elif model_name == 'albert':
        tokenizer, model = load_albert()
        for theme in data[column_name]:
            embedding = get_albert_embedding(theme, tokenizer, model)
            embeddings.append(torch.tensor(embedding))
    elif model_name == 'xlnet':
        tokenizer, model = load_xlnet()
        for theme in data[column_name]:
            embedding = get_xlnet_embedding(theme, tokenizer, model)
            embeddings.append(torch.tensor(embedding))
    return torch.stack(embeddings)


In [5]:
# Define control flags
COMPUTE_AND_SAVE = True
LOAD_EMBEDDINGS = not COMPUTE_AND_SAVE

# Define model names and their corresponding save paths for both languages
models_list = ['tfidf', 'bert', 'sbert', 'roberta', 'distilbert', 'albert', 'xlnet']
embedding_paths_CZ = {model: f'/content/drive/My Drive/bp/thesis_embeddings_{model}_CZ.pt' for model in models_list}
#embedding_paths_EN = {model: f'/content/drive/My Drive/bp/thesis_embeddings_{model}_EN.pt' for model in models_list}

# Dictionary to hold loaded or computed embeddings for each language
thesis_embeddings_CZ = {}
#thesis_embeddings_EN = {}

# English version has too many nulls for themes and targets

# If computing and saving is enabled
if COMPUTE_AND_SAVE:
    for model in models_list:
        print(f"\nProcessing {model} model embeddings...")

        start_time = time.time()
        tracemalloc.start()

        # Compute embeddings for Czech and save
        thesis_embeddings_CZ[model] = precompute_embeddings(model, language='CZ')
        torch.save(thesis_embeddings_CZ[model], embedding_paths_CZ[model])

        current, peak = tracemalloc.get_traced_memory()
        end_time = time.time()

        print(f"Time taken for {model} (CZ): {end_time - start_time:.2f} seconds")
        print(f"Peak memory usage for {model} (CZ): {peak / 1024 / 1024:.2f} MB")

        #start_time = time.time()
        #tracemalloc.start()
#
        ## Compute embeddings for English and save
        #thesis_embeddings_EN[model] = precompute_embeddings(model, language='EN')
        #torch.save(thesis_embeddings_EN[model], embedding_paths_EN[model])
#
        #current, peak = tracemalloc.get_traced_memory()
        #end_time = time.time()
#
        #print(f"Time taken for {model} (EN): {end_time - start_time:.2f} seconds")
        #print(f"Peak memory usage for {model} (EN): {peak / 1024 / 1024:.2f} MB")
#
        ## Stop tracking memory
        #tracemalloc.stop()


# If loading embeddings is enabled
if LOAD_EMBEDDINGS:
    for model in models_list:
        print(f"\nLoading embeddings for {model}...")

        start_time = time.time()
        tracemalloc.start()

        # Load embeddings for Czech
        try:
            thesis_embeddings_CZ[model] = torch.load(embedding_paths_CZ[model])
        except FileNotFoundError:
            print(f"Czech embedding file for {model} not found.")

        current, peak = tracemalloc.get_traced_memory()
        end_time = time.time()
        print(f"Time taken to load {model} (CZ): {end_time - start_time:.2f} seconds")
        print(f"Peak memory usage for {model} (CZ): {peak / 1024 / 1024:.2f} MB")

        #start_time = time.time()
        #tracemalloc.start()


        # Load embeddings for English
        #try:
        #    thesis_embeddings_EN[model] = torch.load(embedding_paths_EN[model])
        #except FileNotFoundError:
        #    print(f"English embedding file for {model} not found.")

        #current, peak = tracemalloc.get_traced_memory()
        #end_time = time.time()

        #print(f"Time taken to load {model} (EN): {end_time - start_time:.2f} seconds")
        #print(f"Peak memory usage for {model} (EN): {peak / 1024 / 1024:.2f} MB")

        # Stop tracking memory
        #tracemalloc.stop()



# Load models conditionally if embeddings are loaded successfully
if thesis_embeddings_CZ: #and thesis_embeddings_EN
    models = {
        "bert": load_bert(),
        "sbert": load_sbert(),
        "tfidf": load_tfidf(),
        "roberta": load_roberta(),
        "distilbert": load_distilbert(),
        "albert": load_albert(),
        "xlnet": load_xlnet()
    }

# Combine embeddings into a final dictionary for each language
embeddings_dict_CZ = {model: thesis_embeddings_CZ[model] for model in models_list if model in thesis_embeddings_CZ}
# embeddings_dict_EN = {model: thesis_embeddings_EN[model] for model in models_list if model in thesis_embeddings_EN}



Processing tfidf model embeddings...
Time taken for tfidf (CZ): 18.09 seconds
Peak memory usage for tfidf (CZ): 6.12 MB

Processing bert model embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/874k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/566 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/654M [00:00<?, ?B/s]

Time taken for bert (CZ): 930.13 seconds
Peak memory usage for bert (CZ): 59.11 MB

Processing sbert model embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Time taken for sbert (CZ): 1552.77 seconds
Peak memory usage for sbert (CZ): 82.39 MB

Processing roberta model embeddings...


tokenizer_config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/932k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/592k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at fav-kky/FERNET-C5-RoBERTa and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Time taken for roberta (CZ): 1064.12 seconds
Peak memory usage for roberta (CZ): 106.79 MB

Processing distilbert model embeddings...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Time taken for distilbert (CZ): 638.23 seconds
Peak memory usage for distilbert (CZ): 116.34 MB

Processing albert model embeddings...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Time taken for albert (CZ): 1478.65 seconds
Peak memory usage for albert (CZ): 116.34 MB

Processing xlnet model embeddings...


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]

Time taken for xlnet (CZ): 2323.52 seconds
Peak memory usage for xlnet (CZ): 116.34 MB


Some weights of RobertaModel were not initialized from the model checkpoint at fav-kky/FERNET-C5-RoBERTa and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
app = Flask(__name__, template_folder='/content/drive/My Drive/bp/templates',
                      static_folder='/content/drive/My Drive/bp/static')
babel = Babel(app)

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/recommendations', methods=['POST'])
def get_recommendations():
    try:
        keywords = request.form.get('keywords')
        approach = request.form.get('approach')
        work_type = request.form.get('workType')
        language = request.form.get('language', 'CZ')  # Default to Czech
        offset = int(request.form.get('offset', 0))

        if not keywords or not approach or not work_type:
            return jsonify({'error': 'Keywords, approach, and work type are required.'}), 400

        #column_name = 'Combined_CZ' if language == 'CZ' else 'Combined_EN'
        data_column = data['Combined_CZ']

        embedding_key = approach

        # Select the correct embeddings dictionary based on language
        thesis_embeddings = embeddings_dict_CZ.get(embedding_key) # Add check for language for english if needed

        if thesis_embeddings is None:
            print(f"Error: Embeddings for '{embedding_key}' not found in {'CZ' if language == 'CZ' else 'EN'} dictionary.")
            return jsonify({'error': f'Embeddings for "{embedding_key}" not found. Please check your embeddings files.'}), 400

        if approach in models:
            model_entry = models[approach]
            if approach == "tfidf":
                vectorizer = model_entry
                vectorizer.fit(data_column)
                user_embedding = get_tfidf_embedding(keywords, vectorizer)
            elif approach == "sbert":
                model = model_entry
                user_embedding = get_sbert_embedding(keywords, model)
            else:
                tokenizer, model = model_entry
                user_embedding = globals()[f'get_{approach}_embedding'](keywords, tokenizer, model)
        else:
            return jsonify({'error': 'Unknown approach selected.'}), 400

        # Convert user_embedding to numpy array if necessary
        if not isinstance(user_embedding, np.ndarray):
            user_embedding = np.array(user_embedding)

        similarities = cosine_similarity([user_embedding], thesis_embeddings)
        top_indices = similarities.argsort()[0][::-1]
        unique_themes = set()
        recommendations = []

        for index in top_indices:
            theme_name = data.iloc[index][f'Name_{language}']
            theme_work_type = data.iloc[index]['Type of Work']

            if theme_name not in unique_themes and theme_work_type == work_type:
                unique_themes.add(theme_name)
                recommendations.append({
                    f'Name_{language}': theme_name,
                    'Supervisor': data.iloc[index]['Supervisor'],
                    f'Targets_{language}': data.iloc[index][f'Targets_{language}']
                })

        paginated_recommendations = recommendations[offset:offset + 5]

        return jsonify(paginated_recommendations)

    except Exception as e:
        print(f"Error: {e}")
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    ngrok_tunnel = ngrok.connect(5000)
    print(f'Public URL: {ngrok_tunnel.public_url}')
    app.run(port=5000)


Public URL: https://5d65-34-125-25-225.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [20/Feb/2025 14:32:20] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [20/Feb/2025 14:32:22] "GET /static/flags/en.png HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [20/Feb/2025 14:32:22] "GET /static/styles.css HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [20/Feb/2025 14:32:22] "GET /static/flags/ukr.png HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [20/Feb/2025 14:32:22] "GET /static/app.js HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [20/Feb/2025 14:32:22] "GET /static/flags/cz.png HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [20/Feb/2025 14:32:23] "GET /static/favicon.ico HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [20/Feb/2025 14:32:23] "GET /static/langs/en.json HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [20/Feb/2025 14:32:27] "POST /recommendations HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [20/Feb/2025 14:32:39] "POST /recommendations HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.

In [None]:
def test_models_against_keywords(keywords_list, models, work_type, language='CZ'):
    results = {}

    for keywords in keywords_list:
        results[keywords] = {}

        for model_name, model in models.items():
            with app.test_request_context('/recommendations', method='POST', data={
                'keywords': keywords,
                'approach': model_name,
                'workType': work_type,
                'language': language,
                'offset': 0
            }):
                try:
                    response = get_recommendations()
                    recommendations = response.get_json()
                    results[keywords][model_name] = recommendations
                except Exception as e:
                    print(f"Error fetching recommendations for {keywords} with {model_name}: {e}")
                    results[keywords][model_name] = []

    return results


def comparative_analysis(results, language='CZ'):
    analysis = {}
    total_common_counts = {model: 0 for model in results[list(results.keys())[0]]}
    name_key = 'Name_EN' if language == 'EN' else 'Name_CZ'
    print(name_key)

    for keywords, model_results in results.items():
        analysis[keywords] = {}
        recommendations = {model_name: set() for model_name in model_results.keys()}

        # Collect recommendations for each model
        for model_name, recommendations_list in model_results.items():
            recommendations[model_name] = {
                rec.get(name_key, '') for rec in recommendations_list if name_key in rec
            }

        # Count recommendations and store unique sets
        for model_name, rec_set in recommendations.items():
            analysis[keywords][model_name] = {
                'num_recommendations': len(rec_set),
                'unique_recommendations': rec_set
            }

        # Compare recommendations between models, including Jaccard Similarity
        for model1 in recommendations.keys():
            for model2 in recommendations.keys():
                if model1 != model2:
                    intersection = recommendations[model1].intersection(recommendations[model2])
                    union = recommendations[model1].union(recommendations[model2])
                    num_common = len(intersection)
                    jaccard_similarity = num_common / len(union) if union else 0  # Handle empty sets

                    analysis[keywords][f'{model1} vs {model2}'] = {
                        'common_recommendations': intersection,
                        'num_common': num_common,
                        'jaccard_similarity': jaccard_similarity
                    }

                    # Sum the number of common recommendations for each model
                    total_common_counts[model1] += num_common
                    total_common_counts[model2] += num_common

    return analysis, total_common_counts


def visualize_comparisons(comparison_results):
    # Get all unique model names
    model_names = sorted(set(
        key for keyword in comparison_results.keys()
        for key in comparison_results[keyword].keys()
        if 'vs' not in key
    ))

    num_models = len(model_names)

    # Create a DataFrame to store common recommendations
    heatmap_data = pd.DataFrame(np.zeros((num_models, num_models)), index=model_names, columns=model_names)

    # Fill the DataFrame with the number of common recommendations
    for keyword in comparison_results.keys():
        for model1 in model_names:
            for model2 in model_names:
                if model1 != model2:  # Only compare different models
                    common_count = comparison_results[keyword].get(f'{model1} vs {model2}', {}).get('num_common', 0)
                    heatmap_data.loc[model1, model2] += common_count

                # Keep diagonal as zero (but will be painted gray)
                if model1 == model2:
                    heatmap_data.loc[model1, model2] = np.nan  # Use NaN for diagonal

    # Create a heatmap
    plt.figure(figsize=(10, 8))
    # Mask diagonal
    mask = np.eye(num_models, dtype=bool)

    sns.heatmap(heatmap_data, annot=True, fmt=".0f", cmap="YlGnBu", cbar_kws={'label': 'Number of Common Recommendations'}, mask=mask)

    # Paint diagonal in gray
    for i in range(num_models):
        plt.gca().add_patch(plt.Rectangle((i, i), 1, 1, color='gray', alpha=0.5))

    plt.title('Heatmap of Common Recommendations Between Models')
    plt.xlabel('Models')
    plt.ylabel('Models')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

def plot_total_common_recommendations(total_common_counts):
    model_names = list(total_common_counts.keys())
    common_counts = list(total_common_counts.values())

    plt.figure(figsize=(10, 6))
    plt.bar(model_names, common_counts, color='skyblue')
    plt.xlabel('Models')
    plt.ylabel('Total Common Recommendations')
    plt.title('Total Number of Common Recommendations for Each Model')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def visualize_jaccard_similarity(analysis):
    # Extract unique model names
    model_names = sorted(set(
        key.split(" vs ")[0] for keyword in analysis.keys()
        for key in analysis[keyword].keys()
        if 'vs' in key
    ))

    # Initialize DataFrame to store average Jaccard Similarity between models
    jaccard_matrix = pd.DataFrame(np.zeros((len(model_names), len(model_names))), index=model_names, columns=model_names)

    # Fill in the Jaccard Similarity scores
    for keyword, comparisons in analysis.items():
        for model1 in model_names:
            for model2 in model_names:
                if model1 != model2:
                    # Get Jaccard Similarity for this model pair for the current keyword
                    jaccard_score = comparisons.get(f"{model1} vs {model2}", {}).get("jaccard_similarity", 0)

                    # Sum up scores across keywords for average calculation
                    jaccard_matrix.loc[model1, model2] += jaccard_score

    # Calculate the average Jaccard Similarity by dividing by the number of keywords
    jaccard_matrix /= len(analysis)

    # Create a mask to hide the diagonal and fill it with NaN for styling
    mask = np.eye(len(model_names), dtype=bool)
    jaccard_matrix[mask] = np.nan

    # Plot heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(jaccard_matrix, annot=True, fmt=".2f", cmap="YlGnBu", mask=mask, cbar_kws={'label': 'Jaccard Similarity'})
    plt.title('Jaccard Similarity Heatmap Between Models')
    plt.xlabel('Models')
    plt.ylabel('Models')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

In [None]:

# keywords to test
keywords_list = [
    'aplikace site grafika',
    'analýza dat',
    'strojové učení',
    'umělá inteligence',
    'webové aplikace',
    'big data analýza',
    'internet věcí',
    'zpracování přirozeného jazyka',
    'počítačové vidění',
    'mobilní aplikace',
    'cloud computing',
    'datové sklady',
    'podniková analytika',
    'systémy pro správu obsahu',
    'uživatelské rozhraní a zkušenost',
    'umělá neuronová síť',
    'senzorové sítě',
    'prediktivní analýza',
    'robotika a automatizace',
    'kybernetická bezpečnost',
    'strojové vidění',
    'vzdělávací technologie',
    'zpracování obrazu',
    'virtuální realita',
    'rozšířená realita',
    'programování v Pythonu',
    'programování v Javě',
    'systémy řízení databází',
    'datové modelování',
    'analýza sentimentu',
    'řídicí systémy',
    'internetové protokoly',
    'softwarové inženýrství',
    'případové studie',
    'mobilní technologie',
    'digitalizace procesů',
    'správa projektů',
    'agilní metodologie',
    'kognitivní technologie',
    'udržitelný rozvoj technologií',
    'umělé neuronové sítě v medicíně',
    'digitální transformace',
    'technologie blockchain',
    'analýza sociálních médií',
    'analýza obrazových dat pomocí strojového učení',
    'aplikace rozšířené reality pro vzdělávání',
    'využití umělé inteligence v diagnostice nemocí',
    'optimalizace dodavatelského řetězce pomocí datové analytiky',
    'vývoj aplikací pro automatizaci obchodních procesů',
    'zpracování dat pomocí Pythonu a knihovny Pandas',
    'nové trendy v oblasti digitálního marketingu',
    'aplikace umělé inteligence v osobních asistentech',
    'využití datových modelů pro predikci chování zákazníků',
    'implementace cloudových řešení pro podnikové systémy',
    'analyzování trendů v oblasti kybernetické bezpečnosti',
    'aplikace strojového učení v personalizovaném marketingu',
    'rozhodovací stromy v analýze dat',
    'transformace podnikových procesů pomocí digitálních technologií',
    'aplikace IoT v monitorování a řízení spotřeby energie',
    'analýza dat z mobilních aplikací pro zlepšení uživatelské zkušenosti',
    'datová analytika pro optimalizaci cenových strategií',
    'případové studie úspěšného využití technologií v byznysu',
    'zpracování dat v reálném čase pro včasné rozhodování',
    'vývoj herních aplikací a využití herních technologií',
    'výzkum v oblasti biologických dat a bioinformatiky',
    'využití technologií umělé inteligence ve finančních službách',
    'analyzování efektivity reklamních kampaní pomocí datových analýz',
    'vytváření dynamických webových stránek s JavaScriptem',
    'design a implementace databázových systémů',
    'testování a validace softwarových aplikací',
    'mobilní technologie a vývoj aplikací pro chytré telefony',
    'vytváření uživatelsky přívětivých rozhraní pro webové aplikace',
    'aplikace analýzy sentimentu v oblasti zákaznického servisu',
    'nové přístupy k vývoji softwaru s využitím metodiky DevOps',
    'aplikace umělé inteligence v automobilovém průmyslu',
    'správa a analýza metadat pro efektivní správu informací',
    'digitalizace procesů v oblasti zdravotnictví',
    'použití datové analytiky při rozhodování o investicích'
]

work_type = 'bakalářská práce'

# Run the keyword testing
results = test_models_against_keywords(keywords_list, models, work_type)

# Perform comparative analysis and get total common counts
comparison_results, total_common_counts = comparative_analysis(results)

# Visualize comparisons
visualize_comparisons(comparison_results)

# Plot the total common recommendations for each model
plot_total_common_recommendations(total_common_counts)

visualize_jaccard_similarity(comparison_results)

In [None]:
# keywords in English for testing
keywords_list_en = [
    'application network graphics',
    'data analysis',
    'machine learning',    'artificial intelligence',
    'web applications',
    'big data analysis',
    'internet of things',
    'natural language processing',
    'computer vision',
    'mobile applications',
    'cloud computing',
    'data warehouses',
    'enterprise analytics',
    'content management systems',
    'user interface and experience',
    'artificial neural network',
    'sensor networks',
    'predictive analysis',
    'robotics and automation',
    'cybersecurity',
    'machine vision',
    'educational technology',
    'image processing',
    'virtual reality',
    'augmented reality',
    'programming in Python',
    'programming in Java',
    'database management systems',
    'data modeling',
    'sentiment analysis',
    'control systems',
    'internet protocols',
    'software engineering',
    'case studies',
    'mobile technology',
    'process digitization',
    'project management',
    'agile methodologies',
    'cognitive technologies',
    'sustainable technology development',
    'artificial neural networks in medicine',
    'digital transformation',
    'blockchain technology',
    'social media analysis',
    'image data analysis using machine learning',
    'augmented reality applications for education',
    'use of AI in disease diagnosis',
    'supply chain optimization using data analytics',
    'business process automation application development',
    'data processing with Python and Pandas',
    'new trends in digital marketing',
    'AI applications in personal assistants',
    'data models for customer behavior prediction',
    'cloud solutions implementation for enterprise systems',
    'cybersecurity trend analysis',
    'machine learning in personalized marketing',
    'decision trees in data analysis',
    'business process transformation with digital technologies',
    'IoT in energy consumption monitoring and management',
    'data analysis from mobile applications to improve user experience',
    'data analytics for pricing strategy optimization',
    'case studies of successful technology use in business',
    'real-time data processing for timely decision-making',
    'game application development and gaming technology use',
    'biological data research and bioinformatics',
    'AI technology in financial services',
    'advertising campaign efficiency analysis using data analytics',
    'creating dynamic web pages with JavaScript',
    'database system design and implementation',
    'software application testing and validation',
    'mobile technology and smart device application development',
    'creating user-friendly interfaces for web applications',
    'sentiment analysis application in customer service',
    'new software development approaches using DevOps methodology',
    'AI applications in the automotive industry',
    'metadata management and analysis for efficient information management',
    'healthcare process digitization',
    'data analytics in investment decision-making'
]

# Set the work type for the tests
work_type = 'bakalářská práce'

# Run the keyword testing for English language
results_en = test_models_against_keywords(keywords_list_en, models, work_type, language='EN')

# Perform comparative analysis and get total common counts
comparison_results_en, total_common_counts_en = comparative_analysis(results_en,language='EN')

# Visualize comparisons
visualize_comparisons(comparison_results_en)

# Plot the total common recommendations for each model
plot_total_common_recommendations(total_common_counts_en)

visualize_jaccard_similarity(comparison_results_en)