# Проверка LLM-моделей на CPU


- Проверить загрузку и генерацию 5 небольших моделей на CPU.
- Провести простую оценку качества (BLEU, ROUGE) на двух коротких примерах.

In [None]:
# Установка зависимостей
!pip install llama-cpp-python transformers evaluate pandas matplotlib seaborn psutil

In [None]:
import time
import pandas as pd
import psutil
from llama_cpp import Llama
from transformers import AutoModelForCausalLM, AutoTokenizer
# Models under test. Every entry has a display 'name' and a 'loader' backend:
#   'llama_cpp'    -> 'path' is handed to Llama(model_path=...)
#   'transformers' -> 'model_id' is handed to from_pretrained(...)
# NOTE(review): confirm that the GGUF file exists locally and that every
# model_id resolves before running the benchmark.
models = [
    dict(
        name='tinyLLaMA-1B (Q4)',
        loader='llama_cpp',
        path='./models/tinyllama-1b-chat.Q4_K_M.gguf',
    ),
]
models += [
    dict(name=label, loader='transformers', model_id=hub_id)
    for label, hub_id in (
        ('Qwen-1B-small', 'Qwen/Qwen-1B-small'),
        ('GPT2 (124M)', 'gpt2'),
        ('GPT-Neo 125M', 'EleutherAI/gpt-neo-125M'),
        ('Bloom 560M', 'bigscience/bloom-560m'),
    )
]

In [None]:
# Measure model load time and generation time on CPU.
# One record is produced per model. Any failure while loading or generating
# is stored in the record's 'error' field so that a single broken model does
# not abort the whole benchmark; timings stay None for the step that failed.
results = []
prompt = 'The quick brown fox jumps over the lazy dog.'
for m in models:
    rec = {'model': m['name'], 'loaded': False, 'load_time': None, 'gen_time': None}
    try:
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump with wall-clock changes.
        t0 = time.perf_counter()
        if m['loader'] == 'llama_cpp':
            mdl = Llama(model_path=m['path'], n_threads=4)
            rec['load_time'] = time.perf_counter() - t0
            rec['loaded'] = True
            t1 = time.perf_counter()
            mdl(prompt, max_tokens=32)
            rec['gen_time'] = time.perf_counter() - t1
        else:
            tokenizer = AutoTokenizer.from_pretrained(m['model_id'])
            model = AutoModelForCausalLM.from_pretrained(m['model_id']).to('cpu')
            rec['load_time'] = time.perf_counter() - t0
            rec['loaded'] = True
            t1 = time.perf_counter()
            # Tokenization is deliberately part of the timed generation step.
            inputs = tokenizer(prompt, return_tensors='pt')
            model.generate(**inputs, max_new_tokens=32)
            rec['gen_time'] = time.perf_counter() - t1
    except Exception as e:
        rec['error'] = str(e)
    # Mirror the load status onto the model entry: the quality-evaluation
    # loop reads m.get('loaded', True) to decide which models to skip.
    m['loaded'] = rec['loaded']
    results.append(rec)
df = pd.DataFrame(results)
df

In [None]:
# Plot load_time and gen_time per model as two side-by-side bar charts.
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style='whitegrid')
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# (DataFrame column, panel title) for each subplot, left to right.
panels = (
    ('load_time', 'Load Time (s)'),
    ('gen_time', 'Generation Time (s)'),
)
for ax, (column, title) in zip(axes, panels):
    sns.barplot(data=df, x='model', y=column, ax=ax)
    ax.set_title(title)
    # Model names are long; tilt them so they do not overlap.
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Оценка качества генерации
Проводим простую оценку на двух коротких примерах.

In [None]:
import evaluate

# Two tiny (prompt, reference) pairs used as a smoke test of output quality.
examples = [
    ('Translate to French: Hello, how are you?', 'Bonjour, comment ça va ?'),
    ('Summarize: Tiny models enable fast inference.', 'Tiny models allow quick local inference.')
]
bleu = evaluate.load('bleu')
rouge = evaluate.load('rouge')
qual_results = []
for m in models:
    # Skip models explicitly flagged as not loaded; an entry without the
    # flag is treated as loadable.
    if not m.get('loaded', True):
        continue
    preds = []
    refs = []
    try:
        # Collect one prediction per example.
        if m['loader'] == 'llama_cpp':
            mdl = Llama(model_path=m['path'], n_threads=4)
            for inp, ref in examples:
                out = mdl(inp, max_tokens=64)['choices'][0]['text']
                preds.append(out)
                refs.append([ref])
        else:
            tokenizer = AutoTokenizer.from_pretrained(m['model_id'])
            model = AutoModelForCausalLM.from_pretrained(m['model_id']).to('cpu')
            for inp, ref in examples:
                inpt = tokenizer(inp, return_tensors='pt')
                gen = model.generate(**inpt, max_new_tokens=64)
                # For causal LMs generate() returns prompt + continuation.
                # Drop the prompt tokens so only newly generated text is
                # scored (presumably matching the llama_cpp branch, which
                # does not echo the prompt by default — verify).
                prompt_len = inpt['input_ids'].shape[1]
                out = tokenizer.decode(gen[0][prompt_len:], skip_special_tokens=True)
                preds.append(out)
                refs.append([ref])
        # Compute metrics. The ROUGE result is a flat dict keyed by
        # 'rouge1' / 'rouge2' / 'rougeL' / 'rougeLsum'.
        bleu_score = bleu.compute(predictions=preds, references=refs)['bleu']
        rouge_score = rouge.compute(predictions=preds, references=refs)['rougeL']
    except Exception as e:
        # Best-effort, like the timing benchmark: report the failure and
        # continue with the remaining models instead of aborting the cell.
        print(f"Skipping {m['name']}: {e}")
        continue
    qual_results.append({'model': m['name'], 'BLEU': bleu_score, 'ROUGE-L': rouge_score})
# Fix the columns so the frame has the expected schema even when every
# model was skipped.
df_qual = pd.DataFrame(qual_results, columns=['model', 'BLEU', 'ROUGE-L'])
df_qual

In [None]:
# Grouped bar chart of the quality metrics: one group per model, one bar
# per metric.
plt.figure(figsize=(8, 4))

# Reshape to long form (model, metric, score) so seaborn can colour by metric.
quality_long = df_qual.melt(
    id_vars='model',
    var_name='metric',
    value_name='score',
)
sns.barplot(
    data=quality_long,
    x='model',
    y='score',
    hue='metric',
)

plt.title('Quality Metrics')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()