In [1]:
import os
import re
import json
import logging
import asyncio
import requests
import subprocess
import textstat
import bert_score
import pandas as pd
import seaborn as sns
from pathlib import Path
from rouge import Rouge
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu


from llm_parser import DeepSeekAPI
from io_functions import run, popen, load_if_scraped

In [3]:
logging.basicConfig(level=logging.DEBUG, force=True)

In [4]:
# Launch `ollama serve` through the project's popen helper and keep the handle;
# it is terminated in the last cell of the notebook.
ollama_server_process = popen('ollama serve')

ollama_server_process

<Popen: returncode: None args: ['ollama', 'serve']>

In [5]:
def get_ollama_version():
    '''
    Return the version string reported by the local `ollama` executable.

    Returns:
    str: stripped stdout of `ollama --version`, or a fixed error message when the
    command exits with a non-zero status or the executable cannot be started.
    '''
    try:
        result = subprocess.run(
            ['ollama', '--version'],
            check=True,
            capture_output=True,
            text=True
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError / PermissionError, raised when the
        # `ollama` binary is missing or not executable.
        return 'Error while checking Ollama version.'
    return result.stdout.strip()

get_ollama_version()

'ollama version is 0.5.13'

In [6]:
process = run('ollama list')

print(process.stdout)

NAME               ID              SIZE      MODIFIED    
deepseek-r1:32b    38056bbcbb2d    19 GB     2 hours ago    
deepseek-r1:8b     28f8fd6cdc67    4.9 GB    4 days ago     



In [7]:
# Parse the `ollama list` table: keep non-blank lines, then drop the header row.
# Filtering on the stripped line matters: a whitespace-only line would otherwise
# survive as '' and make `model.split()[0]` below raise IndexError.
models = [
    line.strip() for line in process.stdout.splitlines()
        if line.strip()
][1:]

# The first whitespace-separated field of each row is the model name.
model_list = [model.split()[0] for model in models]
model_list

['deepseek-r1:32b', 'deepseek-r1:8b']

To prevent running a non-existent model, pick the model name from the list of locally installed models:

In [8]:
model_name = model_list[0]
model_name

'deepseek-r1:32b'

In [9]:
# model_name = 'deepseek-r1:32b'

In [10]:
# Download the model only when `ollama list` did not report it as installed.
if model_name not in model_list:
    run(f'ollama pull {model_name}')

In [11]:
print(run(f'ollama show {model_name}').stdout)

  Model
    architecture        qwen2     
    parameters          32.8B     
    context length      131072    
    embedding length    5120      
    quantization        Q4_K_M    

  Parameters
    stop    "<｜begin▁of▁sentence｜>"    
    stop    "<｜end▁of▁sentence｜>"      
    stop    "<｜User｜>"                 
    stop    "<｜Assistant｜>"            

  License
    MIT License                    
    Copyright (c) 2023 DeepSeek    




In [12]:
def extract_model_info(text: str):
    '''
    Pull the embedding and context lengths out of `ollama show` style text.

    Each value is the integer that follows the label 'embedding length' or
    'context length'; a label that does not occur in `text` yields None.
    '''
    def _first_int(pattern):
        # Integer captured by the first match of `pattern`, or None without a match.
        found = re.search(pattern, text)
        if found is None:
            return None
        return int(found.group(1))

    return {
        'embedding_length': _first_int(r'embedding length\s+(\d+)'),
        'context_length': _first_int(r'context length\s+(\d+)'),
    }

model_info = extract_model_info(run(f'ollama show {model_name}').stdout)
model_info

{'embedding_length': 5120, 'context_length': 131072}

In [13]:
# Start `ollama run <model>` through the popen helper; the handle is kept so the
# process can be terminated in the last cell of the notebook.
model_process = popen(f'ollama run {model_name}')

In [14]:
print(run('ollama ps').stdout)

NAME    ID    SIZE    PROCESSOR    UNTIL 



In [15]:
company_id = '312932093'

In [16]:
df = load_if_scraped(company_id=company_id)

df.head()

INFO:root:successfully loaded local transcripts


Unnamed: 0,companyid,companyname,mostimportantdateutc,mostimportanttimeutc,headline,full_text,uuid,word_count,word_count_nltk
0,312932093,Google LLC,2018-05-10,15:30:00,Google LLC Presents at The 14th annual Red Hat...,Attendees: Now if there's a company that under...,123c9e75-b8dc-40b1-b21c-ffc9e71e01c6,12407,14475
1,312932093,Google LLC,2023-06-15,21:00:00,"Google LLC, Squarespace, Inc. - M&A Call","Operator: Good afternoon. My name is Sara, and...",123c9e75-b8dc-40b1-b21c-ffc9e71e01c6,10078,11800


In [17]:
api = DeepSeekAPI(
    model_name=model_name
)

print(api)

DeepSeekAPI(model_name=deepseek-r1:32b, url=http://localhost:11434/api/generate, stream=False, timeout=30, max_retries=3)


In [18]:
import nest_asyncio
import asyncio
from tqdm.notebook import tqdm

nest_asyncio.apply()

async def batch_summary_generation(api: DeepSeekAPI, texts) -> list[str]:
    '''
    Summarize every text in `texts` with `api`, one request at a time.

    Each prompt is the text prefixed with 'summarize: '. The awaited results of
    `api.generate` are returned in the same order as the inputs.
    '''
    return [
        await api.generate(prompt='summarize: ' + document)
        for document in tqdm(texts)
    ]

# Now you can use await directly
summaries = await batch_summary_generation(api, df.full_text)

  0%|          | 0/2 [00:00<?, ?it/s]

AttributeError: type object 'DeepSeekAPI' has no attribute 'model_name'

In [None]:
summaries[0][:50]

In [None]:
summary_column = f'{model_name}-summaries'
df[summary_column] = summaries

In [None]:
summaries

In [None]:
def evaluate_summary(row, model_name, type = 'baseline'):
    '''
    Score one generated summary against the transcript it was generated from.

    Parameters:
    row: record exposing `full_text`, `uuid`, `companyid`, `companyname` and a
        '<model_name>-summaries' entry holding the summary text.
    model_name (str): model that produced the summary; selects the summary column
        and is recorded in the result.
    type (str): not used by this function; kept so existing calls keep working.

    Returns:
    pandas.DataFrame: a single row with the identifying fields, ROUGE
    recall/precision/F for every metric returned by `Rouge.get_scores`, BLEU,
    BERTScore precision/recall/F1, compression ratio and Flesch reading ease.
    '''
    text_to_summarize = row.full_text
    # Resolve the column from the argument instead of the notebook-global
    # `summary_column`, so the scores always belong to the model named in the result.
    summary = row[f'{model_name}-summaries']

    # The summary is the hypothesis, the full transcript is the reference.
    rouge_scores = Rouge().get_scores(summary, text_to_summarize)
    if isinstance(rouge_scores, list):
        rouge_scores = rouge_scores[0]

    # Whitespace tokens are shared by BLEU and the length-based compression ratio.
    reference_tokens = text_to_summarize.split()
    candidate_tokens = summary.split()
    bleu_score = sentence_bleu([reference_tokens], candidate_tokens)

    P, R, F1 = bert_score.score(
        [summary],
        [text_to_summarize],
        rescale_with_baseline=True,
        lang='en'
    )

    original_len = len(reference_tokens)
    summary_len = len(candidate_tokens)
    # Guard against an empty transcript to avoid dividing by zero.
    compression_ratio = summary_len / original_len if original_len > 0 else 0

    readability = textstat.flesch_reading_ease(summary)

    results = {
        'model_name': model_name,
        'uuid': row.uuid,
        'companyid': row.companyid,
        'companyname': row.companyname,
    }

    for metric, scores in rouge_scores.items():
        results[f'{metric}_r'] = scores['r']
        results[f'{metric}_p'] = scores['p']
        results[f'{metric}_f'] = scores['f']

    results['bleu'] = bleu_score
    # A single candidate/reference pair was scored, so each tensor holds one value.
    results['bert_precision'] = P.item()
    results['bert_recall'] = R.item()
    results['bert_f1'] = F1.item()
    results['compression_ratio'] = compression_ratio
    results['readability'] = readability

    return pd.DataFrame([results])

In [None]:
# Collect the one-row frames first and concatenate once; concatenating inside the
# loop copies the accumulated frame on every iteration.
evaluation_frames = []

# iterrows() is a generator, so pass the length for a meaningful progress bar.
for _, row in tqdm(df.iterrows(), total=len(df)):
    evaluation_frames.append(evaluate_summary(row, model_name))

# pd.concat raises on an empty list; fall back to an empty frame when df has no rows.
evaluation_results = (
    pd.concat(evaluation_frames, ignore_index=True)
    if evaluation_frames else pd.DataFrame()
)

evaluation_results.head()

In [None]:
# Append this run's evaluation rows to the shared results file unless the same
# model/company combination has already been stored.
csv_filename = Path('..') / 'data' / 'evaluation_results.csv'

if csv_filename.exists():
    existing_df = pd.read_csv(csv_filename)
    # read_csv parses the companyid column as integers while company_id is a
    # string in this notebook; comparing both as strings keeps the duplicate
    # check from being silently always False.
    already_saved = (
        (existing_df.model_name == model_name)
        & (existing_df.companyid.astype(str) == str(company_id))
    ).any()
    if already_saved:
        logging.info(f'model {model_name} and {company_id} combination already exists in {csv_filename}. no new row added.')
        updated_df = existing_df
    else:
        updated_df = pd.concat([existing_df, evaluation_results], ignore_index=True)
        logging.info(f'model {model_name} not found. appending new row to {csv_filename}.')
else:
    updated_df = evaluation_results
    logging.info(f'{csv_filename} not found. creating new file.')

# to_csv does not create missing directories.
csv_filename.parent.mkdir(parents=True, exist_ok=True)
updated_df.to_csv(csv_filename, index=False)
logging.info(f'results saved to {csv_filename}')

In [None]:
def visualize_results(df):
    '''
    Plot the numeric evaluation metrics held in a DataFrame.

    Parameters:
    df (pandas.DataFrame): evaluation results; only numeric columns are plotted.

    Two figures are shown:
    - one histogram per numeric column,
    - a heatmap of the pairwise correlations between the numeric columns.
    '''
    metric_columns = list(df.select_dtypes(include='number').columns)
    metrics = df[metric_columns]

    # Figure 1: distribution of every metric.
    metrics.hist(bins=20, figsize=(15, 10))
    plt.suptitle('Histograms of Evaluation Metrics', fontsize=16)
    # Leave room at the top of the figure for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()

    # Figure 2: correlation between metrics.
    plt.figure(figsize=(12, 10))
    sns.heatmap(metrics.corr(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap of Evaluation Metrics', fontsize=16)
    plt.show()

visualize_results(evaluation_results)

In [None]:
# Persist the transcripts together with the generated summaries as a
# tab-separated file named after the company and model.
summaries_path = (
    Path('..') / 'data' / 'summaries' / f'{company_id}_{model_name}.csv'.replace('/','-')
)

# to_csv does not create missing directories, so make sure the target exists.
summaries_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(
        summaries_path,
        sep='\t',
        index=False,
        quoting=1,  # 1 == csv.QUOTE_ALL: every field is quoted
        escapechar='\\',
        doublequote=True,
        quotechar='"',
        lineterminator='\n'
    )

In [None]:
print(df.full_text[0])

In [None]:
df[summary_column][0]

In [None]:
# Stop the background processes started earlier: the model runner first, then the server.
for background_process in (model_process, ollama_server_process):
    background_process.terminate()