# LLM Extraction

Leveraging modern LLM capabilities I want to try to extract in an structured manner all the degrees completed by the subjects

In [1]:
# Imports
from pydantic import BaseModel, Field
from typing import List
from langchain import LLMChain, PromptTemplate
from langchain.llms import OpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.llms import LlamaCpp


## Setup

In [2]:
# Pydantic will be used for data validation

class Degree(BaseModel):
    Degree_type: str = Field(..., description="e.g. 'Bachelor of Arts', 'Professional', etc.")
    Degree_field: List[str] = Field(..., description="e.g. ['Political Science'], ['jd']")

class AuthorDegrees(BaseModel):
    id: int
    author_name: str
    degrees: List[Degree]

## Langchain

In [13]:
from langchain_ollama import OllamaLLM

llm = OllamaLLM(model="llama3.1:8b")

llm.invoke("The first man on the moon was ...")

'...Neil Armstrong! He stepped out of the lunar module Eagle and onto the moon\'s surface on July 20, 1969, famously declaring "That\'s one small step for man, one giant leap for mankind" as he became the first person to set foot on the moon.'

In [28]:
from llama_cpp import Llama
llm = Llama(
    model_path="../models/llama-2-7b-chat.Q5_K_M.gguf",
    n_gpu_layers=1,
    n_batch=512,
    n_ctx=2048,
    f16_kv=True,
    verbose=True,
)

llm.create_chat_completion(
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant that outputs in JSON.",
        },
        {"role": "user", "content": "Who won the world series in from 2020 to 2023?"},
    ],
    response_format={
        "type": "json_object",
        "schema": [{
            "type": "object",
            "properties": {"team_name": {"type": "string"}},
            "required": ["team_name"],
        }],
    },
    temperature=0.7,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../models/llama-2-7b-chat.Q5_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32      

{'id': 'chatcmpl-bd3d7445-e600-4f00-996e-ed81e3f76bac',
 'object': 'chat.completion',
 'created': 1747079648,
 'model': '../models/llama-2-7b-chat.Q5_K_M.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': '{"World Series Winners": [\n{\n"Year": 2020,\n"Winner": "Los Angeles Dodgers"\n},\n{\n"Year": 2021,\n"Winner": "Atlanta Braves"\n},\n{\n"Year": 2022,\n"Winner": "Houston Astros"\n},\n{\n"Year": 2023,\n"Winner": "New York Yankees"\n}\n]\n}'},
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 48, 'completion_tokens': 115, 'total_tokens': 163}}

## PydanticAI

In [None]:
import asyncio
from pydantic import BaseModel, Field
from typing import List
from pydantic_ai.models.openai import OpenAIModel
from pydantic_ai.providers.openai import OpenAIProvider
from pydantic_ai import Agent
import nest_asyncio
from httpx import AsyncClient
from pydantic_ai.providers.deepseek import DeepSeekProvider
import os
import dotenv
dotenv.load_dotenv()
nest_asyncio.apply()
class Degree(BaseModel):
    degree_type: str = Field(..., description="…")
    degree_field: List[str] = Field(..., description="…")

class AuthorDegrees(BaseModel):
    studies: List[Degree]


async def extract_degrees_async(
    id: int,
    author_name: str,
    sentences: str,
    model_name: str = "llama3.1:8b",
) -> AuthorDegrees:
    schema_json = AuthorDegrees.model_json_schema()
    prompt = f"""
Goal: Identify and categorize academic degrees from a Wikipedia text snippet. Respond only with JSON matching the AuthorDegrees schema (no extra text or markdown).:
Steps:
1. Scan for keywords (e.g., B.A., M.A., Bachelor, Master, Doctor).
2. Identify degree types (e.g., Bachelor of Arts, Juris Doctor, Ph.D.) and their fields (e.g., History, Law).
3. Use the long version of the degree name (e.g., Bachelor of Arts, Juris Doctor) and include the field of study if available.
4. Detect incomplete degrees (“dropped out”, “did not graduate”, and similar) and ignore.
5. If no completed degree is found, return a single entry with NONE
6. Output schema (exact JSON):

Examples:
Q1: "- He received his Bachelor of Arts from Harvard College in 1980.
    - He received his Juris Doctor from the University of California at Berkeley in 1986."
A1:[
  {{
    "studies": [
      {{
        "degree_type": "Bachelor of Arts",
        "degree_field": []
      }},
      {{
        "degree_type": "Professional",
        "degree_field": ["Juris Doctor"]
      }}
    ]
  }}
]
Q2: "## Academic career
- After attending University City High School in St. Louis, Missouri , Moyn earned his A.B.
- He continued his education, earning a Ph.D. from the University of California at Berkeley (2000) and his J.D.

## _infobox_education_
- Washington University in St. Louis ( BA ) University of California, Berkeley ( PhD ) Harvard University ( JD )
"
A2:[
  {{
    "studies": [
      {{
        "degree_type": "Bachelor of Arts",
        "degree_field": ["History", "French literature"]
      }},
      {{
        "degree_type": "Ph.D.",
        "degree_field": []
      }},
      {{
        "degree_type": "Professional",
        "degree_field": ["Juris Doctor"]
      }}
    ]
  }}
]
Q3: "## Early life
- Foster graduated from Falmouth Academy in 1994.
"
A:[
  {{
    "studies": [
      {{
        "degree_type": "NONE",
        "degree_field": []
      }}
    ]
  }}
]

Q:"{sentences}"
"""

    ollama_model = OpenAIModel(
        model_name=model_name,
        provider=OpenAIProvider(base_url='http://localhost:11434/v1'),
    )
    
    custom_http_client = AsyncClient(timeout=30)
    deepseek_model = OpenAIModel(
    'deepseek-chat',
    provider=DeepSeekProvider(
        api_key=os.getenv("DEEPSEEK_API_KEY"), http_client=custom_http_client
        ),
    )
    
    agent = Agent(deepseek_model, output_type=AuthorDegrees, max_result_retries=3)
    # This is an async call we can await
    result: AuthorDegrees = await agent.run(prompt)
    return result

# In a notebook or any async-capable REPL, you can just:
# >>> result = await extract_degrees_async(1, "Harry Litman", sample_sentences)
# >>> print(result.json(indent=2))

# Or, if you really want to wrap it for sync usage:
def extract_degrees(*args, **kwargs):
    return asyncio.get_event_loop().run_until_complete(
        extract_degrees_async(*args, **kwargs)
    )


if __name__ == "__main__":
    sample_sentences = (
        "- He received his Bachelor of Arts from Harvard College in 1980.\n"
        "- He received his Juris Doctor from the University of California at Berkeley in 1986."
    )
    result = await extract_degrees_async(1, "Harry Litman", sample_sentences)
    print(result.output)

AgentRunResult(output=AuthorDegrees(studies=[Degree(degree_type='Bachelor of Arts', degree_field=[]), Degree(degree_type='Professional', degree_field=['Juris Doctor'])]))


In [None]:
print(result.output.studies)


[Degree(degree_type='Bachelor of Arts', degree_field=[]), Degree(degree_type='Professional', degree_field=['Juris Doctor'])]
