# Use mistral-large-latest to get structured JSON output

This is a proof of concept that uses Mistral to extract information from a text and return it in a predefined JSON schema.

In [1]:
import getpass
import os

# Ask for the API key only when it is not already configured, so re-running
# this cell (or starting the notebook with the variable exported) neither
# prompts again nor overwrites an existing value.
if not os.environ.get("MISTRAL_API_KEY"):
    os.environ["MISTRAL_API_KEY"] = getpass.getpass()

from langchain_mistralai import ChatMistralAI

# Chat model shared by the extraction chains built later in the notebook.
llm = ChatMistralAI(model="mistral-large-latest")

 ········


# 1. set up pydantic pre-defined model

In [10]:
from pydantic import BaseModel, Field
import os
from pathlib import Path
import json

In [11]:
# Schema for one measurement reported in a paper, used as the item type of
# WeatherData.measurement.
# NOTE: documented with comments instead of a docstring on purpose; pydantic
# copies a class docstring into the generated JSON schema, which would change
# the schema text derived from this model.
class Measurement(BaseModel):
    # Name of the measured quantity.
    measurement_type: str = Field(
        description=(
            "The type of measurement "
            "(e.g., Relative Humidity, Specific Humidity)."
        ),
    )
    # How the quantity was computed or estimated.
    description: str = Field(
        description="Description of how the measurement is computed or estimated.",
    )
    # Formula strings; may be None.
    formula: list[str] | None = Field(description="Formula(s) used for computation, if applicable.")
    # Variable names appearing in the formulas; may be None.
    variables: list[str] | None = Field(description="Variables used in the formulas.")
    # Free-text remarks; may be None.
    notes: str | None = Field(
        description=(
            "Additional notes about the measurement, "
            "such as handling of missing data."
        ),
    )

# Schema for the weather-related information extracted from a paper.
# NOTE: documented with comments instead of a docstring on purpose; pydantic
# copies a class docstring into the generated JSON schema, which would change
# the schema text derived from this model.
class WeatherData(BaseModel):
    # When the weather data was collected or monitored; may be None.
    time_period: str | None = Field(
        description='Time in which weather data was collected, or time window in which environmental variables were monitored',
        examples=['Precipitation was collected during the monsoon season', 'monthly weather data were monitored between 2001 and 2012']
    )
    # Catch-all sentences about weather data; may be None.
    details: list[str] | None = Field(
        description='Any sentence from the paper describing any info related to weather data that cannot be captured well in the above fields',
        # `examples` must be a list; a bare string produces an invalid
        # "examples" entry in the JSON schema.
        examples=['Precipitations in Bangladesh were scarse during the 2005 fall']
    )
    # One entry per measurement/methodology described in the study.
    measurement: list[Measurement] = Field(
        description="List of different measurements and methodologies used in the study."
    )
    
#%%
# Schema for one methodology used in a paper.
# NOTE: documented with comments instead of a docstring on purpose; pydantic
# copies a class docstring into the generated JSON schema, which would change
# the schema text derived from this model.
class Methodology(BaseModel):
    # Short name of the methodology; may be None.
    name: str | None = Field(
        description='Name of the methodology used',
        examples=['t-test', 'signal processing']
    )
    # Text describing the methodology; may be None.
    description: str | None = Field(
        description='Literal text, Description of the methodology'
    )
    # Concrete techniques applied within the methodology; may be None.
    methods: list[str] | None = Field(
        description='Specific methods or techniques used within the methodology',
        # `examples` must be a list; a bare string produces an invalid
        # "examples" entry in the JSON schema.
        examples=['Fast fourier transformation']
    )
    # Why the methodology was used; may be None.
    goals: list[str] | None = Field(
        description='Goals or purposes of using the methodology'
    )

# Schema for the geographic scope of a study.
# NOTE: documented with comments instead of a docstring on purpose; pydantic
# copies a class docstring into the generated JSON schema, which would change
# the schema text derived from this model.
class StudyScope(BaseModel):
    # No trailing comma after Field(...): a comma would turn the default into
    # a 1-tuple wrapping the FieldInfo, dropping the description and examples.
    study_sites: list[str] | None = Field(
        description='sites studied and mentioned in the paper',
        examples=['The Gambia', 'Mali']
    )
    # Sentences describing the setting; may be None.
    description: list[str] | None = Field(
        description='Sites/setting description in sentences which not necessarily contain the name of the site',
        # `examples` must be a list, not a bare string.
        examples=['resource limited setting']
    )
    # Names of the actual locations.
    locations: list[str] = Field(
        description='Names of the actual locations'
    )


# Schema for how long the study ran.
# NOTE: documented with comments instead of a docstring on purpose; pydantic
# copies a class docstring into the generated JSON schema, which would change
# the schema text derived from this model.
class StudyDuration(BaseModel):
    # Duration as free text; may be None.
    duration: str | None = Field(
        description="The duration of the study described in the paper",
        examples=[
            "three years",
            "6 months",
            "the study ran over 10 years",
        ],
    )
    # Source sentences that mention a duration; may be None.
    details: list[str] | None = Field(
        description=(
            "Actual sentences whereby the duration is mentioned. "
            "It is possible in this field to have multiple durations mentioned"
        ),
    )

# Schema for the age profile of the study participants.
# NOTE: documented with comments instead of a docstring on purpose; pydantic
# copies a class docstring into the generated JSON schema, which would change
# the schema text derived from this model.
class ParticipantAgeGroup(BaseModel):
    # Age or age range as free text; may be None.
    age_range: str | None = Field(
        description=(
            "Age range of the population described in the paper. "
            "If there is no range, also age in months/years is fine"
        ),
        examples=[
            "36 months old",
            "children younger than five years of age",
        ],
    )
    # Descriptive sentences about the participants; may be None.
    details: list[str] | None = Field(
        description=(
            "Any descriptive sentence that details who the participants were, "
            "how old they were and any other information that relates to them"
        ),
        examples=[
            "Children 0–59 months of age with moderate-to-severe diarrhea (MSD)",
            "Only the first 8–9 children in each age strata (0–11 months, 12–23 months, 24–59 months) were recruited",
        ],
    )




# Top-level schema: bibliographic metadata of a paper plus the nested
# scope / methodology / duration / participants / weather sections.
# NOTE: documented with comments instead of a docstring on purpose; pydantic
# copies a class docstring into the generated JSON schema, which would change
# the schema text derived from this model.
class Bibliography(BaseModel):
    article_type: str | None = Field(
        description='Article type', 
        examples=['Research article', ' meta-analysis', 'review', 'opinion paper']
    )
    study_type: str | None = Field(
        description='Type of study',
        examples=['retrospective study']
    )
    title: str = Field(
        description='Title of the paper'
    )
    # Nested sections. The single-model fields previously defaulted to `[]`,
    # which is not a valid value for their declared type; an absent section is
    # now represented by None (still falsy, like the old empty list).
    study_scope: StudyScope | None = None
    # A list default is fine here: pydantic copies mutable defaults per instance.
    methodology: list[Methodology] = []
    study_duration: StudyDuration | None = None
    participant_age_group: ParticipantAgeGroup | None = None
    weather_data: WeatherData | None = None
    data_collection: str = Field(
        description='When data was collected?',
        examples=['during 2008 to 2011']
    )
    authors: str = Field(
        description='Authors of the paper. Usually listed below the title in the first page of the paper',
        examples=['Roose, A., Washington, D., Aniston, J.A.']
    )
    affiliation: str = Field(
        description='Affiliations of the authors listed in the authors field',
        examples=['Institute for Disease Modeling, Bellevue, Washington', 'International School for Advanced Studies, Trieste, Italy']
    )
    citation: str | None = Field(
        description='Citation of the paper which typically includes the first author surname followed by et al., and the year of publication along with the name of the journal',
        examples=['Roose et al., 2008, BMJ']
    )
    corresponding_author: str = Field(
        description='Author that has made available their email for further contact',
        examples=['r.roosevelt@gmail.com']
    )
    # Given a description like every other field so the model is told what to extract.
    doi: str | None = Field(
        description='Digital Object Identifier (DOI) of the paper'
    )
    date: str = Field(
        description='Date in which the paper has been published comprising typically day, month and year',
        examples=['06-08-2013']
    )
    github_repo: str | None = Field(
        description='Github repository where code and data are available.',
        # `examples` must be a list, not a bare string.
        examples=['https://github.com/papermanuscript/main']
    )
    journal: str = Field(
        description='Name of the journal in which the paper has been published, tipically listed in the front page, nearby the title',
        # `examples` must be a list, not a bare string.
        examples=['PLoS Negl Trop Dis']
    )

# 2. parse the model with pydantic

In [12]:
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate

In [13]:
# Output parser bound to the Bibliography schema; its format instructions are
# injected into the prompt and it is the last step of the first chain.
parser = PydanticOutputParser(pydantic_object=Bibliography)

In [14]:
# Two-message chat prompt: a system instruction that carries the parser's
# format instructions, followed by the user's query.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Answer the user query. Wrap the output in `json` tags\n{format_instructions}"),
        ("human", "{query}"),
    ]
)
# Pre-fill {format_instructions} so that callers only have to supply {query}.
prompt = prompt.partial(format_instructions=parser.get_format_instructions())



In [15]:
# Query sent to the model: a short instruction followed by the quoted paper
# excerpt. The literal is split along the excerpt's original line wraps; the
# double/triple spaces at the end of each piece are part of the text.
prompt_query = (
    "You are provided with an excerpt of a text: "
    "'Weather and notified Campylobacter infections in temperate  "
    "and sub-tropical regions of Australia  Abstract  Background   "
    "The relationship between weather and food-borne diseases has been of great concern  "
    "recently. However, the impact of weather variations on food-borne disease may vary in  "
    "different areas with various geographic, weather and demographic characteristics. This  "
    "study was designed to quantify the relationship between weather variables and  "
    "Campylobacter infections in two Australian cities with different local climatic conditions.  "
    "Methods   "
    "An ecological-epidemiological study was conducted, using weekly disease surveillance  "
    "data and meteorological data, over the period 1990-2005, to quantify the relationship  "
    "between maximum and minimum temperature, rainfall, relative humidity and  "
    "notifications of Campylobacter infections in Adelaide, with a temperate Mediterranean  "
    "climate, and Brisbane, with a sub-tropical climate. Spearman correlation and time-series  "
    "adjusted Poisson regression analyses were performed taking into account seasonality, lag  "
    "effects and long-term trends.   Results   "
    "The results indicate that weekly maximum and minimum temperatures were inversely  "
    "associated with the weekly number of cases in Adelaide, but positively correlated with  "
    "the number of cases in Brisbane, with relevant lagged effects. The effects of rainfall and  "
    "relative humidity on Campylobacter infection rates varied in the two cities.'"
)

In [16]:
# Alias kept on purpose: the name `query` matches the original example this
# notebook was adapted from, which makes that example easy to find again by
# searching for the code.
query = prompt_query

To get the output we can build a 'chain' that wraps the model's answer in a more consistent structure, so that it is easier to save. Unfortunately, the output (while still JSON data) is not very readable, so we need a function that parses it into properly indented JSON.

In [17]:
# Compose prompt -> model -> pydantic parser into one runnable pipeline.
chain = prompt | llm | parser

# Run the extraction on the excerpt; as the last expression of the cell, the
# returned Bibliography object is what the notebook displays.
chain.invoke({"query": prompt_query})

Bibliography(article_type='Research article', study_type='ecological-epidemiological study', title='Weather and notified Campylobacter infections in temperate and sub-tropical regions of Australia', study_scope=StudyScope(study_sites=['Adelaide', 'Brisbane'], description=['Adelaide, with a temperate Mediterranean climate', 'Brisbane, with a sub-tropical climate'], locations=['Adelaide', 'Brisbane']), methodology=[Methodology(name='Spearman correlation and time-series adjusted Poisson regression analyses', description='An ecological-epidemiological study was conducted, using weekly disease surveillance data and meteorological data, over the period 1990-2005, to quantify the relationship between maximum and minimum temperature, rainfall, relative humidity and notifications of Campylobacter infections.', methods=['Spearman correlation', 'time-series adjusted Poisson regression analyses'], goals=['Quantify the relationship between weather variables and Campylobacter infections', 'Account f

# 3. parse output into a human-readable json

In [18]:
from langchain_core.messages import AIMessage
from typing import List
import re

def extract_json(message: AIMessage) -> List[dict]:
    """Parse every ```json fenced block found in a chat message.

    Parameters:
        message (AIMessage): Model reply; its ``content`` is the text scanned
            for fenced JSON blocks.

    Returns:
        list: One decoded JSON value per fenced block, in order of appearance.
            Empty when the text contains no such block.

    Raises:
        ValueError: If any fenced block is not valid JSON. The original
            ``json.JSONDecodeError`` is chained as the cause.
    """
    text = message.content
    # Non-greedy match so that each fenced block is captured separately.
    pattern = r"```json(.*?)```"

    # re.DOTALL lets the block body span multiple lines.
    matches = re.findall(pattern, text, re.DOTALL)

    try:
        return [json.loads(match.strip()) for match in matches]
    except json.JSONDecodeError as err:
        # Chain the cause so the position of the JSON error is not lost.
        raise ValueError(f"Failed to parse: {message}") from err

In [19]:
# Same prompt and model as before, but the reply is post-processed by
# extract_json instead of the pydantic parser, yielding plain dicts/lists.
chain2 = prompt | llm | extract_json

# Run the extraction; as the last expression of the cell, the result is displayed.
chain2.invoke({"query": prompt_query})

[{'article_type': 'Research article',
  'study_type': 'ecological-epidemiological study',
  'title': 'Weather and notified Campylobacter infections in temperate and sub-tropical regions of Australia',
  'study_scope': {'study_sites': ['Adelaide', 'Brisbane'],
   'description': ['Adelaide, with a temperate Mediterranean climate',
    'Brisbane, with a sub-tropical climate'],
   'locations': ['Adelaide', 'Brisbane']},
  'methodology': [{'name': 'Spearman correlation',
    'description': 'Spearman correlation analysis was performed.',
    'methods': ['Spearman correlation'],
    'goals': ['Quantify the relationship between weather variables and Campylobacter infections']},
   {'name': 'Time-series adjusted Poisson regression',
    'description': 'Time-series adjusted Poisson regression analysis was performed.',
    'methods': ['Time-series adjusted Poisson regression'],
    'goals': ['Quantify the relationship between weather variables and Campylobacter infections']}],
  'study_duration':