<a href="https://colab.research.google.com/github/Gjeffroy/Mistral7b_scientific_article/blob/main/Mistral_7b_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! python -m pip install --upgrade pip
! pip install langchain
! pip install unstructured
! pip install "unstructured[pdf]"
! pip install pyhere
! pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
! pip install -q peft accelerate bitsandbytes safetensors sentencepiece
! pip install transformers==4.35
! pip install deep-translator

In [None]:
import os
from langchain.document_loaders import UnstructuredFileLoader
from pyhere import here
from pathlib import Path
from datetime import datetime
from deep_translator import GoogleTranslator


def extract_text_with_langchain_pdf(pdf_file):
    """Return the text content of ``pdf_file`` as a single string.

    The file is handed to ``UnstructuredFileLoader``; the ``page_content`` of
    every document it loads is concatenated, separated by a newline.

    :param pdf_file: Path of the file to load.
    :return: The newline-joined ``page_content`` of all loaded documents.
    """
    page_texts = [
        document.page_content
        for document in UnstructuredFileLoader(pdf_file).load()
    ]
    return '\n'.join(page_texts)

# Input locations, expressed as relative paths.
my_drive = Path("drive") / "MyDrive"
pdf_folder = my_drive / "pdf"
# One path per entry of the input folder, in sorted order.
pdf_files = sorted(pdf_folder / entry for entry in os.listdir(pdf_folder))

In [None]:
# Extract the text of every listed file, keyed by its file name.
extracts = {}
for pdf_file in pdf_files:
    # Path.name is the final path component whatever the platform's
    # separator is, unlike splitting the string form on "/".
    pdf_name = pdf_file.name
    print(pdf_name)
    extracts[pdf_name] = extract_text_with_langchain_pdf(pdf_file)

Document 1.pdf


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Document 2.pdf
Document 7.pdf
Document 4.pdf
Document 10.pdf
Document 9.pdf
Document 3.pdf
Document 6.pdf
Document 8.pdf
Document 5.pdf
Document 11.pdf


In [None]:
import torch
# Ask PyTorch whether a CUDA device is usable; the result is the cell output.
torch.cuda.is_available()

True

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Model identifier passed to both the model loader and the tokenizer loader.
model_name = '01jonathanf/Mistral-7B-Instruct-v0.2-sharded2GB'

def load_quantized_model(model_name: str):
    """
    Load a causal language model quantized to 4 bits.

    The quantization settings (NF4 quantization type, double quantization,
    bfloat16 compute dtype) are all carried by one ``BitsAndBytesConfig``.

    :param model_name: Name or path of the model to be loaded.
    :return: Loaded quantized model.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # 4-bit loading is requested through quantization_config only. Passing
    # load_in_4bit=True as an extra keyword duplicates the config and is
    # rejected by newer transformers releases when a config is also given.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config
    )

    return model

def initialize_tokenizer(model_name: str):
    """
    Build the tokenizer that goes with ``model_name``.

    After loading, the beginning-of-sentence token id is forced to 1.

    :param model_name: Name or path of the model whose tokenizer is loaded.
    :return: The configured tokenizer.
    """
    bos_id = 1  # beginning-of-sentence token id applied to the tokenizer
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_tokenizer.bos_token_id = bos_id
    return loaded_tokenizer


# Instantiate the quantized model and its tokenizer once; process_request
# reads both as notebook-level globals.
model = load_quantized_model(model_name)
tokenizer = initialize_tokenizer(model_name)

Downloading config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

In [None]:
import json
import os
import datetime
from pathlib import Path

def create_prompt(preprompt, content):
  """Assemble the summarisation prompt sent to the model.

  The instructions and the article are wrapped in an ``[INST] ... [/INST]``
  block, with the article enclosed in ``<article>`` tags.

  :param preprompt: Instruction text placed before the article.
  :param content: Article text inserted between the ``<article>`` tags.
  :return: The assembled prompt string.
  """
  return f"""[INST]

  {preprompt}

  scientific paper content : <article>{content}</article>

  [/INST]"""

def create_prompt_translate(preprompt, content):
  """Assemble a translation prompt.

  The instruction, followed by a colon, and the text to process are wrapped
  in an ``[INST] ... [/INST]`` block.

  :param preprompt: Instruction text; a ``:`` is appended right after it.
  :param content: Text placed after the instruction.
  :return: The assembled prompt string.
  """
  return f"""[INST]

  {preprompt}:

  {content}

  [/INST]"""

def process_request(prompt, temperature = 1):
  """Run one sampled generation for ``prompt`` and return the decoded text.

  Uses the notebook-level ``tokenizer`` and ``model``. The CUDA cache is
  emptied after each call.

  :param prompt: Fully assembled prompt string.
  :param temperature: Sampling temperature forwarded to ``model.generate``.
  :return: Decoded text of the first generated sequence. It is decoded with
      the tokenizer's defaults, so it is not post-processed here; the caller
      in this notebook keeps only what follows the last ``[/INST]``.
  """
  # add_special_tokens=False: the prompt is tokenized exactly as written.
  encoded = tokenizer(prompt,
                      return_tensors="pt",
                      add_special_tokens=False)
  # Put the input tensors on the model's device so generate() never has to
  # work with inputs and weights living on different devices.
  model_input = encoded.to(model.device)
  generated_ids = model.generate(**model_input,
                                 temperature = temperature,
                                 max_new_tokens=5000,
                                 do_sample=True)
  decoded = tokenizer.batch_decode(generated_ids)
  torch.cuda.empty_cache()

  return decoded[0]

In [None]:
# preprompts = {
#     'kid' : """
#     You are a chatbot aiming to make people laugh while making accessible to a
#     large audience complexe scientific papers. Read the content of the article
#     delimited by triple ticks, which is an extract of a scientific publication
#     in pdf format and propose a funny summary that explains it in simple terms.""",
#     'normal' : """
#     You are a chatbot aiming to make accessible to a
#     large audience complexe scientific papers. Read the content of the article
#     delimited by triple tick, which is an extract of a scientific publication
#     in pdf format and propose a summary that explains it in simple terms.
#     Follow these steps to awnser the problem :
#     1. read the scientific paper content
#     2. provide an awnser following the format below

#     ---
#     Format of the awnser:
#     Title : <title of the article>
#     Journal : <name of the journal>
#     Authors : <authors of the paper>
#     Summary : <summary of the article in 100 words>
#     Detailled summary : <summary of the article in 300 words>
#     Fields : <a list of application fields separeted with comma>
#     outputs : <a numbered list of the key takeaways of the article>
#     Impact on normal people's life : <how this discovery may impact normal people in daily life>
#     ---
#     """,
#     'joke' : """
#     You are a comedian who write jokes about scientific paper.
#     The article is delimited by triple ticks.
#     follow the process below :
#     1. Summarise the article in 100 words max
#     2. Invent a joke that use some of the important key words
#     3. return your awnser as follow:

#     Joke : <Joke>
#     """,
#     'teacher' : """you are a teacher that attempt to explain a complexe scientific paper to a class of middle school student.
#     The content of the paper is below delimited by triple tick. Always start you awnser by 'Hello class! Today, we will talk about a scientific paper entitled ...' """,
#     'teacher_subconcept' : """you are a teacher that attempt to explain a complexe scientific paper to a class of middle school student.
#     The content of the paper is below delimited by triple tick. Always start you awnser by 'Hello class! Today, we will talk about a scientific paper entitled ...'.
#     Each time the explanation called upon a complex word, you explain it within brackets. Go Go go !!! """,
#      'teacher_subconcept_list' : """you are a teacher that attempt to explain a complexe scientific paper to a class of middle school student.
#     The content of the paper is below delimited by triple tick. Always start you awnser by 'Hello class! Today, we will talk about a scientific paper entitled ...'.
#     Each time the explanation called upon a complex word or concept, you developp it as a new paragraph. To finish, list all the complex concepts under the explanation. Go Go go !!! """,
#      'key_concept' : """Extract all the key concept of the scientific paper provided below delimited by triple tick. Go Go go !!! """

#        }

In [None]:
# System-style instructions, keyed by prompt version. The selected entry is
# placed in front of the article text by create_prompt.
# The closing tag is written </article>: in a normal (non-raw) string the
# sequence "\a" is the BEL control character, so "<\article>" would not
# contain the tag text at all.
preprompts = {
    "vulgarisateur": """
    Tu es un vulgarisateur scientifique qui lit des articles scientifiques et propose un résumé compréhensible par le plus grand nombre.
    Expliques les découvertes principale de l'article pour des enfants.
    Ta réponse doit être fun et rigolote.
    N'hésites pas à utiliser des images ou des métaphores pour expliquer les concepts compliqués de l'article.
    L'article scientifique est delimité par les balises <article>...</article>.

    Pour réaliser cette tache, suit le processus suivant :
    1. Lit l'article en anglais ci dessous qui commence par la balise <article> et finis par la balise </article>
    2. Nettoye l'article (il s'agit d'un extrait de PDF)
    3. Isole les concepts clefs
    4. Explique les concepts clefs pour qu'un enfant puisse les comprendre
    5. Traduit la réponse en français. Cette etape obligatoire. Tu ne dois pas t'exprimer en Anglais sous aucun pretexte.""",
    "vulgaristeur_en": """
      You are a science communicator who reads scientific articles
      Your goal is to explain the main discoveries of the article so that children may understand.
      Don't hesitate to use metaphors to explain the complicated concepts of the article.
      The scientific article is delimited by the tags <article>...</article>.

      To complete this task, follow the following process:

      1.Read the article in English below, which starts with the <article> tag and ends with the </article> tag.
      2.Clean the article (it is an extract from a PDF).
      3.Isolate the key concepts.
      4.Explain the key concepts so that a child can understand them.
      5.Provide your awnser in english within a 100 tokens max and delimited by the tag <anwser>.
    """}

In [None]:
# Translation instructions keyed by name; the text is meant to precede the
# passage to translate (see create_prompt_translate).
traduction_prompts = {"traduction_fr" : "Translate the following text in French"}

In [None]:
from collections import defaultdict
import os
from datetime import datetime

# Run every selected prompt version over every extracted document, then
# translate each model answer, recording both texts and both durations.
pdf_names = list(extracts.keys())
versions = ['vulgaristeur_en']
# NOTE(review): traduction_prompt is assigned here but not used in this cell;
# the translation below goes through GoogleTranslator instead.
traduction_prompt = traduction_prompts["traduction_fr"]
# Character budget shared by the preprompt and the article excerpt.
max_length = 25000

# run chatbot
# Nested mapping: version -> pdf name (or 'meta') -> field -> value.
responses = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
responses['meta']['max_length'] = max_length
for version in versions:
  preprompt = preprompts[version]

  # store prompt
  responses[version]['meta']['preprompt'] = preprompt
  for pdf_name in pdf_names:
    print(pdf_name)
    # Truncate the article so that preprompt + excerpt stay within max_length characters.
    content = extracts[pdf_name][0:(max_length - len(preprompt))]

    # summarisation step
    tic = datetime.now()
    prompt = create_prompt(preprompt, content)
    # Keep only the text that follows the last '[/INST]' marker of the decoded output.
    response = process_request(prompt).split('[/INST]')[-1]
    tac = datetime.now()
    responses[version][pdf_name]['response_time'] = tac - tic

    # translation step
    # NOTE(review): the whole answer is sent in one translate() call; confirm
    # the translator accepts texts of this length.
    tic = datetime.now()
    translated = GoogleTranslator(source='en', target='fr').translate(response)
    tac = datetime.now()
    responses[version][pdf_name]['translation_time'] = tac- tic

    # store responses and content
    responses[version][pdf_name]['response'] = response
    responses[version][pdf_name]['translation'] = translated
    responses[version][pdf_name]['content'] = content

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Document 1.pdf


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Document 2.pdf


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Document 7.pdf


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Document 4.pdf


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Document 10.pdf


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Document 9.pdf


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Document 3.pdf


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Document 6.pdf


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Document 8.pdf


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Document 5.pdf


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Document 11.pdf


In [None]:
import markdown

def format_results(responses):
  """Render the collected responses as an HTML string.

  Every version except 'meta' gets a heading; inside it, every entry except
  'meta' gets its own heading followed by the original response and its
  translation, each preceded by its processing time. Every visited key is
  printed. The assembled text is passed through ``markdown.markdown``.

  :param responses: Nested mapping version -> pdf name -> fields
      ('response', 'response_time', 'translation', 'translation_time').
  :return: The string returned by ``markdown.markdown``.
  """
  parts = []
  for version in responses:
    print(version)
    if version == 'meta':
      continue

    parts.append('<h1> ' + version + '</h1><br><br>')
    per_pdf = responses[version]
    for pdf_name in per_pdf:
      print(pdf_name)
      if pdf_name == 'meta':
        continue

      entry = per_pdf[pdf_name]
      parts.append('<h2> ' + pdf_name + '</h2><br><br>')

      # original model answer
      parts.append('<h3> ' + 'reponse orginale :' + '</h3><br><br>')
      parts.append('<h4> ' + 'temps de process :' + str(entry['response_time']) + '</h4><br><br>')
      parts.append(entry['response'] + '<br><br>')

      # translated answer
      parts.append('<h3> ' + 'reponse traduite :' + '</h3><br><br>')
      parts.append('<h4> ' + 'temps de process :' + str(entry['translation_time']) + '</h4><br><br>')
      parts.append(entry['translation'] + '<br><br>')

  return markdown.markdown(''.join(parts))

# folder to save result: one time-stamped directory per run
mydir = Path(my_drive, "responses", datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
os.makedirs(mydir)

html_string = format_results(responses)
# Write explicitly as UTF-8: the report contains accented (French) text and
# the platform's default encoding is not guaranteed to be able to encode it.
with open(Path(mydir,'responses.html'), 'w', encoding='utf-8') as f:
    f.write(html_string)

meta
vulgaristeur_en
meta
Document 1.pdf
Document 2.pdf
Document 7.pdf
Document 4.pdf
Document 10.pdf
Document 9.pdf
Document 3.pdf
Document 6.pdf
Document 8.pdf
Document 5.pdf
Document 11.pdf


In [None]:
# Release cached CUDA memory held by PyTorch's allocator at the end of the run.
torch.cuda.empty_cache()