# Data exploration

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
from pprint import pprint

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Get files from the dataset

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("allen-institute-for-ai/CORD-19-research-challenge")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/CORD-19-research-challenge


In [3]:
%%time
file_dir = '/kaggle/input/CORD-19-research-challenge/document_parses/pdf_json/'
import os
filenames = os.listdir(file_dir)
print("Number of articles retrieved from biorxiv:", len(filenames))

Number of articles retrieved from biorxiv: 401214
CPU times: user 127 ms, sys: 352 ms, total: 480 ms
Wall time: 39.1 s


In [4]:
# select randomly 1000 article
sample_file = np.random.choice(filenames, size=1000, replace=False,)
len(sample_file)

1000

In [5]:
%%time
all_files = []

for filename in sample_file:
    filename = file_dir + filename
    file = open(filename, 'rb')
    # TODO here would be nice to store the file locally in 'kaggle/working/'
    all_files.append(json.load(file))

CPU times: user 1.79 s, sys: 471 ms, total: 2.26 s
Wall time: 12.1 s


## Get previously locally stored sample

In [None]:
#TODO load sample from 'kaggle/working'

## Extract info

In [22]:
file1 = all_files[45]
print("Dictionary keys:", all_files[0].keys())

Dictionary keys: dict_keys(['paper_id', 'metadata', 'abstract', 'body_text', 'bib_entries', 'ref_entries', 'back_matter'])


In [17]:
texts = [(di['section'], di['text']) for di in file1['body_text']]
texts_di = {di['section']: "" for di in file1['body_text']}
for section, text in texts:
    texts_di[section] += text

#pprint(list(texts_di.keys()))

In [13]:
pprint(file1['abstract'])

[{'cite_spans': [],
  'ref_spans': [],
  'section': 'Abstract',
  'text': 'The COVID -19 pandemic posed serious challenge for securing public '
          'health worldwide. Public health preparedness and restrictions put '
          'in place impacted many aspects of human life, including '
          'recreational activities and access to outdoor recreational '
          'destinations. Green spaces have become one of the few sources of '
          'resilience during the coronavirus crisis due to their restorative '
          'effects on psychophysical health and community well-being. The aim '
          'of this study is to analyse the impact of the COVID -19 pandemic on '
          'forest visitation. The results are based upon long-term visitor '
          'data acquired via pyroelectric sensors (Eco-Counter) in three '
          'forest districts located in Poland (Browsk, Gdansk & Kozienice '
          'Forest Districts). The analysis covers the period between '
          '01.01.20

In [6]:
def process_article(file):
    body = ""
    
    body += file['metadata']['title']
    body += "\n\n"
    if len(file['abstract']) > 0:
        body += file['abstract'][0]['section']
        body += "\n\n"
        body += file['abstract'][0]['text']
        body += "\n\n"
    
    texts = [(di['section'], di['text']) for di in file['body_text']]
    texts_di = {di['section']: "" for di in file['body_text']}
    for section, text in texts:
        texts_di[section] += text
    
    for section, text in texts_di.items():
        body += section
        body += "\n\n"
        body += text
        body += "\n\n"

    return body
    


In [7]:
docs = []
for f in all_files:
    try:
        docs.append(process_article(f))
    except:
        pprint(f)
        exit()


In [8]:
len(docs)
print(docs[3])

Leadership lessons from administrators, faculty, and students during the COVID-19 pandemic

Abstract

Introduction: Due to the largescale scope of the COVID-19 pandemic, strain on the higher education system in the United States has been extraordinary. Yet, with any crisis, there is the opportunity to learn, grow, and develop new knowledge and strategies to benefit educational programs moving forward. The purpose of this study is to describe the leadership lessons learned by academic pharmacy during the COVID-19 pandemic from the perspective of administrators, faculty, and students.

Introduction

The COVID-19 pandemic has been one of the largest health crises in modern history. The pandemic, caused by a novel coronavirus named SARS-CoV-2, was first identified in Wuhan, China in December 2019. 1 The virus is spread through respiratory droplets that land in the nose or mouth of nearby people, which are then inhaled into the lungs. 2 The spread of the virus among people in close contact 

# Intro to RAG with LLM

In [1]:
# Installing packages
!pip install transformers

!pip install sentence_transformers
!pip install torch
!pip install faiss-cpu

!pip install -q -U immutabledict sentencepiece 



In [2]:
!pip install git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3 




Collecting git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3
  Cloning https://github.com/huggingface/transformers (to revision v4.49.0-Gemma-3) to /tmp/pip-req-build-8j950ch1
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-8j950ch1
  Running command git checkout -q 1c0f782fe5f983727ff245c4c1b3906f9b99eec2
  Resolved https://github.com/huggingface/transformers to commit 1c0f782fe5f983727ff245c4c1b3906f9b99eec2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [5]:
import kagglehub
import torch
from transformers.models.gemma3 import Gemma3ForConditionalGeneration, Gemma3Processor


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
GEMMA_PATH = kagglehub.model_download("google/gemma-3/transformers/gemma-3-4b-it")
processor = Gemma3Processor.from_pretrained(GEMMA_PATH, use_fast=True)
model = Gemma3ForConditionalGeneration.from_pretrained(GEMMA_PATH, torch_dtype=torch.float16).to(device)
print(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Gemma3ForConditionalGeneration(
  (vision_tower): SiglipVisionModel(
    (vision_model): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(4096, 1152)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-26): 27 x SiglipEncoderLayer(
            (self_attn): SiglipSdpaAttention(
              (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
            )
            (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (mlp): SiglipMLP(
              (activation_fn): PytorchGELUTanh()
              (fc1): Linear(in_features=1

In [None]:
prompt = """<start_of_turn>user
Write a poem about the Kraken<end_of_turn>
<start_of_turn>model"""
input_ids = processor(text=prompt, return_tensors="pt").to(device)
outputs = model.generate(**input_ids, max_new_tokens=512)
text = processor.batch_decode(
    outputs,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)
print(text[0])

In [15]:
# Importing libraries

# Importing system
import faiss

# Importing Libraries LLM
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM


ImportError: cannot import name 'Gemma3ForCausalLM' from 'gemma.model' (/kaggle/working/gemma_pytorch/gemma/model.py)

In [None]:
model

# Fresh approach with hugging face 

Following [https://huggingface.co/docs/transformers/en/tasks/question_answering](http://) 

In [9]:
!pip install transformers torch pytesseract
!pip install sentencepiece sacremoses

[0mCollecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [49]:
#from huggingface_hub import notebook_login
#notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
from transformers import pipeline