In [None]:
!pip install bs4
!pip install pandas
!pip install torch
!pip install transformers

In [ ]:
import requests
from bs4 import BeautifulSoup
import csv
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import re
import torch

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the tokenizer and the seq2seq model from the same pretrained
# checkpoint so that token ids produced by one are valid for the other.
FLAN_T5_CHECKPOINT = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(FLAN_T5_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(FLAN_T5_CHECKPOINT)


In [ ]:

# Listing page of published NICE technology appraisal guidance.
# NOTE(review): `ps=9999` presumably requests a page size large enough to
# return every entry in a single response -- confirm against the site.
url='https://www.nice.org.uk/guidance/published?ndt=Guidance&ngt=Technology%20appraisal%20guidance&ps=9999'

# Browser-like request headers sent with the listing request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'DNT': '1',  # Do Not Track Request Header
    'Connection': 'keep-alive'
}

# Make the GET request with the specified headers. A timeout prevents the
# cell from hanging forever on a stalled connection, and raise_for_status()
# stops here on an HTTP error instead of writing a header-only CSV and
# reporting success.
response = requests.get(url, headers=headers, timeout=60)
response.raise_for_status()
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

rows = soup.find_all('tr')

with open('evidence_NICE_lists.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    # Write the CSV header
    csvwriter.writerow(['Title', 'Link', 'TA Number', 'Publication Date', 'Last Reviewed'])

    # Iterate over each row in the table
    for row in rows:
        cols = row.find_all('td')
        # Rows without <td> cells (e.g. header rows) and rows with fewer than
        # the four expected cells are skipped rather than raising IndexError.
        if len(cols) < 4:
            continue

        a_tag = cols[0].find('a')
        title = a_tag.text.strip() if a_tag else ''
        # .get() tolerates an <a> without an href attribute.
        link = a_tag.get('href', '') if a_tag else ''
        ta_number = cols[1].text.strip()
        publication_date = cols[2].text.strip()
        last_reviewed = cols[3].text.strip()

        # Write the row data to the CSV file
        csvwriter.writerow([title, link, ta_number, publication_date, last_reviewed])

print("CSV file with links has been created successfully.")


In [ ]:

def get_url_data(extension, timeout=30):
    """Fetch a page from www.nice.org.uk and return its HTML text.

    Args:
        extension: Path appended directly to ``https://www.nice.org.uk``
            (callers pass values starting with ``/``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None``.

    Raises:
        requests.RequestException: Connection errors and timeouts are not
            caught here and propagate to the caller.
    """
    url = f'''https://www.nice.org.uk{extension}'''

    # Browser-like headers sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'DNT': '1',  # Do Not Track Request Header
        'Connection': 'keep-alive'
    }

    # Make the GET request with the specified headers
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text
def get_guidance_menu_links(soup):
    """Collect ``[link text, href]`` pairs from the page's stacked nav menu.

    Args:
        soup: Parsed page; searched for ``<nav class="stacked-nav">``.

    Returns:
        A list of two-item lists, one per ``<a>`` inside the nav element,
        excluding the first anchor. An empty list when the nav element is
        not present.
    """
    nav = soup.find('nav', class_='stacked-nav')
    if not nav:
        return []

    anchors = nav.find_all('a')
    # The first anchor in the menu is deliberately skipped.
    return [
        [anchor.get_text(strip=True), anchor.get('href')]
        for anchor in anchors[1:]
    ]

def classify_nice_guidance(text):
    """Categorise a recommendation paragraph by the key phrases it contains.

    Args:
        text: Recommendation text to inspect.

    Returns:
        A ``(category, CDF)`` tuple. ``category`` is one of
        ``"Not Recommended"``, ``"Optimised"``, ``"Recommended"`` or
        ``"Uncategorized"``; ``CDF`` is True when the text mentions the
        Cancer Drugs Fund (matched case-insensitively).
    """
    # Only the Cancer Drugs Fund check ignores case; the category phrases
    # below are matched case-sensitively.
    CDF = 'cancer drugs fund' in text.lower()

    # Order matters: the first matching rule wins.
    if "is not recommended" in text:
        category = "Not Recommended"
    elif "recommended as an option for" in text and "only if" in text:
        category = "Optimised"
    elif "recommended, within its marketing authorisation" in text or "recommended" in text:
        # The first phrase is a special case of the second; either one
        # yields the same category.
        category = "Recommended"
    else:
        category = "Uncategorized"

    return category, CDF

def get_llm_output(prompt):
    """Run *prompt* through the module-level model and return the decoded text.

    Uses the module-level ``tokenizer`` and ``model``. At most 128 new tokens
    are generated, and special tokens are dropped from the decoded string.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    generated = model.generate(input_ids, max_new_tokens=128)
    # Only the first generated sequence is decoded.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Question text (including the trailing "answer:" cue) for every supported
# question type. These strings are sent to the model verbatim, so their exact
# wording and whitespace are part of the program's behaviour.
_LLM_QUESTION_SUFFIXES = {
    'outcome_tech': "Which tecnology is recommended? answer: ",
    'outcome_dis': "For treating which condition? answer: ",
    'outcome_text': "What are the constraints? answer:  ",
    'reason_text': "what are the reasons? answer: ",
    'initial_auth': "For which condition it was initial authorized? answer: ",
    'initial_condition': "Under what conditions is the treatment initially authorized? answer: ",
    'price_text': "what is the price for treatment? answer: ",
}


def LLM_questions(question_type,MEDICAL_DOCUMENT):
    """Ask the language model one predefined question about a document.

    Args:
        question_type: Key selecting the question; one of the keys of
            ``_LLM_QUESTION_SUFFIXES``.
        MEDICAL_DOCUMENT: Text placed in the prompt as the context.

    Returns:
        The string returned by ``get_llm_output`` for the built prompt.

    Raises:
        ValueError: If ``question_type`` is not a supported key. (Previously
            this case failed with an UnboundLocalError on the return line.)
    """
    try:
        question = _LLM_QUESTION_SUFFIXES[question_type]
    except KeyError:
        supported = ", ".join(sorted(_LLM_QUESTION_SUFFIXES))
        raise ValueError(
            f"Unknown question_type {question_type!r}; expected one of: {supported}"
        ) from None

    prompt = f"Context:{MEDICAL_DOCUMENT} \n Question: {question}"
    return get_llm_output(prompt)

def get_recommendation_reason(div_soup):
    """Extract the recommendation, its category and the committee's reasons.

    Args:
        div_soup: Parsed element holding the recommendations section, or
            ``None`` when the caller could not locate it.

    Returns:
        An 8-tuple ``(recommendation_text, recommendation_cat, CDF,
        reason_text, treatment, condition, constraints, reasons_llm)``.
        Every string is ``''`` and ``CDF`` is ``False`` when ``div_soup`` is
        ``None`` or contains no nested ``<div>``.
    """
    recommendation_text = ''
    recommendation_cat = ''
    CDF = False
    reason_text = ''
    treatment, condition, constraints, reasons_llm = '', '', '', ''

    # The caller obtains div_soup from soup.find(), which yields None when
    # nothing matches; treat that the same as "no nested <div>".
    inner_div = div_soup.find('div') if div_soup is not None else None

    if inner_div is not None:
        recommendation_text = inner_div.get_text(strip=True)
        treatment = LLM_questions('outcome_tech', recommendation_text)
        condition = LLM_questions('outcome_dis', recommendation_text)
        constraints = LLM_questions('outcome_text', recommendation_text)
        recommendation_cat, CDF = classify_nice_guidance(recommendation_text)

        strong_tag = div_soup.find('strong', string="Why the committee made these recommendations")
        if strong_tag:
            # NOTE(review): find_all_next() walks everything after the
            # heading in the document, which may extend beyond div_soup --
            # confirm that is the intended scope.
            reason_text = "".join(
                paragraph.get_text(strip=True) + " "
                for paragraph in strong_tag.find_all_next('p')
            )

        # A lone trailing space (single empty paragraph) is not worth asking
        # the model about.
        if len(reason_text) > 1:
            reasons_llm = LLM_questions('reason_text', reason_text)

    return (recommendation_text, recommendation_cat, CDF, reason_text,
            treatment, condition, constraints, reasons_llm)

def extract_size(size_str):
    """Parse a size such as ``"1.5 MB"`` or ``"512 KB"`` and return megabytes.

    Args:
        size_str: Text containing a number followed by ``MB`` or ``KB``
            (unit matched case-insensitively).

    Returns:
        The size in megabytes as a float (KB values are divided by 1024),
        or ``0`` when no size is found in the string.
    """
    found = re.search(r'(\d+(\.\d+)?)\s*(MB|KB)', size_str, re.I)
    if found is None:
        return 0

    number, _fraction, unit = found.groups()
    megabytes = float(number)
    # Convert KB to MB; MB values pass through unchanged.
    return megabytes / 1024 if unit.upper() == 'KB' else megabytes


def get_eol_sm(url, timeout=30):
    """Report whether a page mentions "end of life" and "severity".

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``(end_of_life, severity)`` tuple of booleans. Each flag is True
        when the corresponding phrase occurs (case-insensitively) in the
        page's text; both are False when the status code is not 200.

    Raises:
        requests.RequestException: Connection errors and timeouts are not
            caught here and propagate to the caller.
    """
    # NOTE(review): unlike get_url_data, this request is sent without the
    # browser-like headers -- confirm the site serves it the same way.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return False, False

    # Lower-case the page text once and test both phrases against it.
    page_text = BeautifulSoup(response.text, 'html.parser').get_text().lower()
    end_of_life = 'end of life' in page_text
    severity = 'severity' in page_text
    return end_of_life, severity

def _first_paragraph_text(div_soup, title):
    """Return the stripped text of the first <p> in the div titled *title*, or None."""
    if div_soup is None:
        return None
    section = div_soup.find('div', title=title)
    if section is None:
        return None
    paragraph = section.find('p')
    if paragraph is None:
        return None
    return paragraph.get_text(strip=True)


def get_information_medicine(div_soup):
    """Extract authorisation, dosage and price details for a technology.

    Args:
        div_soup: Parsed element holding the "information about" section, or
            ``None`` when the caller could not locate it.

    Returns:
        A 6-tuple ``(authorisation, dosage, price, auth_condition,
        auth_treatment, price_value)``. A field is ``''`` when its source
        div (or the ``<p>`` inside it) is missing; a missing element no
        longer raises AttributeError.
    """
    authorisation = ''
    auth_condition = ''
    auth_treatment = ''
    price_value = ''
    dosage = ''
    price = ''

    authorisation_text = _first_paragraph_text(div_soup, 'Marketing authorisation indication')
    if authorisation_text is not None:
        authorisation = authorisation_text
        auth_treatment = LLM_questions('initial_auth', authorisation)
        auth_condition = LLM_questions('initial_condition', authorisation)

    dosage_text = _first_paragraph_text(div_soup, 'Dosage in the marketing authorisation')
    if dosage_text is not None:
        dosage = dosage_text

    price_text = _first_paragraph_text(div_soup, 'Price')
    if price_text is not None:
        price = price_text
        price_value = LLM_questions('price_text', price)

    return authorisation, dosage, price, auth_condition, auth_treatment, price_value



In [ ]:
# Load the scraped guidance list, parse the publication dates and keep only
# entries published on or after 1 January 2018.
TAR = pd.read_csv('evidence_NICE_lists.csv', sep=',')
TAR['Publication Date'] = pd.to_datetime(TAR['Publication Date'])
TAR = TAR.loc[TAR['Publication Date'] >= pd.Timestamp('2018-01-01')]


In [ ]:
# Preview the first five rows of the filtered guidance list.
TAR.head(5)

In [ ]:

# One output row per guidance page that could be downloaded.
final_csv=[]

# iterrows() yields (index, row) pairs; only the row itself is needed.
for _, row in TAR.iterrows():
    title=row['Title']
    TAnumber=row['TA Number']
    print(TAnumber)

    link = row['Link']
    if not isinstance(link, str) or not link:
        # A missing link (e.g. an empty CSV field read back as NaN) leaves
        # nothing to fetch for this entry.
        continue

    # Keep only the last two path segments so the result is the same whether
    # the stored link is relative or absolute.
    extension = '/' + '/'.join(link.split('/')[-2:])

    # Defaults used when a section cannot be found on the page.
    # NOTE(review): 'Not Recommended' is the fallback when no recommendation
    # section is parsed -- confirm this is intended rather than an empty
    # value.
    recommendation, reason = 'Not Recommended',''
    authorization, dosage, price= '','',''
    recommendation_cat= 'Not Recommended'
    CDF=False
    end_of_life = False
    severity_modifiers = False
    Technology,condition,constraints,reasons_llm='','','',''
    auth_condition,auth_treatment,price_value='','',''

    html_content=get_url_data(extension)
    if not html_content:
        # Pages that could not be downloaded produce no output row.
        continue

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    guidance_menus=get_guidance_menu_links(soup)

    # Build an absolute URL from the normalised path; requests rejects a
    # relative URL with MissingSchema.
    url=f'''https://www.nice.org.uk{extension}/chapter/3-Committee-discussion'''
    end_of_life,severity_modifiers=get_eol_sm(url)

    # Only the first two menu entries are inspected.
    for menu_text, menu_href in guidance_menus[:2]:
        href_parts = menu_href.split("/") if menu_href else []
        if len(href_parts) < 5:
            # The id below needs path segments 2 and 4; skip hrefs that do
            # not have them instead of raising IndexError.
            continue
        make_id=f'''{href_parts[2]}-{href_parts[4].lower()}'''

        chapter_html = get_url_data(menu_href)
        if not chapter_html:
            # get_url_data returns None on a non-200 response.
            continue
        soup = BeautifulSoup(chapter_html,'html.parser')

        # Locate the chapter body by id first, then by its title attribute.
        div_soup = soup.find('div', id=make_id)
        if div_soup is None:
            div_soup = soup.find('div', title=menu_text)
        if div_soup is None:
            continue

        # Find the recommendation and reason sections
        if "recommendation" in make_id:
            recommendation,recommendation_cat,CDF,reason,Technology,condition,constraints,reasons_llm=get_recommendation_reason(div_soup)
        if 'information-about' in make_id:
            authorization,dosage,price,auth_condition,auth_treatment,price_value=get_information_medicine(div_soup)

    final_csv.append([TAnumber,title,recommendation,Technology,condition,constraints,recommendation_cat,CDF,reason,reasons_llm,authorization,auth_condition,
                      auth_treatment,dosage,price,price_value,
                      end_of_life,severity_modifiers])


In [ ]:
# Column names are passed to the constructor rather than assigned to
# dd.columns afterwards: with an empty final_csv the frame has zero columns
# and the assignment fails with a length-mismatch ValueError, whereas this
# form yields an empty frame with the expected header.
dd=pd.DataFrame(final_csv, columns=[
    'TA_Number','Indication','Outcome','Technology','Condition','Constraints','Category','CDF','Reasons','Reason_LLM','Initial Authorization',
    'Authorization_Condition','Authorization_Treatment','Dosage','Price','Price_Value','EoL','Severity Modifiers',
])

# NOTE(review): the file is named .tsv but is written with ';' as the field
# separator -- confirm what downstream readers expect before changing either.
dd.to_csv('Recommendations_NICE_papers.tsv',sep=';',index=False)