# PaperBoat and API mini-hackathon

## Repo: https://github.com/lucafusarbassini/paperboat/tree/main
## Author: Luca Fusar Bassini
## Lausanne November 2023
## LauzHack mini-hackathon

In [1]:
import requests
from flask import Flask, request, Response
from scihub import SciHub
import os
import re
import random
import json
import datetime
from bs4 import BeautifulSoup
from pyzotero import zotero
from google.cloud import texttospeech as tts
import wave
import langchain
from crossref.restful import Works
from semanticscholar import SemanticScholar
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np
import scipy
import matplotlib.pyplot as plt
from typing import Sequence
import openai
import pandas as pd
import tiktoken
from openai.embeddings_utils import get_embedding
from dotenv import load_dotenv, find_dotenv
import warnings
warnings.filterwarnings("ignore")
import langchain
from langchain_experimental.agents.agent_toolkits import create_python_agent
from langchain.agents import load_tools, initialize_agent
from langchain.agents import AgentType
from langchain_experimental.tools.python.tool import PythonREPLTool
from langchain.python import PythonREPL
from langchain.chat_models import ChatOpenAI
from langchain.agents import tool
from datetime import date
from dotenv import load_dotenv, find_dotenv
from semanticscholar import SemanticScholar

In [11]:
# note: a few useful links
# https://learn.deeplearning.ai/
# https://platform.openai.com/docs/plugins/examples
# https://python.langchain.com/docs/additional_resources/tutorials
# https://github.com/openai/chatgpt-retrieval-plugin
# https://cookbook.openai.com/

In [12]:
# read in all the API keys

# Zotero
try:
    with open('zotero_key.txt', 'r') as file:
        ZOTapi_key = file.read()
except Exception as e:
        print(e)
try:
    with open('zotero_id.txt', 'r') as file:
        library_id = file.read()
except Exception as e:
        print(e)      
library_type = 'user'
zot = zotero.Zotero(library_id, library_type, ZOTapi_key)

# OpenAI
try:
    with open('openai_key.txt', 'r') as file:
        openai_key = file.read()
except Exception as e:
        print(e)

# Telegram
try:
    with open('telegram_key.txt', 'r') as file:
        TOKEN = file.read()
except Exception as e:
        print(e)

# Zapier...

In [13]:
openai.api_key = openai_key

## Scholar research APIs: CrossRef, Semantic Scholar, Zotero

In [14]:
# CROSSREF: title to DOI

def convert_title_to_doi(title):
    url = f"https://api.crossref.org/works?query.title={title}"
    response = requests.get(url)
    if response.status_code == 200:
        results = json.loads(response.content)['message']['items']
        if len(results) > 0:
            return results[0]['DOI']
    return None

print(convert_title_to_doi("RNA velocity of single cells"))

10.1038/s41586-018-0414-6


In [18]:
# ZOTERO: adding to your library

def add_to_zotero(title, doi):
    template = zot.item_template('journalArticle')
    template['DOI'] = doi
    template['title'] = title
    resp = zot.create_items([template])
    return None

add_to_zotero("RNA velocity of single cells", "10.1038/s41586-018-0414-6")

In [16]:
# semantic scholar - https://pypi.org/project/semanticscholar/

# DOI <-> paper
sch = SemanticScholar()
paper = sch.get_paper('10.1038/s41586-018-0414-6')
print(paper.title)

# paper authors
sch = SemanticScholar()
author = sch.get_author(2262347)
print(author.name)

# list of papers
sch = SemanticScholar()
list_of_paper_ids = [
     'CorpusId:470667',
     '10.2139/ssrn.2250500',
     '0f40b1f08821e22e859c6050916cec3667778613'
]
results = sch.get_papers(list_of_paper_ids)
for item in results:
     print(item.title)
        
# paper search by keyword
sch = SemanticScholar()
results = sch.search_paper('Computing Machinery and Intelligence')
for item in results:
     print(item.title)
        
# paper recommendations! - related to a given paper of interest
sch = SemanticScholar()
results = sch.get_recommended_papers('10.1145/3544585.3544600')
for item in results:
     print(item.title)
        
# using also NEGATIVE paper examples
sch = SemanticScholar()
positive_paper_ids = ['10.1145/3544585.3544600']
negative_paper_ids = ['10.1145/301250.301271']
results = sch.get_recommended_papers_from_lists(positive_paper_ids, negative_paper_ids)
for item in results:
     print(item.title)
        
# retrieving the categories (fields) of papers
sch = SemanticScholar()
results = sch.search_paper('software engineering', fields_of_study=['Computer Science','Education'])
print(results[0].s2FieldsOfStudy)

RNA velocity of single cells
A. Turing
How Much Should We Trust Differences-in-Differences Estimates?
The Miracle of Microfinance? Evidence from a Randomized Evaluation
Improving Third-Party Audits and Regulatory Compliance in India
Computing Machinery and Intelligence
Artificial intelligence in the research of consciousness and in social life (in honor of 70-years anniversary of A. Turing’s paper “Computing Machinery and Intelligence” (papers of the “round table”)
Studies on computing machinery and intelligence
On Computing Machinery and Intelligence
Computing Machinery and Intelligence
Alan Turing’s “Computing Machinery and Intelligence”
Computing Machinery and Intelligence A.M. Turing
Computing Machinery and Intelligence
The Annotation Game: On Turing (1950) on Computing, Machinery, and Intelligence (PUBLISHED VERSION BOWDLERIZED)
Alan Turing’s “Computing Machinery and Intelligence”
Computing Machinery and Intelligence (1950)
Computing Machinery and Intelligence Mind Vol. 59
Computi

# Scihub

In [2]:
from scihub import SciHub
import random

random_number = str(random.randint(1, 100000))
sh = SciHub()
result = sh.download("10.1038/s41586-018-0414-6", path="papero"+random_number+".pdf")

## Text-to-voice

In [9]:
def unique_languages_from_voices(voices: Sequence[tts.Voice]):
    language_set = set()
    for voice in voices:
        for language_code in voice.language_codes:
            language_set.add(language_code)
    return language_set

def list_languages():
    client = tts.TextToSpeechClient()
    response = client.list_voices()
    languages = unique_languages_from_voices(response.voices)

    print(f" Languages: {len(languages)} ".center(60, "-"))
    for i, language in enumerate(sorted(languages)):
        print(f"{language:>10}", end="\n" if i % 5 == 4 else "")

def list_voices(language_code=None):
    client = tts.TextToSpeechClient()
    response = client.list_voices(language_code=language_code)
    voices = sorted(response.voices, key=lambda voice: voice.name)

    print(f" Voices: {len(voices)} ".center(60, "-"))
    for voice in voices:
        languages = ", ".join(voice.language_codes)
        name = voice.name
        gender = tts.SsmlVoiceGender(voice.ssml_gender).name
        rate = voice.natural_sample_rate_hertz
        print(f"{languages:<8} | {name:<24} | {gender:<8} | {rate:,} Hz")

In [37]:
# installation for authentication a bit cumbersome
# https://cloud.google.com/docs/authentication/provide-credentials-adc#local-dev
# https://cloud.google.com/sdk/docs/install

# gcloud auth application-default login
# gcloud auth application-default set-quota-project YOUR_PROJECT

def text_to_wav(voice_name: str, text: str):
    language_code = "-".join(voice_name.split("-")[:2])
    text_input = tts.SynthesisInput(text=text)
    voice_params = tts.VoiceSelectionParams(
        language_code=language_code, name=voice_name
    )
    audio_config = tts.AudioConfig(audio_encoding=tts.AudioEncoding.LINEAR16)

    client = tts.TextToSpeechClient()
    response = client.synthesize_speech(
        input=text_input,
        voice=voice_params,
        audio_config=audio_config,
    )

    filename = f"{voice_name}.wav"
    with open(filename, "wb") as out:
        out.write(response.audio_content)
        print(f'Generated speech saved to "{filename}"')
        
text_to_wav("en-US-Neural2-H", "This highly informative series of exclusive scientific sessions will feature keynote presentations by leading researchers, expert panels, and networking opportunities with peers")



PermissionDenied: 403 Your application is authenticating by using local Application Default Credentials. The texttospeech.googleapis.com API requires a quota project, which is not set by default. To learn how to set your quota project, see https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds . [reason: "SERVICE_DISABLED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "texttospeech.googleapis.com"
}
metadata {
  key: "consumer"
  value: "projects/764086051850"
}
]

In [None]:
original_file_name = 'en-US-Neural2-H.wav'
new_file_name = 'comb.wav'
os.rename(original_file_name, new_file_name)
        
for j in range(0,len(week_summaries)):
    try:
        text_to_wav("en-US-Neural2-H", week_summaries[j])
        with wave.open('comb.wav', 'rb') as wav1:
            with wave.open('en-US-Neural2-H.wav', 'rb') as wav2:
                params1 = wav1.getparams()
                params2 = wav2.getparams()
                with wave.open('combined.wav', 'wb') as wav_combined:
                    wav_combined.setparams(params1)
                    data = wav1.readframes(wav1.getnframes())
                    wav_combined.writeframes(data)
                    data = wav2.readframes(wav2.getnframes())
                    wav_combined.writeframes(data)
        os.rename("combined.wav", "comb.wav")
    except Exception as e:
        print(e)

In [None]:
MAX_CHUNK_SIZE = 48 * 1024 * 1024  # 50 MB in bytes

# Open the input file for reading
with wave.open('comb.wav', 'rb') as input_file:
    # Determine the number of output files needed
    num_output_files = math.ceil(input_file.getnframes() * input_file.getsampwidth() / MAX_CHUNK_SIZE)

    # Split the input file into smaller chunks and write each chunk to a separate output file
    for i in range(num_output_files):
        output_file_name = f'comb{i+1}.wav'
        with wave.open(output_file_name, 'wb') as output_file:
            # Set the parameters of the output file to match the input file
            output_file.setnchannels(input_file.getnchannels())
            output_file.setsampwidth(input_file.getsampwidth())
            output_file.setframerate(input_file.getframerate())

            # Determine the number of frames to read from the input file
            num_frames = min(MAX_CHUNK_SIZE // input_file.getsampwidth(), input_file.getnframes() - input_file.tell())

            # Read the frames from the input file and write them to the output file
            frames = input_file.readframes(num_frames)
            output_file.writeframes(frames)

# Print the total size of the output files
total_output_size = sum(os.path.getsize(f'comb{i+1}.wav') for i in range(num_output_files))
print(f'Total size of output files: {total_output_size / (1024 * 1024):.2f} MB')

In [None]:
# alternative to Google - which probably requires google cloud... messy!

import rlvoice
import gtts
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import soundfile as sf
from playsound import playsound
import time


text = "Leonardo da Vinci lived in France for ten years."
n_trials = 1

# # -- offline
# start = time.time()
# for _ in range(n_trials):
#     engine = rlvoice.init()
#     engine.save_to_file(text, "rlvoice.mp3")
#     engine.runAndWait()
# avg_time = (time.time() - start) / n_trials
# print(f"Average time for rlvoice: {avg_time:.2f} s")

# engine.say(text)
# engine.runAndWait()

# -- online (Google)
start_time = time.time()
for _ in range(n_trials):
    tts = gtts.gTTS(text, tld="com", lang="en", slow=False)
    tts.save("gtts.mp3")
avg_time = (time.time() - start_time) / n_trials
print(f"Average time for gtts: {avg_time:.2f} s")
# playsound("test.mp3")

# -- hugging face (offline)
device = "cuda" if torch.cuda.is_available() else "cpu"

# load the processor
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# load the model
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# load the vocoder, that is the voice encoder
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
# we load this dataset to get the speaker embeddings
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# speaker ids from the embeddings dataset
speakers = {
    'awb': 0,     # Scottish male
    'bdl': 1138,  # US male
    'clb': 2271,  # US female
    'jmk': 3403,  # Canadian male
    'ksp': 4535,  # Indian male
    'rms': 5667,  # US male
    'slt': 6799   # US female
}

def save_text_to_speech(text, speaker="rms", output_filename=None):

    if output_filename is None:
        output_filename = f"{speaker}-{'-'.join(text.split()[:6])}.mp3"

    # preprocess text
    inputs = processor(text=text, return_tensors="pt").to(device)
    if speaker is not None:
        # load xvector containing speaker's voice characteristics from a dataset
        speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
    else:
        # random vector, meaning a random voice
        speaker_embeddings = torch.randn((1, 512)).to(device)
    # generate speech with the models
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # save the generated speech to a file with 16KHz sampling rate
    sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)

    # return the filename for reference
    return output_filename


# generate speech with a US female voice
start = time.time()
for _ in range(n_trials):
    save_text_to_speech(text, speaker=speakers["bdl"], output_filename="huggingface.mp3")
avg_time = (time.time() - start) / n_trials
print(f"Average time for huggingface: {avg_time:.2f} s")

# playsound(output_file)

## Agents: ChatGPT + LangChain

In [24]:
# langchain: agents that can search the web
llm_model = "gpt-3.5-turbo"

# create an instance of ChatGPT
llm = ChatOpenAI(temperature=0, model=llm_model, openai_api_key=mykey)

# parse useful tools, eg, Wikipedia, that the "agent" can use
tools = load_tools(["llm-math","wikipedia"], llm=llm)

agent= initialize_agent(
    tools, 
    llm, 
    agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    handle_parsing_errors=True,
    verbose = True)

In [25]:
agent("What is the 25% of 300?") # look how cool is the behind the scenes to build new tools...



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI can use the calculator tool to find the answer to this question.

Action:
```json
{
  "action": "Calculator",
  "action_input": "25% * 300"
}
```[0m
Observation: [36;1m[1;3mAnswer: 75.0[0m
Thought:[32;1m[1;3mThe calculator tool returned the answer 75.0.
Final Answer: 75.0[0m

[1m> Finished chain.[0m


{'input': 'What is the 25% of 300?', 'output': '75.0'}

In [26]:
question = "Tom M. Mitchell is an American computer scientist \
and the Founders University Professor at Carnegie Mellon University (CMU)\
what book did he write?"
result = agent(question) 



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I can use Wikipedia to find out what book Tom M. Mitchell wrote.
Action:
```json
{
  "action": "Wikipedia",
  "action_input": "Tom M. Mitchell"
}
```[0m
Observation: [33;1m[1;3mPage: Tom M. Mitchell
Summary: Tom Michael Mitchell (born August 9, 1951) is an American computer scientist and the Founders University Professor at Carnegie Mellon University (CMU). He is a founder and former Chair of the Machine Learning Department at CMU. Mitchell is known for his contributions to the advancement of machine learning, artificial intelligence, and cognitive neuroscience and is the author of the textbook Machine Learning. He is a member of the United States National Academy of Engineering since 2010. He is also a Fellow of the American Academy of Arts and Sciences, the American Association for the Advancement of Science and a Fellow and past President of the Association for the Advancement of Artificial Intelligence. In Oct

In [27]:
# make an agent that can write and EXECUTE inside it freshly-written, dynamic Python code!
agent = create_python_agent(
    llm,
    tool=PythonREPLTool(),
    verbose=True
)

customer_list = [["Harrison", "Chase"], 
                 ["Lang", "Chain"],
                 ["Dolly", "Too"],
                 ["Elle", "Elem"], 
                 ["Geoff","Fusion"], 
                 ["Trance","Former"],
                 ["Jen","Ayai"]
                ]

agent.run(f"""Sort these customers by \
last name and then first name \
and print the output: {customer_list}""") 



[1m> Entering new AgentExecutor chain...[0m




[32;1m[1;3mI can use the `sorted()` function to sort the list of customers. I will need to provide a key function that specifies the sorting order based on last name and then first name.
Action: Python_REPL
Action Input: sorted([['Harrison', 'Chase'], ['Lang', 'Chain'], ['Dolly', 'Too'], ['Elle', 'Elem'], ['Geoff', 'Fusion'], ['Trance', 'Former'], ['Jen', 'Ayai']], key=lambda x: (x[1], x[0]))[0m
Observation: [36;1m[1;3m[0m
Thought:[32;1m[1;3mThe customers have been sorted by last name and then first name.
Final Answer: [['Jen', 'Ayai'], ['Harrison', 'Chase'], ['Lang', 'Chain'], ['Elle', 'Elem'], ['Geoff', 'Fusion'], ['Trance', 'Former'], ['Dolly', 'Too']][0m

[1m> Finished chain.[0m


"[['Jen', 'Ayai'], ['Harrison', 'Chase'], ['Lang', 'Chain'], ['Elle', 'Elem'], ['Geoff', 'Fusion'], ['Trance', 'Former'], ['Dolly', 'Too']]"

In [28]:
# inspect the full behind the scenes
langchain.debug=True
agent.run(f"""Sort these customers by \
last name and then first name \
and print the output: {customer_list}""") 
langchain.debug=False

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "Sort these customers by last name and then first name and print the output: [['Harrison', 'Chase'], ['Lang', 'Chain'], ['Dolly', 'Too'], ['Elle', 'Elem'], ['Geoff', 'Fusion'], ['Trance', 'Former'], ['Jen', 'Ayai']]"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:LLMChain] Entering Chain run with input:
[0m{
  "input": "Sort these customers by last name and then first name and print the output: [['Harrison', 'Chase'], ['Lang', 'Chain'], ['Dolly', 'Too'], ['Elle', 'Elem'], ['Geoff', 'Fusion'], ['Trance', 'Former'], ['Jen', 'Ayai']]",
  "agent_scratchpad": "",
  "stop": [
    "\nObservation:",
    "\n\tObservation:"
  ]
}
[32;1m[1;3m[llm/start][0m [1m[1:chain:AgentExecutor > 2:chain:LLMChain > 3:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: You are an agent designed to write and execute python code to answer questions.\nYou

In [None]:
# building langchain agents/tools with prompting

@tool
def time(text: str) -> str:
    """Returns todays date, use this for any \
    questions related to knowing todays date. \
    The input should always be an empty string, \
    and this function will always return todays \
    date - any date mathmatics should occur \
    outside this function."""
    return str(date.today())

agent= initialize_agent(
    tools + [time], 
    llm, 
    agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    handle_parsing_errors=True,
    verbose = True)

try:
    result = agent("whats the date today?") 
except: 
    print("exception on external access")

## Using directly the ChatGPT API + web scraping

In [None]:
# chatgpt + scraping 
today = datetime.date.today() - datetime.timedelta(days=3) # exmaple: for biorxiv, three days ago, as DOI updates on biorxiv with about 3 days of delay
print(today)

# make a scraping url adding the variable info, here the date
url = str("https://www.biorxiv.org/search/jcode%3Amedrxiv%7C%7Cbiorxiv%20limit_from%3A"+str(today)+"%20limit_to%3A"+str(today)+"%20numresults%3A1000%20sort%3Arelevance-rank%20format_result%3Astandard")

# scrape with requests
response = requests.get(url)
if response.status_code == 200:
    html_content = response.content
else:
    print("Error retrieving HTML content:", response.status_code)

# parse the HTML using Beautiful Soup
soup = BeautifulSoup(html_content, "html.parser")
print(soup)

2023-11-02
<!DOCTYPE html>

<html dir="ltr" lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:mml="http://www.w3.org/1998/Math/MathML">
<head prefix="og: http://ogp.me/ns# article: http://ogp.me/ns/article# book: http://ogp.me/ns/book#">
<!--[if IE]><![endif]-->
<link href="//d33xdlntwy0kbs.cloudfront.net" rel="dns-prefetch"/>
<link href="//www.googletagmanager.com" rel="dns-prefetch"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="https://www.biorxiv.org/sites/default/files/images/favicon.ico" rel="shortcut icon" type="image/vnd.microsoft.icon"/>
<meta content="bioRxiv - the preprint server for biology, operated by Cold Spring Harbor Laboratory, a research and educational institution" name="description"/>
<meta content="Drupal 7 (http://drupal.org)" name="generator"/>
<link href="https://www.biorxiv.org/search/jcode%3Amedrxiv%7C%7Cbiorxiv%20limit_from%3A2023-11-02%20limit_to%3A2

In [9]:
# find all DOI links in the HTML using a regular expression - of course ChatGPT made the regex for me
doi_regex = re.compile(r"https?://doi\.org/[^\s]+")
doi_links = soup.find_all(text=doi_regex)

doi_links

  doi_links = soup.find_all(text=doi_regex)


[' https://doi.org/10.1101/2023.11.02.565322 ',
 ' https://doi.org/10.1101/2023.11.02.565291 ',
 ' https://doi.org/10.1101/2023.11.01.564710 ',
 ' https://doi.org/10.1101/2023.11.01.565091 ',
 ' https://doi.org/10.1101/2023.11.02.565001 ',
 ' https://doi.org/10.1101/2023.11.01.564642 ',
 ' https://doi.org/10.1101/2023.11.02.565290 ',
 ' https://doi.org/10.1101/2023.11.01.564448 ',
 ' https://doi.org/10.1101/2023.11.02.565255 ',
 ' https://doi.org/10.1101/2023.11.01.565092 ',
 ' https://doi.org/10.1101/2023.11.01.564594 ',
 ' https://doi.org/10.1101/2023.11.02.565282 ',
 ' https://doi.org/10.1101/2023.11.01.564449 ',
 ' https://doi.org/10.1101/2023.10.31.564789 ',
 ' https://doi.org/10.1101/2023.11.01.564627 ',
 ' https://doi.org/10.1101/2023.11.02.562514 ',
 ' https://doi.org/10.1101/2023.11.02.564696 ',
 ' https://doi.org/10.1101/2023.11.01.564623 ',
 ' https://doi.org/10.1101/2023.11.02.564550 ',
 ' https://doi.org/10.1101/2023.11.01.564270 ',
 ' https://doi.org/10.1101/2023.11.01.56

In [10]:
def generate_response(prompt):
    messag=[{"role": "system", "content": "You are a chatbot"}]
    
    ## build a chat history: you can CONDITION the bot on the style of replies you want to see - also getting weird behaviors... such as KanyeGPT
    history_bot = ["Yes, I'm ready! Please provide the first paper abstract."]
    
    # ask ChatGPT to return STRUCTURED, parsable answers that you can extract easily - often better providing examples of desired behavior (1-2 example often enough)
    history_user = ["i'll give you some paper abstracts. for each abstract (i.e., for each of my messages), you will a) assign a topic from the following list:\nbiochemistry\nbiophysics\nproteomics\ncancer\ncell biology\nmolecular and synthetic biology\ncomputational biology\ngenetics and genomics\npathology\nimmunology\nmicrobiology\nneuroscience\ndevelopmental biology\nethology and behavior\nzoology\nplant biology\nindustrial biotechnology\npharmacology\nengineering\nvirology\nmachine learning\nchemical biology\nnanomedicine\naging\necology and evolution\nvaccinology\nepidemiology\nclinical trials,\nb) write a 2-sentences summary, focusing on the key innovation presented in that abstract.\n\nfor example:\nmy input = The spontaneous deamination of cytosine is a major source of transitions from C•G to T•A base pairs, which account for half of known pathogenic point mutations in humans. The ability to efficiently convert targeted A•T base pairs to G•C could therefore advance the study and treatment of genetic diseases. The deamination of adenine yields inosine, which is treated as guanine by polymerases, but no enzymes are known to deaminate adenine in DNA. Here we describe adenine base editors (ABEs) that mediate the conversion of A•T to G•C in genomic DNA. We evolved a transfer RNA adenosine deaminase to operate on DNA when fused to a catalytically impaired CRISPR–Cas9 mutant. Extensive directed evolution and protein engineering resulted in seventh-generation ABEs that convert targeted A•T base pairs efficiently to G•C (approximately 50% efficiency in human cells) with high product purity (typically at least 99.9%) and low rates of indels (typically no more than 0.1%). ABEs introduce point mutations more efficiently and cleanly, and with less off-target genome modification, than a current Cas9 nuclease-based method, and can install disease-correcting or disease-suppressing mutations in human cells. Together with previous base editors, ABEs enable the direct, programmable introduction of all four transition mutations without double-stranded DNA cleavage.\n\nyour output =\na. genetics and genomics\nb. A new base-editor that converts A-T to G-C, based on an RNA adenosine deaminase fused to catalitically-impaired CRISPR-Cas9. Base editors can install therapeutic mutations in genomic DNA in human cells with no double-strand break.\nready to start?"]
    
    for user_message, bot_message in zip(history_user, history_bot):
        messag.append({"role": "user", "content": str(user_message)})
        messag.append({"role": "system", "content": str(bot_message)})
    messag.append({"role": "user", "content": str(prompt)})

    response = openai.ChatCompletion.create(
        
    # please use gtp3.5 although gpt4 is much better for $$
    model="gpt-3.5-turbo",
        messages=messag
    )
    result = ''
    for choice in response.choices:
        result += choice.message.content
    history_bot.append(result)
    history_user.append(str(prompt))
    return result

print(generate_response("The power of human language and thought arises from systematic compositionality—the algebraic ability to understand and produce novel combinations from known components. Fodor and Pylyshyn1 famously argued that artificial neural networks lack this capacity and are therefore not viable models of the mind. Neural networks have advanced considerably in the years since, yet the systematicity challenge persists. Here we successfully address Fodor and Pylyshyn’s challenge by providing evidence that neural networks can achieve human-like systematicity when optimized for their compositional skills. To do so, we introduce the meta-learning for compositionality (MLC) approach for guiding training through a dynamic stream of compositional tasks. To compare humans and machines, we conducted human behavioural experiments using an instruction learning paradigm. After considering seven different models, we found that, in contrast to perfectly systematic but rigid probabilistic symbolic models, and perfectly flexible but unsystematic neural networks, only MLC achieves both the systematicity and flexibility needed for human-like generalization. MLC also advances the compositional skills of machine learning systems in several systematic generalization benchmarks. Our results show how a standard neural network architecture, optimized for its compositional skills, can mimic human systematic generalization in a head-to-head comparison."))

a. machine learning
b. The authors address the challenge of systematicity in artificial neural networks by introducing the meta-learning for compositionality (MLC) approach, which guides training through a dynamic stream of compositional tasks. They demonstrate that MLC achieves both systematicity and flexibility, enabling neural networks to mimic human systematic generalization.


In [None]:
# trivial web scraping example - note, it's slow

titles = []
abstracts = []

for link in doi_links[0:3]:
    print(link)
    response = requests.get(link)
    
    if response.status_code == 200:
        html_content = response.content
        soup = BeautifulSoup(html_content, "html.parser")

        # find the div element with class="section abstract" and id="abstract-1"
        abstract_div = soup.find("div", {"class": "section abstract", "id": "abstract-1"})

        # extract the text from the first p element inside the abstract_div
        abstract_text = abstract_div.find("p").text.strip()

        # clean up the extracted text
        abstract_text = abstract_text.strip()
        abstract_text = abstract_text.replace("\n", " ")

        div_tag = soup.find('div', {'class': 'highwire-cite highwire-cite-highwire-article highwire-citation-biorxiv-article-pap-list clearfix'})
        title = div_tag.find('div', {'class': 'highwire-cite-title'}).text.strip()
        
        titles.append(title)
        abstracts.append(abstract_text)
        
    else:
        print("Error retrieving HTML content:", response.status_code)
        
print(titles)

In [21]:
import openai
import json

# Example dummy function hard coded to return the same weather
# In production, this could be your backend API or an external API
def get_current_weather(location, unit="fahrenheit"):
    """Get the current weather in a given location"""
    weather_info = {
        "location": location,
        "temperature": "72",
        "unit": unit,
        "forecast": ["sunny", "windy"],
    }
    return json.dumps(weather_info)

def run_conversation():
    # Step 1: send the conversation and available functions to GPT
    messages = [{"role": "user", "content": "What's the weather like in Boston?"}]
    functions = [
        {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        }
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-0613",
        messages=messages,
        functions=functions,
        function_call="auto",  # auto is default, but we'll be explicit
    )
    response_message = response["choices"][0]["message"]

    # Step 2: check if GPT wanted to call a function
    if response_message.get("function_call"):
        # Step 3: call the function
        # Note: the JSON response may not always be valid; be sure to handle errors
        available_functions = {
            "get_current_weather": get_current_weather,
        }  # only one function in this example, but you can have multiple
        function_name = response_message["function_call"]["name"]
        function_to_call = available_functions[function_name]
        function_args = json.loads(response_message["function_call"]["arguments"])
        function_response = function_to_call(
            location=function_args.get("location"),
            unit=function_args.get("unit"),
        )

        # Step 4: send the info on the function call and function response to GPT
        messages.append(response_message)  # extend conversation with assistant's reply
        messages.append(
            {
                "role": "function",
                "name": function_name,
                "content": function_response,
            }
        )  # extend conversation with function response
        second_response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-0613",
            messages=messages,
        )  # get a new response from GPT where it can see the function response
        return second_response

print(run_conversation())

{
  "id": "chatcmpl-8HYet9s6cmx4kWIaKM6ARrMrPrTh3",
  "object": "chat.completion",
  "created": 1699195495,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "The current weather in Boston is sunny and windy with a temperature of 72 degrees."
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 72,
    "completion_tokens": 17,
    "total_tokens": 89
  }
}


## Structuring ChatGPT's "reasoning chain"

In [31]:
# "chain-of-thought" prompting: structuring LLM's activity to resemble human reasoning at structured tasks involving language
# note how you can use prompting creatively

def get_completion_from_messages(messages, 
                                 model="gpt-3.5-turbo", 
                                 temperature=0, max_tokens=500):
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature, 
        max_tokens=max_tokens, 
    )
    return response.choices[0].message["content"]

delimiter = "####"
system_message = f"""
Follow these steps to answer the customer queries.
The customer query will be delimited with four hashtags,\
i.e. {delimiter}. 

Step 1:{delimiter} First decide whether the user is \
asking a question about a specific product or products. \
Product cateogry doesn't count. 

Step 2:{delimiter} If the user is asking about \
specific products, identify whether \
the products are in the following list.
All available products: 
1. Product: TechPro Ultrabook
   Category: Computers and Laptops
   Brand: TechPro
   Model Number: TP-UB100
   Warranty: 1 year
   Rating: 4.5
   Features: 13.3-inch display, 8GB RAM, 256GB SSD, Intel Core i5 processor
   Description: A sleek and lightweight ultrabook for everyday use.
   Price: $799.99

2. Product: BlueWave Gaming Laptop
   Category: Computers and Laptops
   Brand: BlueWave
   Model Number: BW-GL200
   Warranty: 2 years
   Rating: 4.7
   Features: 15.6-inch display, 16GB RAM, 512GB SSD, NVIDIA GeForce RTX 3060
   Description: A high-performance gaming laptop for an immersive experience.
   Price: $1199.99

3. Product: PowerLite Convertible
   Category: Computers and Laptops
   Brand: PowerLite
   Model Number: PL-CV300
   Warranty: 1 year
   Rating: 4.3
   Features: 14-inch touchscreen, 8GB RAM, 256GB SSD, 360-degree hinge
   Description: A versatile convertible laptop with a responsive touchscreen.
   Price: $699.99

4. Product: TechPro Desktop
   Category: Computers and Laptops
   Brand: TechPro
   Model Number: TP-DT500
   Warranty: 1 year
   Rating: 4.4
   Features: Intel Core i7 processor, 16GB RAM, 1TB HDD, NVIDIA GeForce GTX 1660
   Description: A powerful desktop computer for work and play.
   Price: $999.99

5. Product: BlueWave Chromebook
   Category: Computers and Laptops
   Brand: BlueWave
   Model Number: BW-CB100
   Warranty: 1 year
   Rating: 4.1
   Features: 11.6-inch display, 4GB RAM, 32GB eMMC, Chrome OS
   Description: A compact and affordable Chromebook for everyday tasks.
   Price: $249.99

Step 3:{delimiter} If the message contains products \
in the list above, list any assumptions that the \
user is making in their \
message e.g. that Laptop X is bigger than \
Laptop Y, or that Laptop Z has a 2 year warranty.

Step 4:{delimiter}: If the user made any assumptions, \
figure out whether the assumption is true based on your \
product information. 

Step 5:{delimiter}: First, politely correct the \
customer's incorrect assumptions if applicable. \
Only mention or reference products in the list of \
5 available products, as these are the only 5 \
products that the store sells. \
Answer the customer in a friendly tone.

Use the following format:
Step 1:{delimiter} <step 1 reasoning>
Step 2:{delimiter} <step 2 reasoning>
Step 3:{delimiter} <step 3 reasoning>
Step 4:{delimiter} <step 4 reasoning>
Response to user:{delimiter} <response to customer>

Make sure to include {delimiter} to separate every step.
"""

# again note the steps the "chain"
user_message = f"""
by how much is the BlueWave Chromebook more expensive \
than the TechPro Desktop"""

messages =  [  
{'role':'system', 
 'content': system_message},    
{'role':'user', 
 'content': f"{delimiter}{user_message}{delimiter}"},  
] 

response = get_completion_from_messages(messages)
print(response)

Step 1:#### The user is asking about the price difference between the BlueWave Chromebook and the TechPro Desktop.

Step 2:#### Both the BlueWave Chromebook and the TechPro Desktop are available products.

Step 3:#### The user assumes that the BlueWave Chromebook is more expensive than the TechPro Desktop.

Step 4:#### Based on the product information, the price of the BlueWave Chromebook is $249.99, and the price of the TechPro Desktop is $999.99. Therefore, the TechPro Desktop is actually more expensive than the BlueWave Chromebook.

Response to user:#### The BlueWave Chromebook is actually less expensive than the TechPro Desktop. The BlueWave Chromebook is priced at $249.99, while the TechPro Desktop is priced at $999.99.


## OpenAI vector embeddings

In [10]:
# ChatGPT vector embeddings

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

# load & inspect dataset
input_datapath = "Reviews.csv" 
df = pd.read_csv(input_datapath, index_col=0)
df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
df = df.dropna()
df["combined"] = (
    "Title: " + df.Summary.str.strip() + "; Content: " + df.Text.str.strip()
)
df.head(2)

Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...
2,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...


In [11]:
# subsample to 1k most recent reviews and remove samples that are too long
top_n = 1000
df = df.sort_values("Time").tail(top_n * 2)  # first cut to first 2k entries, assuming less than half will be filtered out
df.drop("Time", axis=1, inplace=True)

encoding = tiktoken.get_encoding(embedding_encoding)

# omit reviews that are too long to embed
df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens].tail(top_n)

df = df.iloc[1:20,]
df

Unnamed: 0_level_0,ProductId,UserId,Score,Summary,Text,combined,n_tokens
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
220697,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos...",Title: Arrived in pieces; Content: Not pleased...,35
107908,B000JMBE7M,AQX1N6A51QOKG,4,"It isn't blanc mange, but isn't bad . . .",I'm not sure that custard is really custard wi...,"Title: It isn't blanc mange, but isn't bad . ....",267
107800,B004AHGBX4,A2UY46X0OSNVUQ,3,These also have SALT and it's not sea salt.,I like the fact that you can see what you're g...,Title: These also have SALT and it's not sea s...,239
205313,B001BORBHO,A1AFOYZ9HSM2CZ,5,Happy with the product,My dog was suffering with itchy skin. He had ...,Title: Happy with the product; Content: My dog...,86
284450,B008PSM0BQ,A3OUFIMGL2K6RS,4,Good Sauce,This is a good all purpose sauce. Has good fl...,Title: Good Sauce; Content: This is a good all...,107
378704,B008YA1LQK,A9YEAAQVHFUTX,5,Blackcat,Great coffee! Love all Green Mountain coffee ...,Title: Blackcat; Content: Great coffee! Love ...,34
456909,B001KP6B98,ABWCUS3HBDZRS,5,Excellent product,After scouring every store in town for orange ...,Title: Excellent product; Content: After scour...,100
378703,B008YA1LQK,A2RSB6FVQ9K9OD,5,Bulk k-Cups,This is the best way to buy coffee for my offi...,Title: Bulk k-Cups; Content: This is the best ...,44
312410,B001E5E2QI,A23WYVBCNE75X1,3,It's Okay,"Next time, I will buy Gevalia Irish Cream deca...","Title: It's Okay; Content: Next time, I will b...",38
65135,B000H9K4KA,A2WMZYYF8OSGD6,5,FABULOUS...,Absolutely wonderful. A real licorice taste. ...,Title: FABULOUS...; Content: Absolutely wonder...,46


In [12]:
# this may take a few minutes
df["embedding"] = df.combined.apply(lambda x: get_embedding(x, engine=embedding_model))

OSError: Cannot save file into a non-existent directory: 'data'

In [13]:
df.to_csv("fine_food_reviews_with_embeddings_1k.csv")

df

Unnamed: 0_level_0,ProductId,UserId,Score,Summary,Text,combined,n_tokens,embedding
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
220697,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos...",Title: Arrived in pieces; Content: Not pleased...,35,"[-0.023609420284628868, -0.011784634552896023,..."
107908,B000JMBE7M,AQX1N6A51QOKG,4,"It isn't blanc mange, but isn't bad . . .",I'm not sure that custard is really custard wi...,"Title: It isn't blanc mange, but isn't bad . ....",267,"[0.00016697357932571322, 0.005226491950452328,..."
107800,B004AHGBX4,A2UY46X0OSNVUQ,3,These also have SALT and it's not sea salt.,I like the fact that you can see what you're g...,Title: These also have SALT and it's not sea s...,239,"[0.010532955639064312, -0.01354704238474369, 0..."
205313,B001BORBHO,A1AFOYZ9HSM2CZ,5,Happy with the product,My dog was suffering with itchy skin. He had ...,Title: Happy with the product; Content: My dog...,86,"[0.015255776233971119, -0.003898625960573554, ..."
284450,B008PSM0BQ,A3OUFIMGL2K6RS,4,Good Sauce,This is a good all purpose sauce. Has good fl...,Title: Good Sauce; Content: This is a good all...,107,"[-0.016170555725693703, 0.005940747447311878, ..."
378704,B008YA1LQK,A9YEAAQVHFUTX,5,Blackcat,Great coffee! Love all Green Mountain coffee ...,Title: Blackcat; Content: Great coffee! Love ...,34,"[-0.014104718342423439, -0.01621335558593273, ..."
456909,B001KP6B98,ABWCUS3HBDZRS,5,Excellent product,After scouring every store in town for orange ...,Title: Excellent product; Content: After scour...,100,"[0.0006724869017489254, -0.017008278518915176,..."
378703,B008YA1LQK,A2RSB6FVQ9K9OD,5,Bulk k-Cups,This is the best way to buy coffee for my offi...,Title: Bulk k-Cups; Content: This is the best ...,44,"[-0.0022770462092012167, -0.000179590948391705..."
312410,B001E5E2QI,A23WYVBCNE75X1,3,It's Okay,"Next time, I will buy Gevalia Irish Cream deca...","Title: It's Okay; Content: Next time, I will b...",38,"[-0.01740219257771969, -0.012602492235600948, ..."
65135,B000H9K4KA,A2WMZYYF8OSGD6,5,FABULOUS...,Absolutely wonderful. A real licorice taste. ...,Title: FABULOUS...; Content: Absolutely wonder...,46,"[-0.010806039907038212, -0.004772723652422428,..."


In [17]:
# how long are the review embeddings?
# we could now treat each review as a 1536-features element of a vector space to do clusterings etc!!
# much more here: https://platform.openai.com/docs/guides/embeddings/use-cases

print(len(df["embedding"].iloc[1,]))
print(len(df["embedding"].iloc[17,]))

1536
1536


## Other APIs, eg DALL-E

In [19]:
# wanna really get a cliché image generated right here in the flow of your python script?
response = openai.Image.create(
  prompt="a white siamese cat",
  n=1,
  size="1024x1024"
)
image_url = response['data'][0]['url']

In [20]:
image_url # https://platform.openai.com/docs/guides/images/introduction

'https://oaidalleapiprodscus.blob.core.windows.net/private/org-UaXPn01jC1g08Lnzb75nWrEm/user-j1qylkAsO6LU9a6AFq3cuW0J/img-YTI8YXftElpbzYy3cukfOV7J.png?st=2023-11-05T13%3A42%3A38Z&se=2023-11-05T15%3A42%3A38Z&sp=r&sv=2021-08-06&sr=b&rscd=inline&rsct=image/png&skoid=6aaadede-4fb3-4698-a8f6-684d7786b067&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2023-11-05T04%3A12%3A09Z&ske=2023-11-06T04%3A12%3A09Z&sks=b&skv=2021-08-06&sig=XcyMcRM%2BgECUo0tJX2vgOms8guK/E0sbLXGqhWHlTO8%3D'

# Search over pdf documents with LangChain + others

In [None]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

#loader = UnstructuredPDFLoader("field-guide-to-data-science.pdf")
loader = OnlinePDFLoader("https://www.biorxiv.org/content/10.1101/2023.07.20.549945v1.full.pdf")

data = loader.load_and_split() 

print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

print (f'Now you have {len(texts)} documents')

OPENAI_API_KEY = mykey

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

import pinecone      

pinecone.init(      
	api_key='3e493adc-1c84-4bf2-b034-b4a446cd0756',      
	environment='gcp-starter'      
)      
index = pinecone.Index('langchain')

index_name = "langchain"  ####### I0VE STARTED A VERY RANDOM INDEX ON FREE TIER IN PINECONE!!

docsearch = Pinecone.from_texts([t.page_content for t in texts], embeddings, index_name=index_name)

query = "what is Voyager?"
docs = docsearch.similarity_search(query)#, include_metadata=True)

print(docs)

In [None]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY) ### temperature parameter controls the "creativity" (aka, hallucinations)
chain = load_qa_chain(llm, chain_type="stuff")

query = "summarize the technical improvements in Voyager"
docs = docsearch.similarity_search(query)#, include_metadata=True)

chain.run(input_documents=docs, question=query)

In [None]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

import os
os.environ["OPENAI_API_KEY"] = "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" #"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"
os.environ["SERPAPI_API_KEY"] = "0e6f1809e32dda8c35a1a0c58893c63b8083170921c8f36262ce0cf85658eae6"

# provide the path of  pdf file/files.
pdfreader = PdfReader('Document sans titre.pdf')

from typing_extensions import Concatenate
# read text from pdf
raw_text = ''
for i, page in enumerate(pdfreader.pages):
    content = page.extract_text()
    if content:
        raw_text += content
        
# We need to split the text using Character Text Split such that it sshould not increse token size
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 800,
    chunk_overlap  = 200,
    length_function = len,
)
texts = text_splitter.split_text(raw_text)

# Download embeddings from OpenAI
embeddings = OpenAIEmbeddings()

document_search = FAISS.from_texts(texts, embeddings)

from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

chain = load_qa_chain(OpenAI(), chain_type="stuff")

query = "networks?"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

query = "How much the agriculture target will be increased to and what the focus will be"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

from langchain.document_loaders import OnlinePDFLoader

loader = OnlinePDFLoader("https://arxiv.org/pdf/1706.03762.pdf")

data = loader.load()

# Download embeddings from OpenAI
embeddings = OpenAIEmbeddings()

from langchain.indexes import VectorstoreIndexCreator
index = VectorstoreIndexCreator().from_loaders([loader])

query = "Explain me about Attention is all you need"
index.query(query)