# Install Dependencies
Uncomment if running the notebook for the first time

In [34]:
# !pip install langchain
# !pip install openai

# Import Dependencies

In [35]:
import os
from typing import Dict, List
import json
import random
import gdown

from langchain import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.chains.base import Chain

# Set OpenAI API Key

In [36]:
open_ai_api_key = "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"
os.environ["OPENAI_API_KEY"] = open_ai_api_key

# Go over the [LangChain](https://langchain.readthedocs.io/en/latest/index.html) documents and figure out how to set temperature for your requests

In [37]:
default_temp = 0.7
llm = OpenAI(model_name="text-davinci-003", 
             temperature=default_temp)

# 4.) Implement problem 3 using LangChain's LLM and the PromptTemplate classes and check if it works.

In [38]:
country_of_origin = "India"
template_1 = "Question: Come up with 10 girl baby names for babies born in {country_of_origin} which start with letter 'A and end with 'I.'\nAnswer: "
prompt_1 = PromptTemplate(template=template_1, 
                          input_variables=["country_of_origin"])
formatted_prompt_1 = prompt_1.format(country_of_origin=country_of_origin)

In [39]:
llm_answer_1 = llm(formatted_prompt_1)
print(f"{formatted_prompt_1}{llm_answer_1}")

Question: Come up with 10 girl baby names for babies born in India which start with letter 'A and end with 'I.'
Answer: 
1. Aashi 
2. Amri 
3. Asmi 
4. Anvi 
5. Arvi 
6. Aarvi 
7. Ayushi 
8. Aditi 
9. Aashiqi 
10. Aasri


# 5.) Create a new prompt that takes a baby name and the country of origin, and comes up with a short (made up) biography. 

In [40]:
baby_name = "Kunal"
country_of_origin = "India"
template_2 = "Question: Come up with a short (made up) biography for a baby named {baby_name}, born in {country_of_origin}.\nAnswer: "
prompt_2 = PromptTemplate(template=template_2, 
                         input_variables=["country_of_origin", "baby_name"])
formatted_prompt_2 = prompt_2.format(baby_name=baby_name, 
                                     country_of_origin=country_of_origin)

In [41]:
llm_answer_2 = llm(formatted_prompt_2)
print(f"{formatted_prompt_2}{llm_answer_2}")

Question: Come up with a short (made up) biography for a baby named Kunal, born in India.
Answer: 
Kunal was born in India to loving parents who were excited to welcome him into the world. Growing up, Kunal was a curious and vibrant child, always eager to explore and try new things. He loved the outdoors, playing games with his friends, and learning about his culture. Kunal grew up to be an intelligent and kind-hearted young man, and he continues to explore the world around him with a deep curiosity. He is passionate about making a difference and is always looking for new ways to help those around him.


# 6.) Follow the [LangChain example to create a custom chain class](https://langchain.readthedocs.io/en/latest/modules/chains/getting_started.html#create-a-custom-chain-with-the-chain-class), to create class that returns a list of dicts of {baby_name, biography}. You have to use the prompts you created in 4 and 5.

In [42]:
class CustomChain(Chain):
    # Baby name generator chain
    chain_1: LLMChain
    # Biography generator chain
    chain_2: LLMChain

    @property
    def input_keys(self) -> List[str]:
        return list(self.chain_1.input_keys)

    @property
    def output_keys(self) -> List[str]:
        return ['output']

    def _call(self, inputs: Dict[str, str]):
        results = list()
        # Get a list of baby names from the first chain
        output_1 = self.chain_1.run(inputs)
        output_1 = output_1.strip().splitlines()
        # For each baby name, get a biography from the second chain
        for i in output_1:
            temp = dict()
            baby_name = i.split(' ')[1]
            inputs['baby_name'] = baby_name
            output_2 = self.chain_2.run(inputs).strip()
            temp['baby_name'] = baby_name
            temp['biography'] = output_2
            results.append(temp)
        return {'output': results}
    
chain_1 = LLMChain(llm=llm, prompt=prompt_1)
chain_2 = LLMChain(llm=llm, prompt=prompt_2)

custom_chain = CustomChain(chain_1=chain_1, chain_2=chain_2)
custom_chain_output = custom_chain.run(country_of_origin=country_of_origin)
print(f"Custom Chain Output:\n{custom_chain_output}")

Custom Chain Output:
[{'baby_name': 'Abhiri', 'biography': 'Abhiri is a happy baby born in India to loving parents. She loves to giggle, smile, and explore her world. She loves playing with her stuffed animals and enjoys learning new things. She loves spending time with her family and is always up for a cuddle. Abhiri is a bright and curious little girl who loves life and is full of joy.'}, {'baby_name': 'Adhiti', 'biography': 'Adhiti was born in India to a loving family. She came into the world full of life and ready to explore. As she grows, she is a curious child who loves to learn and ask questions. She is an adventurous soul who loves to explore her surroundings and find new things to do. She is passionate about music and loves to sing and dance. Adhiti is a bright and happy baby who brings joy to all she meets.'}, {'baby_name': 'Ahanti', 'biography': "Ahanti is a newborn baby from India, born into a loving and supportive family. She is a bundle of joy and her parents are delighte

# ArXiv Bulletin
Every day several hundred, if not more, papers appear on ArXiv. For this task you will implement an ArXiv bulletin that gives a list of paper titles and why I should read it in one sentence. Your final output should look something like this (the exact papers will be different):

1. Summarizing Encyclopedic Term Descriptions on the Web <br/>
Why to read: This paper presents a summarization method to produce a single text from multiple descriptions to concisely describe a term from different viewpoints. <br/>
2. Unsupervised Topic Adaptation for Lecture Speech Retrieval <br/>
Why to read: This paper presents a novel approach to improve the quality of a cross-media information retrieval system by adapting acoustic and language models for automatic speech recognition. <br/>

Download a sample of the NLP ArXiv dataset from here. It has metadata for 100 NLP papers as JSON records. For this exercise, you will randomly pick 10 records to show a proof of concept.

In [48]:
gdown.download('https://drive.google.com/uc?id=1_1oPRNSW7QWdlUs-APMV5Y7h6RxU_8gF')
with open('cs.cl.sample100.json') as f:
    data = f.readlines()
parsed = [json.loads(x) for x in data]
sample10 = random.choices(parsed, k=10)
# paper_list is a list of tuples of the form (title, abstract)
paper_list = [(i['title'].strip(), i['abstract'].strip()) for i in sample10]
print(f"Number of Research Papers:\n{len(paper_list)}")

Downloading...
From: https://drive.google.com/uc?id=1_1oPRNSW7QWdlUs-APMV5Y7h6RxU_8gF
To: /Users/kushagraseth/Documents/Repositories/nlp-244-kushagra/quest-2/cs.cl.sample100.json
100%|██████████| 135k/135k [00:00<00:00, 1.89MB/s]

Number of Research Papers:
10





You will have to use LangChain for this problem and come up with a suitable zero-shot prompt for "why". Be creative with your prompts and use the prompting best practices as your guide. You can experiment with a few prompt variations on for a single paper in Playground before you use it in your code. Make sure your code runs at T=0.

In [44]:
new_temp = 0
llm = OpenAI(model_name="text-davinci-003", 
             temperature=new_temp)

In [45]:
for i in paper_list:
    paper_title = i[0]
    abstract = i[1]
    template =  """Research Paper Title: {paper_title}.\nAbstract: {abstract}.\nSummarize in one sentence why should 
    I read this paper as an Natural Language Processing (NLP) student given Research Paper's Title and Abstract: 
    """
    prompt = PromptTemplate(template=template, 
                            input_variables=["paper_title", "abstract"])
    formatted_prompt = prompt.format(paper_title=paper_title, 
                                     abstract=abstract)
    llm_answer = llm(formatted_prompt)
    print(f"{paper_title}.\nWhy to read: {llm_answer}\n")

Effects of Language Modeling on Speech-driven Question Answering.
Why to read: 
This paper presents a novel approach to speech-driven question answering by integrating automatic speech recognition and question answering, and evaluates its performance using a TREC-style evaluation workshop.

Proofing Tools Technology at Neurosoft S.A..
Why to read: 
This paper provides an overview of the development of proofing tools for Modern Greek at Neurosoft S.A., discussing their efficiencies and inefficiencies, as well as improvement ideas and future directions, making it an essential read for NLP students.

Mapping Multilingual Hierarchies Using Relaxation Labeling.
Why to read: 
This paper presents a new and robust approach for automatically constructing a multilingual Lexical Knowledge Base from pre-existing lexical resources, which could be applied to enrich and improve existing NLP databases.

Thematic Annotation: extracting concepts out of documents.
Why to read: 
This paper presents a nove