# Install Dependencies
Uncomment if running the notebook for the first time

In [18]:
# !pip install langchain
# !pip install openai

# Import Dependencies

In [19]:
import os
from typing import Dict, List
import json
import random
import gdown

from langchain import PromptTemplate
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.chains.base import Chain

# Set OpenAI API Key

In [20]:
open_ai_api_key = "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"
os.environ["OPENAI_API_KEY"] = open_ai_api_key

# Go over the [LangChain](https://langchain.readthedocs.io/en/latest/index.html) documents and figure out how to set temperature for your requests

In [21]:
default_temp = 0.7
llm = OpenAI(model_name="text-davinci-003", 
             temperature=default_temp)

# 4.) Implement problem 3 using LangChain's LLM and the PromptTemplate classes and check if it works.

In [22]:
country_of_origin = "India"
template_1 = "Question: Come up with 10 girl baby names for babies born in {country_of_origin} which start and end with letter 'A.'\nAnswer: "
prompt_1 = PromptTemplate(template=template_1, 
                          input_variables=["country_of_origin"])
formatted_prompt_1 = prompt_1.format(country_of_origin=country_of_origin)

In [23]:
llm_answer_1 = llm(formatted_prompt_1)
print(f"{formatted_prompt_1}{llm_answer_1}")

Question: Come up with 10 girl baby names for babies born in India which start and end with letter 'A.'
Answer: 
1. Aarna 
2. Aarushi 
3. Aadhya 
4. Aashna 
5. Aayat 
6. Aanya 
7. Aarika 
8. Aayushi 
9. Aamani 
10. Aashi


# 5.) Create a new prompt that takes a baby name and the country of origin, and comes up with a short (made up) biography. 

In [24]:
baby_name = "Kunal"
country_of_origin = "India"
template_2 = "Question: Come up with a short (made up) biography for a baby named {baby_name}, born in {country_of_origin}.\nAnswer: "
prompt_2 = PromptTemplate(template=template_2, 
                         input_variables=["country_of_origin", "baby_name"])
formatted_prompt_2 = prompt_2.format(baby_name=baby_name, 
                                     country_of_origin=country_of_origin)

In [25]:
llm_answer_2 = llm(formatted_prompt_2)
print(f"{formatted_prompt_2}{llm_answer_2}")

Question: Come up with a short (made up) biography for a baby named Kunal, born in India.
Answer: 
Kunal was born in India to loving parents. As a baby, he was full of energy and curiosity, and often spent his days exploring the world around him. He had a natural talent for music, and loved to play the drums and the harmonium. He enjoyed learning and was an avid reader, always eager to find out more about the world around him. As he grew older, Kunal made his mark in the world, becoming a respected leader in his community. He was a devoted family man, and dedicated himself to helping others. Kunal made an impact on the world, and his memory will never be forgotten.


# 6.) Follow the [LangChain example to create a custom chain class](https://langchain.readthedocs.io/en/latest/modules/chains/getting_started.html#create-a-custom-chain-with-the-chain-class), to create class that returns a list of dicts of {baby_name, biography}. You have to use the prompts you created in 4 and 5.

In [26]:
class CustomChain(Chain):
    # Baby name generator chain
    chain_1: LLMChain
    # Biography generator chain
    chain_2: LLMChain

    @property
    def input_keys(self) -> List[str]:
        return list(self.chain_1.input_keys)

    @property
    def output_keys(self) -> List[str]:
        return ['output']

    def _call(self, inputs: Dict[str, str]):
        results = list()
        # Get a list of baby names from the first chain
        output_1 = self.chain_1.run(inputs)
        output_1 = output_1.strip().splitlines()
        # For each baby name, get a biography from the second chain
        for i in output_1:
            temp = dict()
            baby_name = i.split(' ')[1]
            inputs['baby_name'] = baby_name
            output_2 = self.chain_2.run(inputs).strip()
            temp['baby_name'] = baby_name
            temp['biography'] = output_2
            results.append(temp)
        return {'output': results}
    
chain_1 = LLMChain(llm=llm, prompt=prompt_1)
chain_2 = LLMChain(llm=llm, prompt=prompt_2)

custom_chain = CustomChain(chain_1=chain_1, chain_2=chain_2)
custom_chain_output = custom_chain.run(country_of_origin=country_of_origin)
print(f"Custom Chain Output:\n{custom_chain_output}")

Custom Chain Output:
[{'baby_name': 'Alaya', 'biography': 'Alaya was born in India to loving parents who have always nurtured her with a deep appreciation and respect for her heritage. As a child, Alaya was a quick learner, soaking up the culture, music, and language of her homeland. She loved spending time outdoors, exploring the countryside and playing in the sunshine. As she grew, Alaya developed a passion for knowledge, which has served her well in her academic studies. She is a bright and inquisitive young girl, always eager to learn more about the world around her. Alaya is a kind and compassionate soul, with a big heart and a determination to make the world a better place.'}, {'baby_name': 'Aanya', 'biography': 'Aanya was born in India to loving parents. She is the youngest of two siblings, and the family loves to take trips to the beach together. As a baby, Aanya loves to explore and is constantly learning new things. She loves to cuddle up with her parents and listen to storie

# ArXiv Bulletin
Every day several hundred, if not more, papers appear on ArXiv. For this task you will implement an ArXiv bulletin that gives a list of paper titles and why I should read it in one sentence. Your final output should look something like this (the exact papers will be different):

1. Summarizing Encyclopedic Term Descriptions on the Web <br/>
Why to read: This paper presents a summarization method to produce a single text from multiple descriptions to concisely describe a term from different viewpoints. <br/>
2. Unsupervised Topic Adaptation for Lecture Speech Retrieval <br/>
Why to read: This paper presents a novel approach to improve the quality of a cross-media information retrieval system by adapting acoustic and language models for automatic speech recognition. <br/>

Download a sample of the NLP ArXiv dataset from here. It has metadata for 100 NLP papers as JSON records. For this exercise, you will randomly pick 10 records to show a proof of concept.

In [27]:
gdown.download('https://drive.google.com/uc?id=1_1oPRNSW7QWdlUs-APMV5Y7h6RxU_8gF')
with open('cs.cl.sample100.json') as f:
    data = f.readlines()
parsed = [json.loads(x) for x in data]
sample10 = random.choices(parsed, k=10)
paper_list = [(i['title'].strip(), i['abstract'].strip()) for i in sample10]
print(f"Research Paper List:\n{paper_list}")

Downloading...
From: https://drive.google.com/uc?id=1_1oPRNSW7QWdlUs-APMV5Y7h6RxU_8gF
To: /Users/kushagraseth/Documents/Repositories/nlp-244-kushagra/quest-2/cs.cl.sample100.json
100%|██████████| 135k/135k [00:00<00:00, 2.03MB/s]

Research Paper List:
[('Pronoun Resolution in Japanese Sentences Using Surface Expressions and\n  Examples', "In this paper, we present a method of estimating referents of demonstrative\npronouns, personal pronouns, and zero pronouns in Japanese sentences using\nexamples, surface expressions, topics and foci. Unlike conventional work which\nwas semantic markers for semantic constraints, we used examples for semantic\nconstraints and showed in our experiments that examples are as useful as\nsemantic markers. We also propose many new methods for estimating referents of\npronouns. For example, we use the form ``X of Y'' for estimating referents of\ndemonstrative adjectives. In addition to our new methods, we used many\nconventional methods. As a result, experiments using these methods obtained a\nprecision rate of 87% in estimating referents of demonstrative pronouns,\npersonal pronouns, and zero pronouns for training sentences, and obtained a\nprecision rate of 78% for test sentences."),




You will have to use LangChain for this problem and come up with a suitable zero-shot prompt for "why". Be creative with your prompts and use the prompting best practices as your guide. You can experiment with a few prompt variations on for a single paper in Playground before you use it in your code. Make sure your code runs at T=0.

In [28]:
new_temp = 0
llm = OpenAI(model_name="text-davinci-003", 
             temperature=new_temp)

In [30]:
for i in paper_list:
    paper_title = i[0]
    abstract = i[1]
    template =  """Research Paper Title: {paper_title}.\nAbstract: {abstract}.\nSummarize in one sentence why should 
    I read this paper as an Natural Language Processing (NLP) student given Research Paper's Title and Abstract: 
    """
    prompt = PromptTemplate(template=template, 
                            input_variables=["paper_title", "abstract"])
    formatted_prompt = prompt.format(paper_title=paper_title, 
                                     abstract=abstract)
    llm_answer = llm(formatted_prompt)
    print(f"{paper_title}.\nWhy to read: {llm_answer}\n")

Pronoun Resolution in Japanese Sentences Using Surface Expressions and
  Examples.
Why to read: 
This paper presents a method of estimating referents of demonstrative pronouns, personal pronouns, and zero pronouns in Japanese sentences using examples, surface expressions, topics and foci, and achieves a precision rate of 87% for training sentences and 78% for test sentences.

A Formal Framework for Linguistic Annotation.
Why to read: 
This paper provides a formal framework for constructing, maintaining, and searching linguistic annotations, which is essential for NLP students to understand.

A Bootstrap Approach to Automatically Generating Lexical Transfer Rules.
Why to read: 
This paper presents a bootstrap approach to automatically generating lexical transfer rules from word equivalences, which can be used to create LTRs for multi-word, non-compositional word equivalences of any cardinality.

A Bootstrap Approach to Automatically Generating Lexical Transfer Rules.
Why to read: 
This 