# Milestone 2 Model
  * Please use GPUs to run this in a shorter time frame. Without them, a few of the cells may take an hour or more to run.

## Imports

In [None]:
!pip install poetry
!pip install langchain
!pip install sentence_transformers
!pip install openai
!pip install pypdf
!poetry config virtualenvs.in-project true
!poetry install --no-ansi

In [None]:
import os, sys

# Site-packages of the Poetry virtualenv kept on Google Drive, and the local
# alias it is exposed under.
# NOTE(review): assumes Drive is already mounted at /content/gdrive — confirm.
VENV_PATH = "/content/gdrive/MyDrive/test-poetry/.venv/lib/python3.8/site-packages"
LOCAL_VENV_PATH = '/content/venv' # local notebook

# Create the link only once: os.symlink raises FileExistsError when the target
# already exists, which would break every re-run of this cell.
if not os.path.lexists(LOCAL_VENV_PATH):
    os.symlink(VENV_PATH, LOCAL_VENV_PATH) # connect to directory in drive

# Put the linked packages first on the import path, without stacking
# duplicate entries on re-runs.
if LOCAL_VENV_PATH not in sys.path:
    sys.path.insert(0, LOCAL_VENV_PATH)

In [None]:
from langchain.storage import InMemoryStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import DirectoryLoader
from langchain.retrievers.multi_vector import MultiVectorRetriever
from sentence_transformers import SentenceTransformer, util
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceHubEmbeddings
from langchain.llms import OpenAI
from langchain.document_loaders import DirectoryLoader, UnstructuredFileLoader, TextLoader, Docx2txtLoader, PyPDFDirectoryLoader, PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.prompts import PromptTemplate
from openai import OpenAI
from langchain.chat_models import ChatOpenAI

import textwrap
import nltk
import os
import pandas as pd
import glob

## Data + Model Set-up

In [None]:
# Load the test/dev spreadsheet and pull its 'Test/Dev Input' column into a plain list.
# NOTE(review): `mylist` is not referenced again in the visible part of this
# notebook — confirm it is still needed.
df = pd.read_excel('530_project_test_dev.xlsx')
mylist = df['Test/Dev Input'].tolist()

In [None]:
# Retrieval settings used as the retriever's search kwargs and as the defaults of `ask`.
NUMBER_OF_RESULTS = 10
SEARCH_DISTANCE_THRESHOLD = 0.6

# SECURITY: the API key must never be written into the notebook. It is read
# from the environment; any key that has ever been committed to a notebook
# must be treated as leaked and revoked.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "").strip()
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Provide it through the environment "
        "(or your platform's secret store) before running this cell; "
        "do not hardcode it in the notebook."
    )
# Re-export the stripped value so libraries that read the variable see it.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
# Chat model handed to the RetrievalQA chain to generate the feature lists.
# NOTE(review): temperature=1 presumably means sampled output, so re-running
# the notebook will not reproduce the same answers — confirm this is intended.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-16k",
    temperature=1
)

In [None]:
# In-memory document store.
# NOTE(review): `store` is assigned again in the retriever cell, so this
# instance is replaced before it is used.
store = InMemoryStore()

# Embedding model passed to the Chroma vector store as its embedding function.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/msmarco-bert-base-dot-v5')

In [None]:
!unzip TrainingDataSmall.zip

Archive:  TrainingDataSmall.zip
replace TrainingDataSmall/Weijie Su_document.pdf? [y]es, [n]o, [A]ll, [N]one, [r]ename: yes
  inflating: TrainingDataSmall/Weijie Su_document.pdf  
replace TrainingDataSmall/Weijie Su_2307.02792.pdf? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
%%time
# Load the PDFs found under TrainingDataSmall/.
loader = PyPDFDirectoryLoader("TrainingDataSmall")
docs = loader.load()
# Show only how many documents were loaded: echoing `docs` itself dumps the
# full text of every document into the cell output.
len(docs)

In [None]:
%%time
# Split the loaded documents into chunks that will be embedded and indexed.
# NOTE(review): with chunk_size=100000 most inputs are presumably never split
# at all — confirm this is compatible with the embedding model's input limit.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100000,
                                               chunk_overlap=50,
                                               separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""], )
all_splits = text_splitter.split_documents(docs)

CPU times: user 235 ms, sys: 994 µs, total: 236 ms
Wall time: 237 ms


In [None]:
!pip install chromadb

In [None]:
%%time
# Chroma collection that will hold the chunk vectors; `embeddings` is supplied
# as its embedding function.
# NOTE(review): no persist directory is passed — presumably the collection
# lives only for this session; confirm.
vectorstore = Chroma(
    collection_name="professor-data",
    embedding_function=embeddings
)

CPU times: user 615 ms, sys: 23.8 ms, total: 638 ms
Wall time: 730 ms


In [None]:
%%time
# Metadata key under which each chunk's id is stored, and a fresh in-memory
# docstore for the retriever.
id_key = "doc_id"
store = InMemoryStore()

# Retriever combining the Chroma collection with the docstore; the search
# settings come from the configuration constants.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
    search_type="similarity",
    search_kwargs={"k": NUMBER_OF_RESULTS, "search_distance": SEARCH_DISTANCE_THRESHOLD},
)

CPU times: user 151 µs, sys: 0 ns, total: 151 µs
Wall time: 156 µs


In [None]:
%%time
import uuid

# One random UUID string per chunk, positionally aligned with `all_splits`.
doc_ids = [str(uuid.uuid4()) for _ in all_splits]

# Record each chunk's id in its metadata under `id_key`.
for chunk_id, chunk in zip(doc_ids, all_splits):
    chunk.metadata[id_key] = chunk_id

CPU times: user 15.7 ms, sys: 28 µs, total: 15.7 ms
Wall time: 14.4 ms


In [None]:
%%time
# Add every chunk to the vector store, then store the same chunks in the
# docstore keyed by the ids generated for them.
# NOTE(review): re-running this cell presumably adds the chunks to the
# collection a second time — confirm before re-executing.
retriever.vectorstore.add_documents(all_splits)
retriever.docstore.mset(list(zip(doc_ids, all_splits)))

CPU times: user 1min 33s, sys: 1.49 s, total: 1min 34s
Wall time: 1min 36s


In [None]:
# Links to papers
#https://www.cis.upenn.edu/~mkearns/papers/pricemodel.pdf
#https://arxiv.org/pdf/2211.11158.pdf
#https://link.springer.com/content/pdf/10.1023/A:1017984413808.pdf

In [None]:
# Few-shot prompt used by the QA chain. It has two placeholders: {question}
# (the query being asked) and {context} (presumably filled with the retrieved
# document text — confirm against the chain configuration). The three worked
# examples follow the pattern "<abstract>: [<comma-separated topics>],".
template = """
Create a list of features, concepts, and related topics from each papers that can be used for machine learning

Question: {question}

This is an example of what you have to do:
Concept Bottleneck Models (CBM) are inherently interpretable models that factor model decisions into humanreadable concepts. They allow people to easily understand
why a model is failing, a critical feature for high-stakes applications. CBMs require manually specified concepts and
often under-perform their black box counterparts, preventing
their broad adoption. We address these shortcomings and
are first to show how to construct high-performance CBMs
without manual specification of similar accuracy to black
box models. Our approach, Language Guided Bottlenecks
(LaBo), leverages a language model, GPT-3, to define a
large space of possible bottlenecks. Given a problem domain,
LaBo uses GPT-3 to produce factual sentences about categories to form candidate concepts. LaBo efficiently searches
possible bottlenecks through a novel submodular utility that
promotes the selection of discriminative and diverse information. Ultimately, GPT-3’s sentential concepts can be aligned
to images using CLIP, to form a bottleneck layer. Experiments demonstrate that LaBo is a highly effective prior for
concepts important to visual recognition. In the evaluation
with 11 diverse datasets, LaBo bottlenecks excel at few-shot
classification: they are 11.7% more accurate than black
box linear probes at 1 shot and comparable with more data.
Overall, LaBo demonstrates that inherently interpretable
models can be widely applied at similar, or better, performance than black box approaches.: [Language Guided Bottlenecks, Concept Bottleneck Models, Machine learning, Data Science, GPT-3, LLMs, Natural Language Processing, Computer Vision, Image Processing],

We examine a Markovian model for the price evolution of a
stock, in which the probability of local upward or downward movement
is arbitrarily dependent on the current price itself (and perhaps some auxiliary state information).
This model directly and considerably generalizes many of the most well-studied price evolution models in classical finance, including a variety of random walk, drift and diffusion models.
Our main result is a universally profitable" trading strategy | a single fixed strategy whose profitability competes with the optimal strategy
(which knows all of the underlying parameters of the infinite and possibly
nonstationary Markov process): [Trading, Markovian Model, High Frequency Trading, Pricing Models, Quantitative Finance, Classical Finance, Random Walk, Statistics, Mathematical Finance, Probabilistic Models],

We present new algorithms for reinforcement learning and prove that they have polynomial bounds
on the resources required to achieve near-optimal return in general Markov decision processes. After observing
that the number of actions required to approach the optimal return is lower bounded by the mixing time T of
the optimal policy (in the undiscounted case) or by the horizon time T (in the discounted case), we then give
algorithms requiring a number of actions and total computation time that are only polynomial in T and the number
of states and actions, for both the undiscounted and discounted cases. An interesting aspect of our algorithms is
their explicit handling of the Exploration-Exploitation trade-off.: [Reinforcement Learning, Markov Decision Process, Artificial Intelligence, Algorithmic Run-time],

{context}:"""

In [None]:
%%time
# Prompt for the "stuff" chain. The template contains both {context} and
# {question}, so both are declared: declaring only "context" fails template
# validation on LangChain versions that check it.
qa_prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

# Retrieval QA chain: retrieve with `retriever`, place the retrieved documents
# into the prompt, and answer with `llm`. Source documents are returned so
# that they can be printed next to each answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
    chain_type_kwargs={"prompt": qa_prompt},
)

# Turn on verbose logging at every level of the chain.
qa.combine_documents_chain.verbose = True
qa.combine_documents_chain.llm_chain.verbose = True
qa.combine_documents_chain.llm_chain.llm.verbose = True

CPU times: user 750 µs, sys: 0 ns, total: 750 µs
Wall time: 760 µs


In [None]:
# Professors to query, one retrieval question per entry. Names carry no
# leading or trailing whitespace: they are interpolated into the question
# text ("...of <name>'s work?") and stored as the 'professor' column.
# NOTE(review): a few spellings look unusual (e.g. 'Konrad Koering',
# 'Surbi Goel', 'Steven Zdancemic', 'Swapneel Seth', 'Renee Vidal') — confirm
# them against the source roster before changing anything.
professor_list = [
    'Bong Ho Kim',
    'Junhyong Kim',
    'Daniel E. Koditschek',
    'Konrad Koering',
    'Lingjie Liu',
    'Adam David Mally',
    'Andre Scedrov',
    'Ryan Marcus',
    'Linh Thi Xuan Phan',
    'Rahul Mangharam',
    'Tal Rabin',
    'Vincent Liu',
    'Jérémie O. Lumbroso',
    'George J. Pappas',
    'Qi Long',
    'Nikolai Matni',
    'Harvey Rubin',
    'Mayur Naik',
    'Joshua B. Plotkin',
    'Dan Roth',
    'Alejandro Ribeiro',
    'Travis Q. McGaha',
    'Mark L. Liberman',
    'Robin Pemantle',
    'Danaë Metaxa',
    'Victor M. Preciado',
    'Benjamin C. Pierce',
    'Aaron Roth',
    'Pratyush Mishra',
    'Shirin Saeedi Bidokhti',
    'Boon Thau Loo',
    'Michael Posa',
    'Surbi Goel',
    'Jacob Gardner',
    'Rajiv Gandhi',
    'Eric Fouh',
    'Gushu Li',
    'Jing Li',
    'Insup Lee',
    'Benjamin Lee',
    'Stephen Lane',
    'Vijay Kumar',
    'Nadia Figueroa',
    'Joe Devietti',
    'Pratik Chaudhari',
    'Damon Centola',
    'Chris Callison-Burch',
    'Susan Davidson',
    'Osbert Bastani',
    'Yoseph Barash',
    'Sharath Chandra Guntuku',
    'Andreas Haeberlen',
    'Daniel Hashimoto',
    'Hamed Hassani',
    'Andrew Head',
    'Brett Hemenway',
    'Daniel J Hopkins',
    'M. Ani Hsieh',
    'Zachary Ives',
    'Sanjeev Khanna',
    'Michael Kearns',
    'Dinesh Jayaraman',
    'Kevin B. Johnson',
    'Yasmin Kafai',
    'Sampath K. Kannan',
    'Sebastian Angel',
    'Rajeev Alur',
    'Shivani Agarwal',
    'James Gee',
    'Jean Gallier',
    'Thomas Farmer',
    'Eric Eaton',
    'Andre DeHon',
    'Anindya De',
    'Kostas Daniilidis',
    'Ryan Baker',
    'Justin Gottschlich',
    'Norman I. Badler',
    'Mingmin Zhao',
    'Val B. Tannen',
    'Mark Yatskar',
    'Mark Yim',
    'Li-San Wang',
    'Christopher S. Yoo',
    'Charles Yang',
    'Camillo Taylor',
    'Cynthia Sung',
    'Duncan Watts',
    'Eric Weingarten',
    'Eric Wong',
    'Harry Smith',
    'Jianbo Shi',
    'Jonathan Smith',
    'Lyle Ungar',
    'Oleg Sokolsky',
    'Renee Vidal',
    'Rakesh Vohra',
    'Scott Weinstein',
    'Stephanie Weirich',
    'Steven Zdancemic',
    'Swapneel Seth',
    'Weijie Su',
]

In [None]:
%%time
# One retrieval question per professor, in the same order as `professor_list`.
# Names are stripped first: an entry with trailing whitespace would otherwise
# yield a malformed question such as "... of Junhyong Kim 's work?".
professor_question_list = [
    "What are key features of {}'s work?".format(name.strip())
    for name in professor_list
]

CPU times: user 48 µs, sys: 0 ns, total: 48 µs
Wall time: 50.5 µs


## Results
- Use this section to get the results for either the weak or strong baseline

In [None]:
def formatter(result):
    """Print a QA result in a readable layout and return the answer text.

    Args:
        result: Mapping returned by the QA chain. ``result['query']`` and
            ``result['result']`` are required. If ``'source_documents'`` is
            present, each entry's ``page_content`` is printed together with
            whichever of the ``score``, ``source`` and ``document_name``
            metadata fields exist.

    Returns:
        ``result['result']`` exactly as the chain produced it. The wrapping
        done by ``wrap`` is applied to the printed copy only: the return
        value is collected as data by callers, and line breaks inserted for
        display would otherwise end up inside the stored text.
    """
    dotted_rule = "." * 80
    dashed_rule = "-" * 80

    print(f"Query: {result['query']}")
    print(dotted_rule)
    if "source_documents" in result:
        for idx, ref in enumerate(result["source_documents"]):
            print(dashed_rule)
            print(f"REFERENCE #{idx}")
            print(dashed_rule)
            # Only the metadata fields that are actually present are shown.
            for key, label in (
                ("score", "Matching Score"),
                ("source", "Document Source"),
                ("document_name", "Document Name"),
            ):
                if key in ref.metadata:
                    print(f"{label}: {ref.metadata[key]}")
            print(dotted_rule)
            print(f"Content: \n{wrap(ref.page_content)}")
    print(dotted_rule)
    answer = result['result']
    print(f"Response: {wrap(answer)}")
    print(dotted_rule)
    return answer

def wrap(s):
    """Hard-wrap ``s`` at 120 columns while keeping its existing line breaks.

    Each input line is wrapped on its own. Wrapping the whole string in one
    call would collapse every newline into a space, flattening bulleted or
    multi-paragraph text into a single paragraph.

    Args:
        s: Text to wrap.

    Returns:
        The wrapped text joined with ``"\\n"``. Words longer than the width
        are left unbroken. An empty or whitespace-only input yields ``""``.
    """
    wrapped_lines = []
    for line in s.splitlines():
        # textwrap.wrap returns [] for a blank line; keep it as an empty line
        # so paragraph breaks survive.
        wrapped_lines.extend(
            textwrap.wrap(line, width=120, break_long_words=False) or [""]
        )
    return "\n".join(wrapped_lines)

def ask(query, qa=qa, k=NUMBER_OF_RESULTS, search_distance=SEARCH_DISTANCE_THRESHOLD):
    """Run one query through the QA chain, print the result and return the answer.

    Args:
        query: Question text passed to the chain under the ``"query"`` key.
        qa: Chain to call; defaults to the notebook-level ``qa`` bound when
            this function was defined.
        k: Value written to the retriever's ``search_kwargs["k"]``.
        search_distance: Value written to ``search_kwargs["search_distance"]``.

    Returns:
        Whatever ``formatter`` returns for the chain's result.

    Note:
        The retriever's ``search_kwargs`` are modified in place, so the
        settings remain on ``qa`` after the call.
    """
    qa.retriever.search_kwargs.update({"search_distance": search_distance, "k": k})
    return formatter(qa({"query": query}))

In [None]:
%%time
#Please note this cell will take between 15 and 20 minutes to run with GPUs
import time

# Pause before each question.
# NOTE(review): presumably there to stay under the API rate limit — confirm.
REQUEST_DELAY_SECONDS = 7

# Ask one question per professor; answers are kept in question order.
results = []
for question in professor_question_list:
    time.sleep(REQUEST_DELAY_SECONDS)
    print(question)
    results.append(ask(question))

# Report the count only: printing `results` repeats every answer on top of
# the verbose chain output already produced above.
print(f"Collected {len(results)} answers")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
What are key features of Renee Vidal's work?


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Create a list of features, concepts, and related topics from each papers that can be used for machine learning

Question: What are key features of Renee Vidal's work?

This is an example of what you have to do:
Concept Bottleneck Models (CBM) are inherently interpretable models that factor model decisions into humanreadable concepts. They allow people to easily understand
why a model is failing, a critical feature for high-stakes applications. CBMs require manually specified concepts and
often under-perform their black box counterparts, preventing
their broad adoption. We address these shortcomings and
are first to show how to construct high-performance CBMs
without manual specification of similar a

In [None]:
import numpy as np

# Two-column table pairing each professor with the answer collected for them;
# both inputs are expected to be in the same order and of the same length.
results_df = pd.DataFrame({
    'professor': np.array(professor_list),
    'features': np.array(results),
})

In [None]:
# Display the professor/features table.
results_df

Unnamed: 0,professor,features
0,Bong Ho Kim,"In this paper, the authors discuss the concept..."
1,Junhyong Kim,Key features of Junhyong Kim's work: - Machine...
2,Daniel E. Koditschek,- Adaptive Query Processing - Transdiagnostic ...
3,Konrad Koering,- Database Provenance - Diagnostic and Forensi...
4,Lingjie Liu,- Large Language Models (LLMs) - ChatGPT - Dat...
...,...,...
97,Scott Weinstein,From Scott Weinstein's work: - Language Guided...
98,Stephanie Weirich,"Concepts, features, and related topics from St..."
99,Steven Zdancemic,Key Features of Steven Zdancemic's Work: - La...
100,Swapneel Seth,Concept: - Workﬂow provenance - Database-style...


In [None]:
# Save the raw (un-split) answers.
# NOTE(review): the clean-up section later writes to this same file name,
# replacing this version — confirm that is intended.
results_df.to_csv('professor_research_results.csv')

## Clean-up

In [None]:
def text_split(x):
    """Split one answer string into its individual feature strings.

    The text is cut at:
      * commas;
      * bullet dashes, i.e. a ``-`` that stands alone (start of text or after
        whitespace, and followed by whitespace or the end of the text);
      * list numbers such as ``1.`` or ``12.`` (one or two digits plus a dot)
        that stand alone in the same way.

    Hyphens inside words ("Callison-Burch", "GPT-3") and decimals ("11.7%")
    are therefore left intact, and two-digit list numbers are removed as a
    whole instead of leaving a stray digit behind.

    Args:
        x: Answer text to split.

    Returns:
        List of pieces with surrounding whitespace removed. Empty pieces are
        kept as ``''`` (for instance, the text before a leading bullet), so
        callers can filter them out; an empty input yields ``['']``.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    list_marker = r"(?<!\S)(?:-|\d{1,2}\.)(?=\s|$)|,"
    return [piece.strip() for piece in re.split(list_marker, x)]

In [None]:
# Split every answer into a list of feature strings (one list per professor).
results_df['features_new'] = results_df['features'].apply(text_split)

In [None]:
# Expand each list in 'features_new' into one row per feature; the original
# row label is repeated for every piece that came from the same professor.
results_df = results_df.explode('features_new')

In [None]:
# The raw answer column is not needed once it has been split. The frame is
# reassigned rather than mutated in place, and a missing column is ignored,
# so re-running this cell does not raise KeyError.
results_df = results_df.drop(columns=['features'], errors='ignore')

In [None]:
# Spot-check a slice of the exploded table (positions 150 up to, not including, 190).
results_df[150:190]

Unnamed: 0,professor,features_new
8,Linh Thi Xuan Phan,\nEncryption
8,Linh Thi Xuan Phan,Computer Science.
9,Rahul Mangharam,Key features of Rahul Mangharam's work include:
9,Rahul Mangharam,Intersection of formal methods
9,Rahul Mangharam,machine learning
9,Rahul Mangharam,and controls for\nmedical devices
9,Rahul Mangharam,energy efficient buildings
9,Rahul Mangharam,and autonomous systems
9,Rahul Mangharam,Research at the PRECISE Center and direction ...
9,Rahul Mangharam,Recognition and awards


In [None]:
# Remove rows that carry no feature text. Pieces are stripped first so that
# whitespace-only fragments are removed as well and the kept values have no
# stray leading or trailing newlines; missing values are dropped too.
results_df = results_df.assign(features_new=results_df['features_new'].str.strip())
results_df = results_df[
    results_df['features_new'].notna() & results_df['features_new'].ne('')
]

In [None]:
# Save the cleaned one-feature-per-row table. This uses the same file name as
# the save in the Results section, so that earlier file is overwritten.
# NOTE(review): the row index (repeated per professor after the explode step)
# is presumably written as the first CSV column — confirm downstream readers expect it.
results_df.to_csv('professor_research_results.csv')