In [1]:
import json
import torch
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.core.schema import MetadataMode
from llama_index.finetuning import generate_qa_embedding_pairs, SentenceTransformersFinetuneEngine
from llama_index.llms.openai import OpenAI
from sklearn.model_selection import train_test_split
import gc
import os
import openai

In [2]:
#we have to set up open ai key to use open ai service from llama_index
os.environ["OPENAI_API_KEY"] = "sk-"
openai.api_key = os.environ["OPENAI_API_KEY"]

In [3]:
from llama_index.core import SimpleDirectoryReader

# This part is using SimpleDirectoryReader to process the data into chunks of sentences
documents = SimpleDirectoryReader(
    input_files=['materials_formatted.md']
).load_data(show_progress=True)

# Shuffle the documents
import random

random.seed(42)
random.shuffle(documents)

Loading files: 100%|██████████| 1/1 [00:00<00:00,  1.92file/s]


In [4]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
#we create chromadb as our database 
# we do not have to redo the data process and just reload the data from db 

db = chromadb.PersistentClient(path="./trainDataBase")
chroma_collection = db.get_or_create_collection("FinlaProjectTrainData")

In [5]:
from llama_index.core import VectorStoreIndex,StorageContext
#then this the index part for data we can get the response and know where the response come from 
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

db_index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)


In [6]:
db_index.storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x29c9c2450>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x29f590090>, vector_stores={'default': ChromaVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, collection_name=None, host=None, port=None, ssl=False, headers=None, persist_dir=None, collection_kwargs={}), 'image': <llama_index.core.vector_stores.simple.SimpleVectorStore object at 0x29f7fa190>}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x29ca69a90>)

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
# the question and answer pair is genrate by GPT 4
# since the auto generate service for open ai from llama-index is no longer used. 
# we store all the question to csv file

questions_context_df = pd.read_csv('question_context.csv')
questions_context_df.head()

Unnamed: 0,Question,Content,Type
0,What is the main focus of the course mentioned...,The main focus of the course is on the tools p...,Lecture
1,What is one of the most successful application...,Machine translation is one of the earliest and...,Lecture
2,What fundamental problem does the note identif...,One fundamental problem in building language-l...,Lecture
3,What does the 'distributional hypothesis' sugg...,The distributional hypothesis suggests that th...,Lecture
4,"According to the note, what major challenge do...",Most existing tools work for precious few (usu...,Lecture


In [8]:
questions = questions_context_df['Question'].values

In [9]:
db_engine = db_index.as_query_engine()

In [12]:
response = db_engine.query("who teach NLP?")
print(response)

Hamidreza Mahyar teaches Natural Language Processing (NLP).


In [9]:
print(questions)

['What is the main focus of the course mentioned in the note?'
 'What is one of the most successful applications of Natural Language Processing?'
 'What fundamental problem does the note identify in building language-learning machines?'
 "What does the 'distributional hypothesis' suggest about word meanings?"
 'According to the note, what major challenge do current NLP tools face?'
 "What does the term 'signifier' refer to in the context of the note?"
 'What does the note say about the ability of children to acquire language?'
 "What kind of model does the 'word 2 vec' algorithm represent each word as?"
 'What is a major limitation of human-annotated resources for word representation, according to the note?'
 'What significant insight about word vectors did the GloVe algorithm introduce?'
 'What model is introduced for training word vectors in the notes?'
 'What are the two main classes of methods for finding word embeddings mentioned?'
 'How does GloVe differ from previous word embedd

In [13]:
from llama_index.llms.openai import OpenAI
from llama_index.finetuning.callbacks import OpenAIFineTuningHandler
from llama_index.core.callbacks import CallbackManager
from llama_index.core import ServiceContext

# we set up a callback fucntion 
# this is because we want record all questions and answers when we send query to GPT 4 
# then it will generat jsonl file for use to finetune the model 
finetuning_handler = OpenAIFineTuningHandler()
callback_manager = CallbackManager([finetuning_handler])

gpt_4_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-4", temperature=0.3),
    context_window=2048, 
    callback_manager=callback_manager,
)


  gpt_4_context = ServiceContext.from_defaults(


In [14]:
from llama_index.core import VectorStoreIndex
# we reload the index data from VectorStoreIndex 

index = VectorStoreIndex.from_vector_store(vector_store, service_context=gpt_4_context)

query_engine = index.as_query_engine(similarity_top_k=2)


In [15]:
#we use question as a query and send it to GPT 4
# the callback manager will record all data 
for question in questions:
    response = query_engine.query(question)

In [20]:
# callback manager write the record to the file
finetuning_handler.save_finetuning_events("finetuning_qa_pairs.jsonl")

Wrote 730 examples to finetuning_qa_pairs.jsonl


In [17]:
from llama_index.finetuning import OpenAIFinetuneEngine
# this is the pretrained model we use to finetune
finetune_engine = OpenAIFinetuneEngine(
    "gpt-3.5-turbo",
    "finetuning_events.jsonl",
)

In [21]:
# the fine tune process
finetune_engine.finetune()

Num examples: 730
First example:
{'role': 'system', 'content': 'You are an AI Teaching Assistant designed to help students with their educational queries. Your goal is to provide accurate, clear, and helpful responses to questions related to coursework, study materials, and educational concepts. Always provide responses based on the information presented in the query without assuming prior knowledge. Please adhere to the following guidelines: 1. Provide direct answers to questions based on the provided context or known information. 2. Do not reference external sources unless specified in the query. 3. Maintain a supportive and educational tone in all interactions. 4. Avoid speculation and ensure your responses are grounded in factual or well-understood educational principles.'}
{'role': 'user', 'content': "Question: How does Hamidreza Mahyarmahyarh's research in Computational Natural Language Processing contribute to the field?"}
{'role': 'assistant', 'content': "Hamidreza Mahyarmahyar

In [22]:
finetune_engine.get_current_job()

FineTuningJob(id='ftjob-A6jPhmnPgVlYK7K1bKTdxLTx', created_at=1711508321, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal::97FTe9Ci', finished_at=1711514817, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-BwtNVnRGHyHMfKEqmdxuL8eJ', result_files=['file-fhztKMzB8jYJBPo570a2Kho3'], status='succeeded', trained_tokens=546723, training_file='file-RhnAlFF9kqkrkZJGyYXXAwZ7', validation_file=None, user_provided_suffix=None)


In [None]:
ft_model_name = "ft:gpt-3.5-turbo-0125:personal::97FTe9Ci"
from llama_index.llms.openai import OpenAI

ft_llm = OpenAI(model=ft_model_name, temperature=0.3)

In [None]:
from llama_index.core import VectorStoreIndex

index = VectorStoreIndex.from_vector_store(vector_store)

query_engine = index.as_query_engine(similarity_top_k=3, llm=ft_llm)

In [23]:
response = query_engine.query("What is the professor's name for SEP 775?")
print(response)


The professor's name for SEP 775 is Hamidreza Mahyar.
