In [1]:
import pandas as pd
from pprint import pprint 
import json
import openai
import re
import os
import pinecone
import time

import nltk
from nltk.tokenize import word_tokenize


import langchain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings

from langchain.schema import ( SystemMessage, HumanMessage, AIMessage )


from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())

  from tqdm.autonotebook import tqdm


True

In [2]:
# Service credentials and Pinecone settings, read from the process environment
# (load_dotenv is called in the import cell). Each value is None when unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.environ.get("PINECONE_INDEX_NAME")
PINECONE_ENVIRONMENT = os.environ.get("PINECONE_ENVIRONMENT")

In [3]:
# Location of the Jira conversations export
file_path = "./Data/jira-conversations2.json"

# Decode the file as UTF-8 and parse its full contents as JSON
with open(file_path, mode="r", encoding="utf-8") as f:
    data = json.loads(f.read())

In [4]:
# Work with the first 49 entries of the loaded data only
json_data = data[:49]

In [5]:
# Substitutions applied by clean_text, in this order. Order matters:
#  * the composite e-mail patterns ("<[[~~email~~N~~]]", "|mailto:[~~email~~N~~]")
#    must run before the bare "[~~email~~N~~]" pattern, otherwise the bare pattern
#    consumes the tag first and a stray "mailto:" is left in the text;
#  * the single-character strippers ("|", "_", "[", "]", "<", ">", "*") must run
#    after every pattern that depends on those characters.
_CLEAN_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'~+\+~+',                                   # "~+~" style separators
    r'\+\~',                                     # "+~"
    r'----',                                     # "----"
    r'\+\+',                                     # "++"
    r'~accountid:[a-zA-Z0-9]+',                  # "~accountid:<id>" mentions
    r'\{"type".*?\}\]',                          # embedded '{"type" ... }]' fragments
    r'\{adf\}.*?\{adf\}',                        # "{adf} ... {adf}" and the content within
    r'<\[\[~~email~~-?\d+~~\]\]',                # "<[[~~email~~-numbers~~]]"
    r'\|mailto:\[~~email~~-?\d+~~\]',            # "|mailto:[~~email~~-numbers~~]"
    r'\[~~email~~-?\d+~~\]',                     # bare "[~~email~~-numbers~~]"
    r'http[s]?://\S+',                           # URLs
    r'—-—-—-—',                                  # "—-—-—-—" divider
    r'\w{3}, \d{1,2} \w{3} \d{4}, \d{1,2}:\d{2} [apmAPM]{2}',  # timestamps
    r"\|\s+\|\s+You don't often get email from",  # mail header banner
    r'\[Powered by Service Management.*?\]',     # "[Powered by...]"
    r'\[View request.*?&reserved=0\]',           # "[View request...]"
    r'\*\*\*Please reply above this line\*\*\*',  # reply marker
    r'\|',                                       # "|"
    r'_',                                        # "_"
    r'\[mailto:\]',                              # "[mailto:]"
    r'\[|\]',                                    # "[" and "]"
    r'<|>',                                      # "<" and ">"
    r'\*',                                       # "*"
    r'!jira[-a-zA-Z0-9 ()]+!',                   # "!jira ...!" markup
))


def clean_text(text):
    """Strip ticket/e-mail markup from ``text`` and normalise its whitespace.

    Whitespace is collapsed to single spaces first (the timestamp and header
    patterns assume single-space separators), every pattern in
    ``_CLEAN_PATTERNS`` is then removed in order, and whitespace is collapsed
    again so that the removals do not leave double or trailing spaces behind.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, with no leading/trailing whitespace and no runs
        of consecutive whitespace.
    """
    text = ' '.join(text.split())
    for pattern in _CLEAN_PATTERNS:
        text = pattern.sub('', text)
    # Removing a span in the middle of a sentence leaves two adjacent spaces
    return ' '.join(text.split())

# Clean text in conversations
# Apply clean_text to every string field of every conversation, updating the
# conversation dicts in place; non-string fields are reported and left as-is
for conv in json_data:
    for key in list(conv):
        value = conv[key]
        if not isinstance(value, str):
            print(f"Skipped cleaning text for key '{key}' as it's not a string.")
            continue
        try:
            conv[key] = clean_text(value)
        except Exception as e:
            print(f"Error cleaning text in '{key}': {e}")

In [6]:
# Flatten each conversation dict into a single "'key': 'value', ..." string
texts = []
for item in json_data:
    pairs = [f"'{k}': '{v}'" for k, v in item.items()]
    texts.append(', '.join(pairs))

In [7]:
def count_tokens(json_data):
    """Count whitespace-separated words in all strings nested inside ``json_data``.

    Dict values and list items are visited recursively; dict keys are not
    counted. Any value that is not a dict, list or str contributes 0.
    """
    if isinstance(json_data, str):
        return len(json_data.split())

    if isinstance(json_data, dict):
        children = json_data.values()
    elif isinstance(json_data, list):
        children = json_data
    else:
        return 0

    total = 0
    for child in children:
        total += count_tokens(child)
    return total


# Total whitespace-separated word count across all values in the cleaned conversations
count_tokens(json_data)

11216

In [8]:
# Number of flattened conversation strings
len(texts)

49

In [9]:
# Display every flattened conversation string (this prints the whole list)
texts

["'question000001': 'Hii This is  gadipally UID-U6331114. In process of submitting DS160. I need to send address and phone number of point of contact(school official).So can you please send address and phone number of IRIS BRITO(School official to contact uon arrival).Can i also know the first and last name of IRIS BRITO. Thank you. ', 'response000002': 'Thank you for reaching out. You are able to add  at . Best, '",
 "'question000001': 'Hello, I am  with ID - U20309912. I have not received any mail regarding academic integrity course . Can you please send the link for academic integrity course and details regarding it?! Thanks in advance. ', 'response000002': 'Hello, provide me with your USF email.', 'question000003': 'Hello, USF mail Id is  Thanks in advance   ', 'response000004': 'Canvas invite has been sent via email.', 'question000005': 'Hello , I didn’t get any canvas invite to my mail. Can you please send it again?! Thanks in advance   ', 'response000006': 'Invite has been sent 

In [10]:
# Whitespace-separated word count for each flattened conversation
token_counts = list(map(lambda text: len(text.split()), texts))

# Same counts, largest first
sorted_token_counts = list(token_counts)
sorted_token_counts.sort(reverse=True)

print(sorted_token_counts)

[1264, 605, 401, 389, 363, 344, 339, 339, 334, 323, 313, 290, 283, 263, 249, 224, 220, 214, 202, 197, 197, 196, 194, 193, 190, 190, 188, 187, 184, 183, 180, 176, 175, 174, 166, 158, 147, 126, 122, 120, 113, 111, 111, 104, 101, 91, 69, 57, 49]


In [11]:
def split_into_chunks(text, max_tokens=1000):
    """Split ``text`` into consecutive chunks of at most ``max_tokens`` words.

    "Tokens" here are whitespace-separated words (``str.split()``), not model
    tokens. Words inside a chunk are re-joined with single spaces, so original
    whitespace (including newlines) is not preserved.

    Args:
        text: The string to split.
        max_tokens: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in original order. Empty or whitespace-only
        input yields an empty list, so callers never receive an empty chunk.

    Raises:
        ValueError: If ``max_tokens`` is less than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    words = text.split()
    # Every word counts as exactly one token, so fixed-stride slicing gives the
    # same chunks as filling each chunk word by word.
    return [
        ' '.join(words[start:start + max_tokens])
        for start in range(0, len(words), max_tokens)
    ]

In [12]:
class Document:
    """Minimal container pairing a piece of text with its metadata."""

    def __init__(self, page_content, metadata):
        """Store ``page_content`` and ``metadata`` unchanged as attributes."""
        self.metadata = metadata
        self.page_content = page_content

    def __repr__(self):
        """Return a ``Document(page_content='...', metadata=...)`` description."""
        template = "Document(page_content='{}', metadata={})"
        return template.format(self.page_content, self.metadata)

# Assuming the 'texts' list from previous code
# Wrap each flattened conversation in a Document; the text is stored both as
# the page content and under the 'text' metadata key
documents_list = []
for text in texts:
    documents_list.append(Document(page_content=text, metadata={'text': text}))

In [13]:
# Initialise the Pinecone client from the settings read in the configuration cell.
# This cell used to look the environment up under 'PINECONE_ENV', which is not the
# name the configuration cell reads (PINECONE_ENVIRONMENT). Prefer the configured
# value and fall back to the old name so existing .env files keep working.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT or os.getenv('PINECONE_ENV'),
)

In [14]:
# Create the index only when it does not exist yet
existing_indexes = pinecone.list_indexes()
if PINECONE_INDEX_NAME not in existing_indexes:
    pinecone.create_index(PINECONE_INDEX_NAME, dimension=1536, metric='cosine')
    # Poll once per second until the new index reports itself as ready
    while True:
        if pinecone.describe_index(PINECONE_INDEX_NAME).status['ready']:
            break
        time.sleep(1)

# Handle used for all later operations on the index
index = pinecone.Index(PINECONE_INDEX_NAME)

In [15]:
# Show the index statistics (the recorded output lists dimension, fullness and vector counts)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.05939,
 'namespaces': {'': {'vector_count': 5939}},
 'total_vector_count': 5939}

In [16]:
# Embedding client; its embed_query method is passed to the Pinecone vector stores built later
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

In [17]:
# Display the configured index name
PINECONE_INDEX_NAME

'langchain-index'

In [18]:
# Alias passed as index_name= to Pinecone.from_documents in the ingestion loops
index_name = PINECONE_INDEX_NAME

In [19]:
# Reuse the embedding client created above for ingestion, so documents and
# queries are embedded with the same configuration. The previous code built
# a second client via the `deployment=` keyword while `embed_model` uses
# `model=`, leaving two clients that had to be kept in sync by hand.
# NOTE: embedding calls made through this client are billed - use carefully.
embeddings = embed_model

In [20]:
print(f"Total number of Documents is: {len(documents_list)}")

# Embed and upsert one source document at a time, so a failure is reported for
# that document only and the loop carries on with the next one.
for doc_number, document in enumerate(documents_list, start=1):
    try:
        print(f"Processing document {doc_number}")

        # Split the document into chunks, dropping empty ones (nothing to embed)
        chunks = split_into_chunks(document.page_content)
        chunk_documents = [
            Document(page_content=chunk, metadata={'text': chunk})
            for chunk in chunks
            if chunk
        ]
        if not chunk_documents:
            continue

        # Send all chunks of this document in a single call instead of one
        # embedding request + upsert + new vector-store wrapper per chunk
        search = Pinecone.from_documents(chunk_documents, embeddings, index_name=index_name)

    except Exception as e:
        print(f"Error processing document {doc_number}: {e}")

Total number of Documents is: 49
Processing document 1
Processing document 2


KeyboardInterrupt: 

In [None]:
# Do a simple vector similarity search
query = "I am having a doubt"

result = search.similarity_search(query)
result

[Document(page_content="'question000001': 'Professor , I am having a doubt that we can take subject online in summer and go out USA(India) and complete the course ? Thanks, ', 'response000002': 'Hi, Did you not get the email from ISS? It explains the situation. Han , PhD Professor and Graduate Coordinator, MS BAIS College of Business, University of South Florida'"),
 Document(page_content="'question000001': 'Hi Greetings of the day, I am USF ID: Term of Entry: Fall 2023 I had got an admit letter for is Business Analytics and Information systems in USF for Fall 2023, but in the admit letter which is attached below, it is mentioned that, you are provisionally admitted since we are missing some documents i.e 1. Final Official Transcripts, 2. Awarded. Actually i uploaded all my transcripts and Provisional Certificate at the time of the admission application itself, Here in attachments i am attaching the screenshot of the same, I uploaded my visa and financial documents in the istart portal

In [None]:
# NOTE(review): Pinecone is already imported in the first cell; this repeats that import.
from langchain.vectorstores import Pinecone

text_field = "text"  # the metadata field that contains our text

# initialize the vector store object over the existing index handle,
# using embed_model.embed_query as the embedding function
vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

In [None]:
# Directory that is scanned for .txt files in the next cell
scrapped_pages_dir = "./Data/Scrapped Pages"

In [None]:
# Traverse the directory and get all .txt files
# Collect the .txt files in the directory. The names are sorted because
# os.listdir returns entries in arbitrary order, which would make the
# "Processing document N" numbering differ between runs and machines.
txt_files = sorted(f for f in os.listdir(scrapped_pages_dir) if f.endswith('.txt'))

In [None]:
# Read the contents of each file and store in a list
# Load every listed file into a Document; the content is stored both as the
# page content and under the 'text' metadata key
documents_list = []
for file in txt_files:
    page_path = os.path.join(scrapped_pages_dir, file)
    with open(page_path, mode='r', encoding='utf-8') as f:
        content = f.read()
    documents_list.append(Document(page_content=content, metadata={'text': content}))

In [None]:
print(f"Total number of Documents is: {len(documents_list)}")

# Embed and upsert one source document at a time, so a failure is reported for
# that document only and the loop carries on with the next one.
for doc_number, document in enumerate(documents_list, start=1):
    try:
        print(f"Processing document {doc_number}")

        # Split the document into chunks, dropping empty ones (nothing to embed)
        chunks = split_into_chunks(document.page_content)
        chunk_documents = [
            Document(page_content=chunk, metadata={'text': chunk})
            for chunk in chunks
            if chunk
        ]
        if not chunk_documents:
            continue

        # Send all chunks of this document in a single call instead of one
        # embedding request + upsert + new vector-store wrapper per chunk
        search = Pinecone.from_documents(chunk_documents, embeddings, index_name=index_name)

    except Exception as e:
        print(f"Error processing document {doc_number}: {e}")

Total number of Documents is: 91
Processing document 1
Processing document 2
Processing document 3
Processing document 4
Processing document 5
Processing document 6
Processing document 7
Processing document 8
Processing document 9
Processing document 10
Processing document 11
Processing document 12
Processing document 13
Processing document 14
Processing document 15
Processing document 16
Processing document 17
Processing document 18
Processing document 19
Processing document 20
Processing document 21
Processing document 22
Processing document 23
Processing document 24
Processing document 25
Processing document 26
Processing document 27
Processing document 28
Processing document 29
Processing document 30
Processing document 31
Processing document 32
Processing document 33
Processing document 34
Processing document 35
Processing document 36
Processing document 37
Processing document 38
Processing document 39
Processing document 40
Processing document 41
Processing document 42
Processing

In [None]:
# initialize the vector store object
vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

query = "Michelle Jahn"
vectorstore.similarity_search(query, k=3)

[Document(page_content='JONI JONES Campus Dean - Muma College of Business, Sarasota-Manatee Associate Professor jonijones@usf.edu Room: C216 Phone: 941-359-4234 Fax: (941) 359-4489 Vita Joni Jones is an associate professor in the School of Information Systems and Management. She also serves as campus dean for the Muma College of Business on the Sarasota-Manatee campus. She teaches graduate and undergraduate courses in systems analysis and design, business honors professional development and research methods. She previously taught introductory courses in computing as well as courses in C#, managerial statistics, business system application and design and software applications. Her research interests include electronic commerce, variable pricing mechanisms such as information and prediction markets and social network use in organizations. Her research has been published in the MIS Quarterly, Production and Operations Management, the Journal of E-Commerce, the INFORMS Journal on Computing

In [None]:
# Directory holding the syllabus text files
text_folder_path = "./Data/Syllabus 2023-selected/Text"

# Sorted because os.listdir returns entries in arbitrary order, which would make
# documents_list (and the "Processing document N" numbering) differ between runs.
txt_files = sorted(f for f in os.listdir(text_folder_path) if f.endswith('.txt'))

# Read the contents of each file and store in a list; the content is kept both
# as the page content and under the 'text' metadata key
documents_list = []
for file in txt_files:
    with open(os.path.join(text_folder_path, file), 'r', encoding='utf-8') as f:
        content = f.read()
    documents_list.append(Document(page_content=content, metadata={'text': content}))

In [None]:
print(f"Total number of Documents is: {len(documents_list)}")

# Embed and upsert one source document at a time, so a failure is reported for
# that document only and the loop carries on with the next one.
for doc_number, document in enumerate(documents_list, start=1):
    try:
        print(f"Processing document {doc_number}")

        # Split the document into chunks, dropping empty ones (nothing to embed)
        chunks = split_into_chunks(document.page_content)
        chunk_documents = [
            Document(page_content=chunk, metadata={'text': chunk})
            for chunk in chunks
            if chunk
        ]
        if not chunk_documents:
            continue

        # Send all chunks of this document in a single call instead of one
        # embedding request + upsert + new vector-store wrapper per chunk
        search = Pinecone.from_documents(chunk_documents, embeddings, index_name=index_name)

    except Exception as e:
        print(f"Error processing document {doc_number}: {e}")

Total number of Documents is: 282
Processing document 1
Processing document 2
Processing document 3
Processing document 4
Processing document 5
Processing document 6


KeyboardInterrupt: 

In [None]:
# Inspect the first loaded document
documents_list[0]

Document(page_content='COURSE SYLLABUS 
MAN3301:  Human Resource Management  
School of In formation Sys tems and Management  
Muma College of Business, University of South Florida 
Semester: Spring  2023 
Class Meeting Time: from January 10  to May  5, 202 2 
Class Meeting Location: Online  
Instructor: Prof. Terry A. Boyd  
Office Location: BSN 3516  
Office Hours: 5:00 – 6:00PM T hurs days (from August  23 to December  10, 202 1) or by appointment 
Email: boyd56@usf.edu   
I. Welcome Message
Welcome to Human Resource Management. This co urse wi ll introduce yo u to co ncepts found  
in the human resource literat ure that wi ll help yo u as yo u progres s in your care er. This co urse 
will help yo u to become fully acquainted with the academic co ncepts of HRM and apply them 
to day to day sit uatio ns at the workplace. Th e course wi ll be taugh t from the pers pective that 
you are or wi ll someday be a leader in an organizatio n with a grasp of the benefits of HRM.
II. University

In [None]:
# Vector store over the existing index handle; text_field names the metadata
# key that holds each chunk's text
vectorstore = Pinecone(
    index, embed_model.embed_query, text_field
)

# Retrieve the 3 closest matches and keep them in `sample` for the next cell
query = "USF Student Conduct Code"
sample = vectorstore.similarity_search(query, k=3)
sample

[Document(page_content='effort to provide support and equal access, USF has designated all faculty (TA, Ad junct, etc.) as Responsible Employees, who are required to report any disclosures of sexual harassment, sexual violence, relationship violence or stalking. The Title IX Office makes every effort, when safe to do so, to reach out and provide resources and a ccommodations, and to discuss possible options for resolution. Anyone wishing to make a Title IX report or seeking accommodations may do so online, in person, via phone, or email to the Title IX Office. For information about Title IX or for a full list of resources please visit: https://www.usf.edu/title - ix/gethelp/resources.aspx . If you are unsure what to do, please contact Victim Advocacy – a confidential resource that can re view all your options – at 813 -974-5756 or va@admin.usf.edu . Course Hero / Chegg Policy: The USF Policy on Aca demic Integrity specifies that students may not use websites that enable cheating, such 

In [None]:
# Tokenize the content of sample
tokens = word_tokenize(str(sample))

# Calculate the number of tokens
num_tokens = len(tokens)

print(f"The number of tokens in sample is: {num_tokens}")

The number of tokens in sample is: 3099


In [20]:
# Directory that is scanned for .txt files in the next cell
canvas_pages_dir = "./Data/Canvas/Text"

In [21]:
# Traverse the directory and get all .txt files
# Collect the .txt files in the directory. The names are sorted because
# os.listdir returns entries in arbitrary order, which would make the
# "Processing document N" numbering differ between runs and machines.
txt_files = sorted(f for f in os.listdir(canvas_pages_dir) if f.endswith('.txt'))

In [23]:
# Read the contents of each file and store in a list
# Load every listed file into a Document (content stored both as the page
# content and under the 'text' metadata key), then report how many were read
documents_list = []
for file in txt_files:
    canvas_path = os.path.join(canvas_pages_dir, file)
    with open(canvas_path, mode='r', encoding='utf-8') as f:
        content = f.read()
    documents_list.append(Document(page_content=content, metadata={'text': content}))
print(f"Total number of Documents is: {len(documents_list)}")


Total number of Documents is: 14


In [24]:
# Embed and upsert one source document at a time, so a failure is reported for
# that document only and the loop carries on with the next one.
for doc_number, document in enumerate(documents_list, start=1):
    try:
        print(f"Processing document {doc_number}")

        # Split the document into chunks, dropping empty ones (nothing to embed)
        chunks = split_into_chunks(document.page_content)
        chunk_documents = [
            Document(page_content=chunk, metadata={'text': chunk})
            for chunk in chunks
            if chunk
        ]
        if not chunk_documents:
            continue

        # Send all chunks of this document in a single call instead of one
        # embedding request + upsert + new vector-store wrapper per chunk
        search = Pinecone.from_documents(chunk_documents, embeddings, index_name=index_name)

    except Exception as e:
        print(f"Error processing document {doc_number}: {e}")

Processing document 1
Processing document 2
Processing document 3
Processing document 4
Processing document 5
Processing document 6
Processing document 7
Processing document 8
Processing document 9
Processing document 10
Processing document 11
Processing document 12
Processing document 13
Processing document 14


In [25]:
# Show the index statistics again after the ingestion above
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.05959,
 'namespaces': {'': {'vector_count': 5959}},
 'total_vector_count': 5959}