# Textual Analysis
You should create a separate `yaml` environment file for each project you are working on; this is good practice in general. There might be weird issues that cause dependency problems. For the same reason, a new `yaml` file records all the dependencies, which ensures this pipeline keeps working for future reference.

In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import sys

current_folder = Path.cwd()
parent_folder = current_folder.parent
sys.path.insert(0, str(parent_folder))
print(parent_folder)

import plotly.io as pio
pio.renderers.keys()
pio.renderers.default = 'notebook' 

import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from utils.clean_text import transform_text, tokenize

In [22]:
# Quarterly text exports, fall 2022 through fall 2024. Each file is read with
# pandas' default CSV settings; the targets on the left are listed in the same
# order as the paths on the right.
(
    fall_2022_text,
    winter_2023_text,
    spring_2023_text,
    summer_2023_text,
    fall_2023_text,
    winter_2024_text,
    spring_2024_text,
    summer_2024_text,
    fall_2024_text,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/2022_fall_text.csv',
        'data/2023_winter_text.csv',
        'data/2023_spring_text.csv',
        'data/2023_summer_text.csv',
        'data/2023_fall_text.csv',
        'data/2024_winter_text.csv',
        'data/2024_spring_text.csv',
        'data/2024_summer_text.csv',
        'data/2024_fall_text.csv',
    )
)

In [None]:
# Stack the nine quarterly frames row-wise in chronological order. Row labels
# are kept exactly as they come from each frame (no ignore_index), so labels
# from different quarters may repeat in the combined frame.
text = pd.concat(
    [
        fall_2022_text,
        winter_2023_text,
        spring_2023_text,
        summer_2023_text,
        fall_2023_text,
        winter_2024_text,
        spring_2024_text,
        summer_2024_text,
        fall_2024_text,
    ],
    axis=0,
)

# Run the project's cleaning step on the combined frame and show the result.
clean_text = transform_text(text)
clean_text

In [None]:
# Normalise the 'Study Materials' text: lower-case it, remove parenthesised
# tags of the form "(<digits>m)" such as "(30m)", drop commas, and trim
# surrounding whitespace. The result stays aligned with clean_text's rows.
pre_process = (clean_text['Study Materials']
              .str.lower()
              .str.replace(r'\([\d]*m\)', '', regex=True)
              .str.replace(',', '', regex=False)
              .str.strip())

# Missing entries pass through the .str accessors as NaN; casting them with
# astype(str) would inject literal "nan" words into the corpus and inflate the
# token count, so drop them before joining.
corpus = ' '.join(pre_process.dropna().astype(str).to_list())
tokens = nltk.tokenize.word_tokenize(corpus, language='english')
len(tokens)

# Making Chat Familiar With My Data

Using any of `sentence-transformer`, `nltk`, `openai`, `langchain`, or related packages causes many dependency issues if they are just put in one big environment; they need a separate, self-contained environment.

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import faiss
import ast
import openai
import os

# Recent NLTK releases (3.9+) read the Punkt sentence tokenizer from the
# 'punkt_tab' resource rather than the pickled 'punkt' models; download both
# so sentence/word tokenization works regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sentence-embedding model used later in the notebook to encode text chunks.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
import os

from utils.clean_text import split_text_nltk, get_similar_chunks, generate_response

documents = clean_text['Study Materials'].tolist()

# Split every document into chunks and remember, for each chunk, the position
# of the row it came from. One document can yield several chunks, so the
# per-row metadata has to be repeated per chunk rather than attached as-is.
all_chunks = []
chunk_row_positions = []
for row_position, doc in enumerate(documents):
    for chunk in split_text_nltk(doc):
        all_chunks.append(chunk)
        chunk_row_positions.append(row_position)

print(f"Total chunks created: {len(all_chunks)}")
embeddings = model.encode(all_chunks, show_progress_bar=True, convert_to_tensor=False)

# Positional lookup (iloc) is used because clean_text's row labels are not
# guaranteed to be unique. `.to_numpy()` strips those labels so every column
# has exactly one entry per chunk and the frame gets a fresh 0..n-1 index;
# passing the original Series would make pandas align on clean_text's index
# and fail (or mislabel chunks) whenever chunk count != row count.
chunk_metadata = clean_text.iloc[chunk_row_positions]
embedding_df = pd.DataFrame({
    'chunk': all_chunks,
    'embedding': embeddings.tolist(),
    'quarter': chunk_metadata['Quarter'].to_numpy(),
    'time': chunk_metadata['Time'].to_numpy(),
    'month': chunk_metadata['Month'].to_numpy(),
})

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('embeddings', exist_ok=True)
embedding_df.to_csv('embeddings/embeddings.csv', index=False)
print("Embeddings saved to embeddings.csv")

In [None]:
# Read the saved table back. The CSV holds each embedding as text, so every
# cell is parsed back into a Python list with ast.literal_eval.
embedding_df = pd.read_csv('embeddings/embeddings.csv')
embedding_df['embedding'] = embedding_df['embedding'].map(ast.literal_eval)

# Stack the parsed vectors into a single 2-D array and cast it to float32.
embeddings = np.asarray(embedding_df['embedding'].tolist()).astype('float32')

# Build a flat L2-distance FAISS index whose width is the embedding length
# (second axis of the array).
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Load every vector into the index and report how many it now holds.
index.add(embeddings)
print(f"FAISS index has {index.ntotal} vectors.")

In [None]:
# Display the embedding table (chunk text, vector, and metadata columns) for a
# quick visual check.
embedding_df

In [29]:
# Prefer the key from the OPENAI_API_KEY environment variable so the secret
# does not have to be stored in the notebook; fall back to the placeholder
# literal when the variable is unset or empty.
api_key = os.getenv('OPENAI_API_KEY') or "..."

In [31]:
# Question about the fall 2022 quarter. Look up chunks for the prompt through
# the FAISS index with top_k=5 (presumably the five closest matches), then
# hand the prompt and those chunks to generate_response; its return value is
# the cell's displayed output.
# NOTE(review): only the prompt text is passed for retrieval — confirm whether
# get_similar_chunks uses the quarter/time/month columns of embedding_df.
user_prompt = "What did I mainly do in 2022 fall quarter?"
similar_chunks = get_similar_chunks(user_prompt, index, embedding_df, top_k=5)
generate_response(user_prompt, similar_chunks, api_key=api_key)

In [None]:
# Question about the fall 2023 quarter. Look up chunks for the prompt through
# the FAISS index with top_k=5 (presumably the five closest matches), then
# hand the prompt and those chunks to generate_response; its return value is
# the cell's displayed output.
# NOTE(review): only the prompt text is passed for retrieval — confirm whether
# get_similar_chunks uses the quarter/time/month columns of embedding_df.
user_prompt = "What did I mainly do in 2023 fall quarter?"
similar_chunks = get_similar_chunks(user_prompt, index, embedding_df, top_k=5)
generate_response(user_prompt, similar_chunks, api_key=api_key)

In [None]:
# Question about the fall 2024 quarter. Look up chunks for the prompt through
# the FAISS index with top_k=5 (presumably the five closest matches), then
# hand the prompt and those chunks to generate_response; its return value is
# the cell's displayed output.
# NOTE(review): only the prompt text is passed for retrieval — confirm whether
# get_similar_chunks uses the quarter/time/month columns of embedding_df.
user_prompt = "What did I mainly do in 2024 fall quarter?"
similar_chunks = get_similar_chunks(user_prompt, index, embedding_df, top_k=5)
generate_response(user_prompt, similar_chunks, api_key=api_key)