# Textual Analysis
You should create a separate `yaml` environment file for each project you are working on; this is good practice in general. There might be weird issues that cause dependency problems. For the same reason, a new `yaml` file records all the dependencies, which ensures this pipeline keeps working for future reference.

In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import sys

# Locate the folder one level above the notebook's working directory.
current_folder = Path.cwd()
parent_folder = current_folder.parent
# Put that folder first on sys.path so the later `from utils.clean_text import ...`
# can resolve against it (assumes `utils/` lives there -- TODO confirm).
sys.path.insert(0, str(parent_folder))
print(parent_folder)

import plotly.io as pio
# Use the 'notebook' renderer for Plotly figures. The previous bare
# `pio.renderers.keys()` call was removed: its result was discarded.
pio.renderers.default = 'notebook'

import pandas as pd
import numpy as np
import os
import glob
import nltk
nltk.download('punkt')
from utils.clean_text import transform_text

/Users/kevinb/Desktop/dsc80/Chatable-Study-Database


[nltk_data] Downloading package punkt to /Users/kevinb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [2]:
# Work from the project root so the relative 'data/...' paths below resolve.
# On a fresh kernel the root is the parent of the working directory, but once
# this cell has run the working directory already IS the root. Only step up
# when no 'data' folder is visible here, so re-running the cell does not climb
# one directory higher each time (which would break every relative path).
current_folder = Path.cwd()
parent_folder = current_folder if (current_folder / 'data').is_dir() else current_folder.parent
os.chdir(parent_folder)
print(parent_folder)

# text data: load every export matching data/*_text.csv exactly once.
# (The earlier hard-coded per-quarter read_csv calls produced a `text` frame
# that was immediately overwritten by this glob-based load, so they are gone.)
data_dir = 'data'
file_pattern = os.path.join(data_dir, '*_text.csv')
csv_files = sorted(glob.glob(file_pattern))  # lexicographic order, same as list.sort()
if not csv_files:
    # Fail with a readable message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No files matching {file_pattern!r} found under {Path.cwd()}")
df_list = [pd.read_csv(file) for file in csv_files]
text = pd.concat(df_list, axis=0, ignore_index=True)
print(f"Total Texts: {text.shape[0]}")

/Users/kevinb/Desktop/dsc80/Chatable-Study-Database
Total Texts: 689


In [3]:
# Run the concatenated raw frame through the project's cleaning step
# (transform_text comes from utils.clean_text) and display the result.
clean_text = transform_text(text)
clean_text

Unnamed: 0,Time,Quarter,Month,Study Materials
0,2022-02-14,2022Q1,February,2022-02-14 | 2022Q1 | February | Book read (30...
1,2022-02-15,2022Q1,February,2022-02-15 | 2022Q1 | February | Nutrition rea...
2,2022-10-21,2022Q4,October,2022-10-21 | 2022Q4 | October |
3,2022-10-22,2022Q4,October,2022-10-22 | 2022Q4 | October | After consecut...
4,2022-10-23,2022Q4,October,2022-10-23 | 2022Q4 | October | Wolfram good t...
...,...,...,...,...
628,2024-12-11,2024Q4,December,2024-12-11 | 2024Q4 | December | RPLH eval + p...
629,2024-12-12,2024Q4,December,2024-12-12 | 2024Q4 | December | track-mjx mee...
630,2024-12-13,2024Q4,December,2024-12-13 | 2024Q4 | December | constraint ar...
631,2024-12-14,2024Q4,December,2024-12-14 | 2024Q4 | December | chill


In [4]:
# Normalise the notes column step by step: lowercase, drop parenthesised
# "(<digits>m)" markers, remove commas, then trim surrounding whitespace.
study_notes = clean_text['Study Materials'].str.lower()
study_notes = study_notes.str.replace(r'\([\d]*m\)', '', regex=True)
study_notes = study_notes.str.replace(',', '')
pre_process = study_notes.str.strip()

# Join every note into one space-separated string and tokenize it into words.
note_strings = pre_process.astype(str).to_list()
corpus = ' '.join(note_strings)
tokens = nltk.tokenize.word_tokenize(corpus, language='english')
len(tokens)

18487

# Making Chat Familiar With My Data

Using any of `sentence-transformers`, `nltk`, `openai`, `langchain`, or related packages causes many dependency issues if they are just put in one big environment; they need a separate, self-contained environment.

In [5]:
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import faiss
import ast
import openai
import os

# Name of the sentence-transformers model used to embed the text chunks.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# Fetch the 'punkt' tokenizer data, then load the embedding model.
nltk.download('punkt')
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

[nltk_data] Downloading package punkt to /Users/kevinb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
from utils.clean_text import split_text_nltk, get_similar_chunks, generate_response

documents = clean_text['Study Materials'].tolist()

# Split every document into chunks and keep the source row's metadata next to
# each chunk. Previously the metadata columns were taken per *document* while
# the chunks/embeddings were per *chunk*, which only lines up when every
# document yields exactly one chunk; any other split would either raise a
# length error or attach the wrong quarter/time/month to a chunk.
chunk_rows = [
    (chunk, quarter, day, month)
    for doc, quarter, day, month in zip(
        documents, clean_text['Quarter'], clean_text['Time'], clean_text['Month']
    )
    for chunk in split_text_nltk(doc)
]
all_chunks = [row[0] for row in chunk_rows]

print(f"Total chunks created: {len(all_chunks)}")
embeddings = model.encode(all_chunks, show_progress_bar=True, convert_to_tensor=False)

# One row per chunk; the embedding goes in position 1 so the column order stays
# chunk, embedding, quarter, time, month.
embedding_df = pd.DataFrame(chunk_rows, columns=['chunk', 'quarter', 'time', 'month'])
embedding_df.insert(1, 'embedding', embeddings.tolist())

# Make sure the output folder exists before writing into it.
os.makedirs('embeddings', exist_ok=True)
embedding_df.to_csv('embeddings/embeddings.csv', index=False)
print("Embeddings saved to embeddings.csv")

Total chunks created: 633


Batches: 100%|██████████| 20/20 [00:02<00:00,  7.53it/s]

Embeddings saved to embeddings.csv





In [7]:
# Reload the saved chunk table. The CSV holds each embedding as the text of a
# Python list, so parse that text back into a real list.
embedding_df = pd.read_csv('embeddings/embeddings.csv')
embedding_df['embedding'] = embedding_df['embedding'].map(ast.literal_eval)

# Stack the per-chunk vectors into a single float32 NumPy matrix
embedding_rows = embedding_df['embedding'].tolist()
embeddings = np.array(embedding_rows, dtype='float32')

# Build a flat FAISS index over the vectors using L2 distance
# (cosine similarity would be an alternative)
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"FAISS index has {index.ntotal} vectors.")

FAISS index has 633 vectors.


In [8]:
# Display the embeddings table (chunk text, parsed embedding, and metadata columns).
embedding_df

Unnamed: 0,chunk,embedding,quarter,time,month
0,2022-02-14 | 2022Q1 | February | Book read (30...,"[-0.03661457449197769, -0.006186727900058031, ...",2022Q1,2022-02-14,February
1,2022-02-15 | 2022Q1 | February | Nutrition rea...,"[-0.0576658695936203, -0.05012887343764305, 0....",2022Q1,2022-02-15,February
2,2022-10-21 | 2022Q4 | October |,"[-0.0364847257733345, -0.021536780521273613, 0...",2022Q4,2022-10-21,October
3,2022-10-22 | 2022Q4 | October | After consecut...,"[-0.08645892143249512, -0.028704402968287468, ...",2022Q4,2022-10-22,October
4,2022-10-23 | 2022Q4 | October | Wolfram good t...,"[-0.05059235915541649, 0.02907039225101471, -0...",2022Q4,2022-10-23,October
...,...,...,...,...,...
628,2024-12-11 | 2024Q4 | December | RPLH eval + p...,"[-0.08534425497055054, 0.06758847087621689, 0....",2024Q4,2024-12-11,December
629,2024-12-12 | 2024Q4 | December | track-mjx mee...,"[-0.030629420652985573, 0.00898802001029253, 0...",2024Q4,2024-12-12,December
630,2024-12-13 | 2024Q4 | December | constraint ar...,"[-0.08368362486362457, 0.008619057945907116, 0...",2024Q4,2024-12-13,December
631,2024-12-14 | 2024Q4 | December | chill,"[-0.04731657728552818, -0.005331708118319511, ...",2024Q4,2024-12-14,December


In [9]:
# Read the OpenAI key from the environment rather than embedding it in the
# notebook, so the credential is never saved or shared with the file.
# `api_key` is None when OPENAI_API_KEY is not set.
api_key = os.getenv('OPENAI_API_KEY')

In [10]:
# user_prompt = "What did I mainly do in 2022 fall quarter?"
# similar_chunks = get_similar_chunks(user_prompt, index, embedding_df, top_k=5)
# generate_response(user_prompt, similar_chunks, api_key=api_key)

In [11]:
# user_prompt = "What did I mainly do in 2023 fall quarter?"
# similar_chunks = get_similar_chunks(user_prompt, index, embedding_df, top_k=5)
# generate_response(user_prompt, similar_chunks, api_key=api_key)

In [12]:
# user_prompt = "What did I mainly do in 2024 fall quarter?"
# similar_chunks = get_similar_chunks(user_prompt, index, embedding_df, top_k=5)
# generate_response(user_prompt, similar_chunks, api_key=api_key)