In [28]:
import os
import re
import warnings

import pandas as pd
import polars as pl
import numpy as np

# import faiss
from voyager import Index, Space
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer, util

# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices from torch / transformers.
warnings.filterwarnings("ignore")

# Pick the compute device: Apple MPS first, then CUDA, otherwise the CPU.
# CUDA availability must be queried explicitly: `torch.backends` is a module
# object and is always truthy, so testing it would never fall back to 'cpu'.
device = torch.device(
    'mps' if torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)

In [29]:
# Load the 'sentence-transformers/all-MiniLM-L6-v2' checkpoint; `model.encode`
# is what the cells below use to turn text into embeddings.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [30]:
def process_folders(root_folder, encoder=None):
    """
    Walk ``root_folder`` recursively and embed one ``.dmate`` file per folder.

    For every folder that contains at least one file ending in ``.dmate``, the
    alphabetically first such file is read and encoded.

    :param root_folder: directory to walk
    :param encoder: object exposing ``encode(text)``; when omitted, the
        notebook-level ``model`` is used
    :return: a DataFrame with the columns ``folder_path``, ``content`` and
        ``content_emb`` (the columns are present even when nothing is found)
    """
    if encoder is None:
        encoder = model  # notebook-level embedding model

    columns = ['folder_path', 'content', 'content_emb']
    folder_data = []
    for folder, _, files in os.walk(root_folder):
        # Sort so the chosen file does not depend on directory listing order
        dmate_names = sorted(f for f in files if f.endswith('.dmate'))
        if not dmate_names:
            continue
        dmate_file_path = os.path.join(folder, dmate_names[0])
        # Explicit UTF-8 so decoding does not depend on the platform locale
        with open(dmate_file_path, 'r', encoding='utf-8') as file:
            dmate_content = file.read()
        # Encode after the file is closed; the handle is not needed any more
        folder_data.append(
            {
                'folder_path': folder,
                'content': dmate_content,
                'content_emb': encoder.encode(dmate_content)
            }
        )
    return pd.DataFrame(folder_data, columns=columns)
# Run the scan on a hard-coded absolute path; the returned DataFrame is the cell output.
# NOTE(review): the path is machine-specific — consider a configurable base directory.
process_folders("/Users/filin_va/ML/projects")

In [31]:
# Set the base directory
folder_path = '../data/raw'

# Maps every path found under the base directory to "file" or "folder"
object_dict = {}

# Walk through the directory; within each directory the files are recorded
# before the sub-directories.
for root, dirs, files in os.walk(folder_path):
    for kind, names in (("file", files), ("folder", dirs)):
        for name in names:
            # Re-anchor the entry on the base directory so every key starts
            # with `folder_path` (reusing the variable, not a second literal)
            relative_path = folder_path + '/' + os.path.relpath(os.path.join(root, name), folder_path)
            object_dict[relative_path] = kind

# Get only those paths that end with ".dmate"
dmate_files = [path for path in object_dict if path.endswith(".dmate")]

In [32]:
# Display the whole path -> "file"/"folder" mapping as the cell output.
object_dict

In [33]:
def normalise_text(text: str) -> str:
    """
    Collapse every run of whitespace into a single space and trim both ends.

    :param text: raw input string
    :return: the string with whitespace runs squeezed and no leading or trailing whitespace
    """
    squeezed = re.sub(r'\s+', ' ', text)
    return squeezed.strip()

In [39]:
# Build a DataFrame with one row per .dmate file: its path, the derived
# folder path, the normalised text and the text embedding.
dmate_records = []
for dmate_file in dmate_files:
    # Explicit UTF-8 so decoding does not depend on the platform locale
    with open(dmate_file, encoding='utf-8') as file:
        input_text = file.read()
    dmate_records.append(
        {
            'file_path': dmate_file,
            # The path with its 6-character ".dmate" suffix removed
            'folder_path': dmate_file[:-6],
            'content': normalise_text(input_text),
            # NOTE(review): the embedding is computed from the raw text, not
            # from the normalised `content` — confirm this is intended.
            'content_emb': model.encode(input_text)
        }
    )

# Keep the records in their own list so `dmate_date` is only ever a DataFrame
dmate_date = pd.DataFrame(dmate_records)
dmate_date

In [35]:
# Name of the downloaded file used to try the classifier out
test_download_file = 'documate_test.txt'

# Downloads folder inside the current user's home directory
downloads_path = os.path.join(os.path.expanduser('~'), 'Downloads')

# Everything currently in the Downloads folder, printed for inspection
files = os.listdir(downloads_path)
print(files)

In [36]:
# Read the test download; its text is the query that gets matched against the
# .dmate descriptions. Explicit UTF-8 keeps decoding independent of the locale.
with open(os.path.join(downloads_path, test_download_file), encoding='utf-8') as dfile:
    query = dfile.read()
query

In [37]:
# Embed the query text, asking the model for a torch tensor instead of its default output
query_emb = model.encode(
    query,
    batch_size=128,
    show_progress_bar=False,
    convert_to_tensor=True,
)

In [38]:
# Inspect the `content_emb` column converted to a single torch tensor.
torch.Tensor(dmate_date['content_emb'])

In [91]:
# F.softmax(torch.Tensor(dmate_date['content_emb'])[:, np.newaxis].to(device) @ query_emb[:, np.newaxis]).ravel()

In [44]:
# Cosine similarity between every stored embedding and the query;
# the position of the highest similarity becomes n_class.
doc_embeddings = torch.Tensor(dmate_date['content_emb']).to(device)
similarities = util.cos_sim(doc_embeddings, query_emb[np.newaxis, :])
n_class = np.argmax(similarities.ravel().cpu())
n_class

In [65]:
# Show the row at the position selected by n_class (cast to a plain int for .iloc).
dmate_date.iloc[int(n_class)]

In [92]:
# Alternative scoring: dot product of every stored embedding with the query,
# turned into a distribution over the documents with softmax.
doc_embeddings = torch.Tensor(dmate_date['content_emb']).to(device)
scores = doc_embeddings @ query_emb  # one score per document
# `dim` is passed explicitly: calling softmax without it is deprecated and
# relies on an implicit dimension choice. dim=0 normalises across documents.
probabilities = F.softmax(scores, dim=0)
n_class = np.argmax(probabilities.cpu().numpy())
n_class

In [76]:
# Folder path of the best match. Positional lookup (.iloc), like the row
# lookup above, so the result does not depend on the DataFrame's index labels.
dmate_date['folder_path'].iloc[int(n_class)]

In [78]:
import shutil


source_path = os.path.join(downloads_path, test_download_file)
target_folder = dmate_date['folder_path'][n_class]

# shutil.move only places the file *inside* `dst` when `dst` is an existing
# directory; otherwise it renames the file to `dst`. Refuse to move unless
# the predicted folder really exists, so the download is never renamed away.
if not os.path.isdir(target_folder):
    raise NotADirectoryError(f"Predicted target folder does not exist: {target_folder}")

# NOTE(review): this cell is destructive — re-running it fails once the file has been moved.
shutil.move(source_path, target_folder)

In [67]:
# 1. create classes extracting from the files. pdf ->
# 2. model
# 3. optimize code for prod
# 4. [ideal] wrap it up - ideally with frontend; just like an app
# 5. MLOps
# 6. experiments

In [106]:
from transformers import pipeline

# Lazily created summarisation pipeline, shared by every summarize_text call
_SUMMARIZER = None


def summarize_text(text, max_length=150, min_length=30):
    """
    Summarise ``text`` with a ``"summarization"`` pipeline.

    The pipeline is built on the first call and reused afterwards instead of
    being constructed again on every call.

    :param text: the text to summarise
    :param max_length: ``max_length`` forwarded to the pipeline call
    :param min_length: ``min_length`` forwarded to the pipeline call
    :return: the ``summary_text`` of the first result returned by the pipeline
    """
    global _SUMMARIZER
    if _SUMMARIZER is None:
        _SUMMARIZER = pipeline("summarization")
    summary = _SUMMARIZER(text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]['summary_text']

In [107]:
# Sample text used to try out summarize_text.
t = """

I got gensim to work in Google Collab by following this process:

!pip install gensim
from gensim.summarization import summarize

Then I was able to call summarize(some_text)

Now I'm trying to run the same thing in VS code:

I've installed gensim: pip3 install gensim

but when I run

from gensim.summarization import summarize

I get the error

Import "gensim.summarization" could not be resolvedPylancereportMissingImports

I've also tried from gensim.summarization.summarizer import summarize with same error. Regardless I haven't been able to call the function summarize(some_text) outside of Google Collab.

"""
# Summary of the sample text; displayed in a later cell.
k = summarize_text(t)

In [109]:
# Display k, the result of summarize_text(t).
k

In [28]:
# Scratch check: the last '/'-separated component of a path, i.e. the file name.
'/Users/filin_va/Downloads/09.04.02 Искусственный интеллект в промышленности 2024.pd'.split("/")[-1]

In [23]:
# Scratch check: position of the largest value in a tensor, converted to a plain int.
int(np.argmax(torch.Tensor([0.1755, 0.1800, 0.1679, 0.1577, 0.1556, 0.1634])))

In [35]:
import fitz
def get_pdf_text(file_path: str) -> str:
    """
    Extract the plain text of every page of a PDF, in page order.

    :param file_path: path of the PDF file to open
    :return: the text of all pages concatenated with no separator
    """
    with fitz.open(file_path) as document:
        page_texts = [
            document.load_page(index).get_text("text")
            for index in range(document.page_count)
        ]
    return "".join(page_texts)

# Extract the text of a local PDF and split it on single spaces.
# NOTE(review): the path is machine-specific — consider a configurable location.
get_pdf_text("/Users/filin_va/Downloads/Dubai_Tour_Agreement.pdf").split(" ")