In [1]:
# Install the sentence-transformers library
!pip install sentence-transformers joblib

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Predefined set of Russian stopwords
russian_stopwords = set(stopwords.words('russian'))

In [4]:
# Function to clean text
def clean_text(text):
    text = text.replace('"', "'")  # Replace double quotes with single quotes
    text = re.sub(r'[^\w\s]', '', text).lower()
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in russian_stopwords]
    return ' '.join(tokens)

In [5]:
# Function to expand text with synonyms
def expand_with_synonyms(text, synonyms_dict):
    tokens = word_tokenize(text)
    expanded_tokens = []
    for token in tokens:
        expanded_tokens.append(token)
        if token in synonyms_dict:
            expanded_tokens.append(synonyms_dict[token])
    return ' '.join(expanded_tokens)

In [6]:
# Function to split text into chunks with overlap
def split_into_chunks(text, max_chunk_size=850, overlap_size=100):
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_chunk_size - overlap_size):
        chunk = words[i:i + max_chunk_size]
        chunks.append(' '.join(chunk))
    return chunks

In [7]:
# Function to preprocess documents
def preprocess_documents(df, synonyms_dict):
    df['cleaned'] = df['Answer'].apply(lambda x: expand_with_synonyms(clean_text(x), synonyms_dict))
    return df

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Read abbreviation CSV and convert to dictionary
abbr = pd.read_csv('/content/drive/MyDrive/Team1_Summer_Hackathon/Data/abbr.csv')
abbr.columns = ['abbr', 'full']
synonyms_dict = pd.Series(abbr.full.values, index=abbr.abbr).to_dict()

In [10]:
# Read CSV files
drk = pd.read_csv('/content/drive/MyDrive/Team1_Summer_Hackathon/Data/merged_drk_df.csv')
cmk = pd.read_csv('/content/drive/MyDrive/Team1_Summer_Hackathon/Data/merged_cmk_df.csv')
ftl = pd.read_csv('/content/drive/MyDrive/Team1_Summer_Hackathon/Data/merged_ftl_df.csv')
otr = pd.read_csv('/content/drive/MyDrive/Team1_Summer_Hackathon/Data/merged_otr_df.csv')
ft = pd.read_csv('/content/drive/MyDrive/Team1_Summer_Hackathon/Data/merged_ft_df.csv')
# Print column names to debug
print(f"DRK columns: {drk.columns}")
print(f"CMK columns: {cmk.columns}")
print(f"FTL columns: {ftl.columns}")
print(f"OTR columns: {otr.columns}")
print(f"FT columns: {ft.columns}")

DRK columns: Index(['Query', 'Answer', 'Link', 'Department'], dtype='object')
CMK columns: Index(['Query', 'Answer', 'Link', 'Department'], dtype='object')
FTL columns: Index(['Query', 'Answer', 'Link', 'Department'], dtype='object')
OTR columns: Index(['Query', 'Answer', 'Link', 'Department'], dtype='object')
FT columns: Index(['Query', 'Answer', 'Link', 'Department'], dtype='object')


In [11]:
# Preprocess the data
drk = preprocess_documents(drk, synonyms_dict)
cmk = preprocess_documents(cmk, synonyms_dict)
ftl = preprocess_documents(ftl, synonyms_dict)
otr = preprocess_documents(otr, synonyms_dict)
ft = preprocess_documents(ft, synonyms_dict)

In [12]:
# Combine all dataframes into a single one
combined_df = pd.concat([drk, cmk, ftl, otr, ft], ignore_index=True)
# Print column names to ensure 'cleaned' column exists
print(f"Combined columns: {combined_df.columns}")

Combined columns: Index(['Query', 'Answer', 'Link', 'Department', 'cleaned'], dtype='object')


In [13]:
# Split the preprocessed text into chunks
all_chunks = []
all_departments = []
all_queries = []
all_links = []

In [14]:
for idx, row in combined_df.iterrows():
    chunks = split_into_chunks(row['cleaned'], max_chunk_size=850, overlap_size=100)
    all_chunks.extend(chunks)
    all_departments.extend([row['Department']] * len(chunks))
    all_queries.extend([row['Query']] * len(chunks))  # Ensure 'Query' is the column with the original queries
    all_links.extend([row['Link']] * len(chunks))  # Ensure 'Link' is the column with the links

In [15]:
chunked_df = pd.DataFrame({
    'Department': all_departments,
    'cleaned_chunk': all_chunks,
    'Original_Query': all_queries,
    'Link': all_links
})

In [16]:
print(f"Total chunks: {len(chunked_df)}")
print(f"Chunks per department:\n{chunked_df['Department'].value_counts()}")

Total chunks: 2111
Chunks per department:
Department
FT     1144
DRK     472
CMK     248
OTR     225
FTL      22
Name: count, dtype: int64


In [17]:
# Load a multilingual sentence transformer model
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [18]:
# Generate embeddings for the chunks
chunked_df['embedding'] = chunked_df['cleaned_chunk'].apply(lambda x: model.encode(x, convert_to_tensor=True))

In [20]:
import joblib
# Save the embeddings and associated data
joblib.dump(chunked_df, 'vector_database.pkl')
print("Embeddings and associated data saved to 'vector_database.pkl'.")

Embeddings and associated data saved to 'vector_database.pkl'.


In [21]:
# Function to search within the same department using dense vectors
def search_within_department(query, department):
    query_cleaned = clean_text(query)
    query_expanded = expand_with_synonyms(query_cleaned, synonyms_dict)
    query_embedding = model.encode(query_expanded, convert_to_tensor=True)

    dept_df = chunked_df[chunked_df['Department'] == department]

    if dept_df.empty:
        return pd.DataFrame(columns=['Department', 'cleaned_chunk', 'similarity', 'Original_Query', 'Link'])

    # Calculate cosine similarity between the query and the department's chunks
    dept_df['similarity'] = dept_df['embedding'].apply(lambda x: util.pytorch_cos_sim(query_embedding, x).item())

    results = dept_df.sort_values(by='similarity', ascending=False)
    return results

In [22]:
# Example usage
query = "Была ли доработка документа 'Приемка'?"
department = "FT"  # Change to the desired department
results = search_within_department(query, department)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dept_df['similarity'] = dept_df['embedding'].apply(lambda x: util.pytorch_cos_sim(query_embedding, x).item())


In [23]:
display(results)

Unnamed: 0,Department,cleaned_chunk,Original_Query,Link,embedding,similarity
1373,FT,бизнес требования 220820233110изменение макета...,ФТ (EPIC-11622),http://confluence/pages/viewpage.action?pageId...,"[tensor(-0.1057), tensor(0.2197), tensor(-0.18...",0.710300
1119,FT,ункциональные требования цельназначение предос...,ФТ (EPIC-10568),http://confluence/pages/viewpage.action?pageId...,"[tensor(-0.0684), tensor(0.0920), tensor(-0.23...",0.699312
1125,FT,пегас формируется документприемки заполненными...,ФТ (EPIC-10568),http://confluence/pages/viewpage.action?pageId...,"[tensor(-0.0336), tensor(0.0899), tensor(-0.28...",0.693297
1549,FT,документоввыводить дату документа спорейс кото...,ФТ (EPIC-11961),http://confluence/pages/viewpage.action?pageId...,"[tensor(-0.1203), tensor(0.2711), tensor(-0.22...",0.684385
1121,FT,итоги заполнять текущему функционалу примечани...,ФТ (EPIC-10568),http://confluence/pages/viewpage.action?pageId...,"[tensor(0.0006), tensor(0.2639), tensor(-0.147...",0.682616
...,...,...,...,...,...,...
1389,FT,nillabletrue nillable true typexsstring type x...,ФТ (EPIC-11858),http://confluence/pages/viewpage.action?pageId...,"[tensor(-0.0416), tensor(-0.1185), tensor(0.00...",0.084005
1564,FT,id ownershiptype organization type0type person...,ФТ (EPIC-11305),http://confluence/pages/viewpage.action?pageId...,"[tensor(0.0051), tensor(0.1078), tensor(-0.064...",0.077640
1912,FT,тарифа базовый весовой расчетный базовый 330 в...,ФТ (EPIC-13169),http://confluence/pages/viewpage.action?pageId...,"[tensor(-0.0737), tensor(0.1314), tensor(-0.06...",0.076481
1354,FT,worktime periodtimefrom 000000 periodtimeto 23...,ФТ (EPIC-11892),http://confluence/pages/viewpage.action?pageId...,"[tensor(-0.2243), tensor(0.0737), tensor(0.101...",0.049528
