In [1]:
# Install necessary packages
!pip install nltk
!pip install langchain
!pip install spacy
!python -m spacy download ru_core_news_sm
!pip install pymorphy2
!pip install fasttext

Collecting langchain
  Downloading langchain-0.2.10-py3-none-any.whl (990 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m990.0/990.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.22 (from langchain)
  Downloading langchain_core-0.2.22-py3-none-any.whl (373 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m373.5/373.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl (25 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.93-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.22->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langs

In [22]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
import fasttext
import pymorphy2

In [23]:
# Load SpaCy model for Russian
# (the pipeline fetched by `python -m spacy download ru_core_news_sm`;
# clean_text uses it for lemmatization and its stop-word flags)
nlp = spacy.load("ru_core_news_sm")

In [24]:
# Initialize Pymorphy2 analyzer
# (get_synonyms uses it to look up the normal form of a token)
morph = pymorphy2.MorphAnalyzer()

In [None]:
# Download the FastText model for Russian
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz
!gunzip cc.ru.300.bin.gz

--2024-07-22 02:20:06--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.165.83.79, 18.165.83.35, 18.165.83.91, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.165.83.79|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4496459151 (4.2G) [application/octet-stream]
Saving to: ‘cc.ru.300.bin.gz’


2024-07-22 02:23:08 (23.5 MB/s) - ‘cc.ru.300.bin.gz’ saved [4496459151/4496459151]

gzip: cc.ru.300.bin already exists; do you wish to overwrite (y or n)? 

In [None]:
# Load FastText model for Russian
# (the relative path means cc.ru.300.bin must sit in the current working directory)
# NOTE(review): ft_model is not referenced by any other cell visible in this
# notebook — confirm it is still needed before paying the cost of loading it.
ft_model_path = 'cc.ru.300.bin'
ft_model = fasttext.load_model(ft_model_path)

In [None]:
# Mount Google Drive at /content/drive (Colab-only API); the CSV inputs below
# are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read CSV files
# (one DataFrame per source table; all five live in the same Drive folder)
drk, cmk, ftl, otr, ft = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Team1_Summer_Hackathon/Data/merged_drk_df.csv',
        '/content/drive/MyDrive/Team1_Summer_Hackathon/Data/merged_cmk_df.csv',
        '/content/drive/MyDrive/Team1_Summer_Hackathon/Data/merged_ftl_df.csv',
        '/content/drive/MyDrive/Team1_Summer_Hackathon/Data/merged_otr_df.csv',
        '/content/drive/MyDrive/Team1_Summer_Hackathon/Data/merged_ft_df.csv',
    )
)

In [None]:
# Concatenate the dataframes
# (rows of the five tables are stacked; ignore_index=True renumbers them 0..n-1)
# NOTE(review): preprocess_documents reads the 'Query' and 'Answer' columns of
# this frame — presumably every input table provides both; confirm.
df = pd.concat([drk, cmk, ftl, otr, ft], ignore_index=True)

In [None]:
# Read the abbreviation CSV and give its two columns fixed names.
# (The conversion to a dictionary happens later, where synonyms_cache is built.)
# NOTE(review): assumes the file has exactly two columns, ordered
# (abbreviation, full form) — confirm against the data.
abbr = pd.read_csv('/content/drive/MyDrive/Team1_Summer_Hackathon/Data/abbr.csv')
abbr.columns = ['abbr', 'full']

In [None]:
# Download necessary NLTK data
nltk.download('punkt')
# Recent NLTK releases load the tokenizer tables from 'punkt_tab' instead of
# the pickled 'punkt' models; without it word_tokenize raises LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')
# Set membership keeps the per-token stopword test in clean_text cheap
russian_stopwords = set(stopwords.words('russian'))

In [None]:
# Define function to clean text
def clean_text(text):
    """Normalize raw text into a space-joined string of lower-case lemmas.

    Steps: replace punctuation with spaces, lower-case, tokenize with NLTK,
    drop Russian stopwords, then lemmatize with the SpaCy pipeline (tokens
    that SpaCy itself flags as stop words are dropped as well).

    Args:
        text: The raw text. Anything that is not a ``str`` (``None``, the NaN
            pandas produces for an empty CSV cell, ...) is treated as empty.

    Returns:
        str: The remaining lemmas joined by single spaces; ``''`` when the
        input is not a string or nothing but punctuation/whitespace.
    """
    if not isinstance(text, str):
        return ''

    # Swap punctuation for a space instead of deleting it, so words separated
    # only by punctuation ("a,b", "a-b") are not glued into a single token.
    text = re.sub(r'[^\w\s]', ' ', text).lower()
    if not text.strip():
        return ''

    # Tokenize and remove stopwords
    tokens = [token for token in word_tokenize(text) if token not in russian_stopwords]

    # Lemmatize the text using SpaCy
    doc = nlp(' '.join(tokens))
    return ' '.join(token.lemma_ for token in doc if not token.is_stop)

In [None]:
# Initialize synonyms cache from the abbreviation table: full form -> [abbreviation].
# Every cached value is a list so get_synonyms() has a single return type; a
# bare string here would make a caller that takes element [0] pick up only the
# first character of the abbreviation.
# NOTE(review): keys are the 'full' values exactly as they appear in the CSV —
# confirm they match the form of the tokens that get looked up.
synonyms_cache = {
    full_form: [short_form]
    for full_form, short_form in abbr.set_index('full')['abbr'].to_dict().items()
}

# Define function to get synonyms
def get_synonyms(word):
    """Return the list of alternative forms known for ``word``.

    A cached entry (from the abbreviation table or from an earlier call) is
    returned as is. Otherwise the word is parsed with pymorphy2, the normal
    form of its first parse is cached as the only entry, and that list is
    returned.

    Args:
        word: The token to look up.

    Returns:
        list: The cached alternatives; empty when pymorphy2 yields no parse.
    """
    if word not in synonyms_cache:
        parses = morph.parse(word)
        synonyms_cache[word] = [parses[0].normal_form] if parses else []
    return synonyms_cache[word]

In [None]:
# Define function to expand text with synonyms
def expand_with_synonyms(text):
    """Follow every token of ``text`` with its first synonym, when it has one.

    Args:
        text: Whitespace-separated tokens.

    Returns:
        str: The tokens joined by single spaces, each one immediately followed
        by the first entry ``get_synonyms`` returns for it (if any).
    """
    expanded_text = []
    for token in text.split():
        expanded_text.append(token)
        synonyms = get_synonyms(token)
        if isinstance(synonyms, str):
            # A bare string (e.g. an abbreviation stored directly in the cache)
            # is one synonym; indexing it would yield only its first character.
            synonyms = [synonyms]
        if synonyms:
            expanded_text.append(synonyms[0])
    return ' '.join(expanded_text)

In [None]:
# Define function to preprocess documents
def preprocess_documents(df, chunk_size=850, chunk_overlap=150):
    """Split every answer into overlapping chunks and normalize each chunk.

    Args:
        df: DataFrame with a 'Query' column (used as the title of each chunk)
            and an 'Answer' column (the text that gets split).
        chunk_size: ``chunk_size`` forwarded to RecursiveCharacterTextSplitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.

    Returns:
        tuple: ``(chunks, original_chunks)`` — two index-aligned lists of
        ``(title, text)`` pairs. ``chunks`` holds the cleaned, synonym-expanded
        text; ``original_chunks`` holds the chunk text as split.
    """
    chunks = []
    original_chunks = []
    if df.empty:
        return chunks, original_chunks

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    # Iterating the two columns directly avoids building a Series per row.
    for title, answer in zip(df['Query'], df['Answer']):
        # An empty CSV cell arrives as NaN (a float); there is nothing to split.
        if not isinstance(answer, str):
            continue
        for chunk in text_splitter.split_text(answer):
            expanded_chunk = expand_with_synonyms(clean_text(chunk))
            chunks.append((title, expanded_chunk))
            original_chunks.append((title, chunk))
    return chunks, original_chunks

In [None]:
# Define function to create and fit vectorizer
def create_and_fit_vectorizer(chunks):
    """Fit a TF-IDF model on the text part of each ``(title, text)`` chunk.

    Args:
        chunks: Iterable of ``(title, text)`` pairs; only the text is vectorized.

    Returns:
        tuple: ``(vectorizer, X, chunks)`` — the fitted TfidfVectorizer, the
        matrix it produced for the chunk texts, and the ``chunks`` argument
        handed back unchanged.
    """
    texts = [text for _title, text in chunks]
    tfidf = TfidfVectorizer()
    doc_term_matrix = tfidf.fit_transform(texts)
    return tfidf, doc_term_matrix, chunks

In [None]:
# Process the data
# chunks holds the cleaned/expanded text that TF-IDF is fitted on;
# original_chunks holds the text as split, at the same positions.
chunks, original_chunks = preprocess_documents(df)
# create_and_fit_vectorizer returns the chunks list it was given as its third value
vectorizer, X, chunks = create_and_fit_vectorizer(chunks)
# Quick look: the first processed chunk and the length of both lists
print(chunks[0], len(chunks), len(original_chunks))

In [None]:
# Input your query and process it
query = "Доступна ли перевозка грузов в Киргизию?"  # Replace with your query
# The query goes through the same cleaning and expansion as the indexed chunks
cleaned_query = clean_text(query)
expanded_query = expand_with_synonyms(cleaned_query)
query_vec = vectorizer.transform([expanded_query])

# No extra weighting factor is applied to query_vec: cosine similarity is
# invariant to multiplying either vector by a positive constant, so scaling
# the query would change neither the scores nor the ranking.

In [None]:
# Score every chunk against the query, then keep the positions of the five
# highest scores, best first
results = cosine_similarity(X, query_vec).flatten()
top_indices = results.argsort()[::-1][:5]

In [None]:
# Print the top 5 most similar chunks
print("ДЕЛАЕМ ЗАПРОС: " + query)
print("РЕЗУЛЬТАТЫ РАБОТЫ МОДЕЛИ: ")
for i in top_indices:
    similarity_score = results[i]
    print(f"Similarity: {similarity_score:.4f}")
    print(f"Title: {chunks[i][0]}")
    print(f"Text: {original_chunks[i][1]}")
    # chunks[i][1] is the cleaned, synonym-expanded text the score was computed
    # on, not a URL, so it is labelled as processed text rather than as a link.
    print(f"Processed text: {chunks[i][1]}")
    print()