In [216]:
import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

In [217]:
nlp = spacy.load('en_core_web_md')
# ! python3 -m spacy download en_core_web_md

In [218]:
# def preprocess_text(texts):
#     processed_texts = []
#     lemmatizer = WordNetLemmatizer()
#     for text in texts:
#         text = text.lower()
#         # Tokenize sentences and words
#         sentences = word_tokenize(text)
#         tokens = [sent_tokenize(sentence) for sentence in sentences]
#         # Lemmatize tokens
#         tokens = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in tokens]
#         # Flatten and clean up tokens
#         processed_sentences = [" ".join(sentence) for sentence in tokens]
#         processed_texts.append(processed_sentences)
#     return processed_texts

In [219]:
def preprocess_text(texts):
    processed_texts = []
    lemmatizer = WordNetLemmatizer()
    for text in texts:
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        sentences = sent_tokenize(text)  # Tokenize into sentences
        tokens = [word_tokenize(sentence) for sentence in sentences]  # Tokenize each sentence into words
        tokens = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in tokens]  # Lemmatize each word
        # Flatten sentences into a single list of words for the whole document
        flat_tokens = [word for sentence in tokens for word in sentence]
        processed_texts.append(" ".join(flat_tokens))  # Join tokens back into a single string
    return processed_texts

In [220]:
b =  ["Brick Work Supplying preparing and construction of chimney made brick work of approved quality with 1:2 chuna ( white cement) : Surkhi mortar ratio with 30 m lead",
    "1:2:4 PCC Works",]

a = preprocess_text(b)
a

['brick work supplying preparing and construction of chimney made brick work of approved quality with 12 chuna white cement surkhi mortar ratio with 30 m lead',
 '124 pcc work']

In [222]:
sample_texts = ["Brick Work Supplying preparing and construction of chimney made brick work of approved quality with 1:2 chuna ( white cement) : Surkhi mortar ratio with 30 m lead",
    "1:2:4 PCC Works",]
sample_sentences_processed = preprocess_text(sample_texts)
sample_sentences_processed

['brick work supplying preparing and construction of chimney made brick work of approved quality with 12 chuna white cement surkhi mortar ratio with 30 m lead',
 '124 pcc work']

In [223]:
len(sample_sentences_processed)

2

In [224]:
# Convert text to a vector using spaCy
def document_to_vector_spacy(doc_text, nlp_model):
    doc = nlp_model(doc_text)
    return doc.vector

# Compute vectors for a list of documents
def compute_spacy_vectors(documents, nlp_model):
    vectors = [document_to_vector_spacy(doc, nlp_model) for doc in documents]
    return np.array(vectors)

In [225]:
def calculate_cosine_similarity(vector1, vector2):
    similarity = cosine_similarity([vector1], [vector2])
    return similarity[0][0]

In [226]:
for sentence in sample_sentences_processed:
    sample_vector = compute_spacy_vectors(sentence , nlp)

In [231]:
sample_vector.shape

(12, 300)