In [1]:
import pandas as pd
import numpy as np
import os
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import gensim
import gensim.downloader
from gensim.models import word2vec
from gensim.models import KeyedVectors

from sklearn.metrics.pairwise import cosine_similarity

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# загружаем данные (предварительно скачал данные с kaggle)
df = pd.read_csv("sample-data.csv")
df.head()

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."


In [3]:
df['description'][0]

'Active classic boxers - There\'s a reason why our boxers are a cult favorite - they keep their cool, especially in sticky situations. The quick-drying, lightweight underwear takes up minimal space in a travel pack. An exposed, brushed waistband offers next-to-skin softness, five-panel construction with a traditional boxer back for a classic fit, and a functional fly. Made of 3.7-oz 100% recycled polyester with moisture-wicking performance. Inseam (size M) is 4 1/2". Recyclable through the Common Threads Recycling Program.<br><br><b>Details:</b><ul> <li>"Silky Capilene 1 fabric is ultralight, breathable and quick-to-dry"</li> <li>"Exposed, brushed elastic waistband for comfort"</li> <li>5-panel construction with traditional boxer back</li> <li>"Inseam (size M) is 4 1/2"""</li></ul><br><br><b>Fabric: </b>3.7-oz 100% all-recycled polyester with Gladiodor natural odor control for the garment. Recyclable through the Common Threads Recycling Program<br><br><b>Weight: </b>99 g (3.5 oz)<br><b

Видно, что в данных есть HTML-теги, адреса URL, различный регистр. Это нужно будет учесть при предобработке данных.

# Предобработка текста

In [4]:
# Загрузка стоп-слов
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
additional_stop_words = {"g", "oz"} #добавляем, т.к. судя по тексту - это некие единицы измерения, которые нам также ни к чему.
stop_words.update(additional_stop_words)

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060]
[nltk_data]     Попытка установить соединение была безуспешной, т.к.
[nltk_data]     от другого компьютера за требуемое время не получен
[nltk_data]     нужный отклик, или было разорвано уже установленное
[nltk_data]     соединение из-за неверного отклика уже подключенного
[nltk_data]     компьютера>
[nltk_data] Error loading punkt: <urlopen error [WinError 10060]
[nltk_data]     Попытка установить соединение была безуспешной, т.к.
[nltk_data]     от другого компьютера за требуемое время не получен
[nltk_data]     нужный отклик, или было разорвано уже установленное
[nltk_data]     соединение из-за неверного отклика уже подключенного
[nltk_data]     компьютера>


In [5]:
def preprocess_text(text):
    # Удаление HTML-тегов
    text = re.sub(r'<.*?>', ' ', text)
    
    # Удаление URL
    text = re.sub(r'http\S+', ' ', text)
    
    # Приведение к нижнему регистру
    text = text.lower()
    
    # Удаление символов
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\(|\)', ' ', text)
    
    # Удаление стоп-слов
    tokens = text.split()
    text = ' '.join([word for word in tokens if word not in stop_words])
    
    # Объединение нескольких пробелов в один
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [6]:
# Применим функцию предобработки ко всему столбцу "description"
df['description'] = df['description'].astype(str).apply(preprocess_text)

In [7]:
df['description'][0]

'active classic boxers theres reason boxers cult favorite keep cool especially sticky situations quickdrying lightweight underwear takes minimal space travel pack exposed brushed waistband offers nexttoskin softness fivepanel construction traditional boxer back classic fit functional fly made recycled polyester moisturewicking performance inseam size recyclable common threads recycling program details silky capilene fabric ultralight breathable quicktodry exposed brushed elastic waistband comfort panel construction traditional boxer back inseam size fabric allrecycled polyester gladiodor natural odor control garment recyclable common threads recycling program weight made mexico'

# Применяем TF-IDF для векторизации

In [8]:
# Создание экземпляра векторизатора
tfidf_vectorizer = TfidfVectorizer(binary=True, max_features=25000)

# Применение векторизатора к столбцу с предобработанными текстами
tfidf_matrix_1 = tfidf_vectorizer.fit_transform(df['description'])
print(tfidf_matrix_1)
tfidf_matrix_1.shape

  (0, 2613)	0.1316024816687609
  (0, 4893)	0.03344919941454921
  (0, 1741)	0.11869673062875985
  (0, 859)	0.11531742775075296
  (0, 2849)	0.10749344337756421
  (0, 2753)	0.11082145676218359
  (0, 1775)	0.11869673062875985
  (0, 114)	0.08136577663115478
  (0, 2980)	0.11781883060667794
  (0, 782)	0.07633216314130924
  (0, 1291)	0.09372645609708062
  (0, 3293)	0.17350762044532403
  (0, 472)	0.08835561831685945
  (0, 4639)	0.14629394061787335
  (0, 1438)	0.0329192998466384
  (0, 571)	0.11225295620099249
  (0, 3811)	0.14629394061787335
  (0, 1083)	0.0329192998466384
  (0, 3216)	0.04444559910977938
  (0, 3363)	0.04444559910977938
  (0, 4414)	0.04444559910977938
  (0, 793)	0.04444559910977938
  (0, 3359)	0.04444559910977938
  (0, 3849)	0.08663170089925053
  (0, 2159)	0.08499358684525433
  :	:
  (499, 5038)	0.09772271056299581
  (499, 4988)	0.13732313041617725
  (499, 4702)	0.14261797502048357
  (499, 4446)	0.12585071929688305
  (499, 2873)	0.1011228761327938
  (499, 4622)	0.09280731325899352


(500, 5053)

Попробуем еще использовать нашу обученную модель tfidf

In [9]:
def tokenize_text(document):
    # Предобработка текста
    document = preprocess_text(document)  
    return document.split()

In [10]:
documents = df['description'].astype(str).tolist()

In [11]:
def tf(documents):
    tf_list = []
    for doc in documents:
        term_counter = Counter(tokenize_text(doc))
        total_term = sum(term_counter.values())
        tf_dict = {term: count/total_term for term, count in term_counter.items()}
        tf_list.append(tf_dict)
    return tf_list

def idf(documents):
    idf_list = {}
    total_words = len(documents)
    for doc in documents:
        terms_in_doc = set(tokenize_text(doc))
        for term in terms_in_doc:
            idf_list[term] = idf_list.get(term, 0) +1
    
    for term, doc_count in idf_list.items():
        idf_list[term] = np.log(total_words/doc_count)

    return idf_list

def tfidf(documents):
    tf_list = tf(documents)
    idf_dict = idf(documents)
    
    tfidf_list = []
    for tf_dict in tf_list:
        tfidf_dict = {term: tf * idf_dict.get(term, 0) for term, tf in tf_dict.items()}
        tfidf_list.append(tfidf_dict)
    
    return tfidf_list

In [12]:
# Вычисление TF-IDF
tfidf_list = tfidf(documents)
tfidf_list

[{'active': 0.03658499118161743,
  'classic': 0.0620969141854411,
  'boxers': 0.11634490933258558,
  'theres': 0.044936162031737246,
  'reason': 0.07487479636653244,
  'cult': 0.06652362551641261,
  'favorite': 0.044936162031737246,
  'keep': 0.02013630501388178,
  'cool': 0.026814747612745993,
  'especially': 0.07487479636653244,
  'sticky': 0.047132807294315016,
  'situations': 0.06652362551641261,
  'quickdrying': 0.027039954034579157,
  'lightweight': 0.013212220313318514,
  'underwear': 0.05817245466629279,
  'takes': 0.0616385037319769,
  'minimal': 0.04005104024730153,
  'space': 0.0422476855098793,
  'travel': 0.04073969583573465,
  'pack': 0.02750340320117911,
  'exposed': 0.11096795628886968,
  'brushed': 0.051090687619279304,
  'waistband': 0.04158968020939884,
  'offers': 0.03043046559407537,
  'nexttoskin': 0.026593673652888207,
  'softness': 0.026593673652888207,
  'fivepanel': 0.07487479636653244,
  'construction': 0.04953554253162699,
  'traditional': 0.0861578498507694

In [13]:
def compute_tfidf_matrix(documents):
    vectorizer = CountVectorizer(stop_words='english')
    vectorizer.fit_transform(documents)
    term_index = vectorizer.vocabulary_
    
    tfidf_list = tfidf(documents)
    
    tfidf_matrix = lil_matrix((len(documents), len(term_index)))
    for doc_index, tfidf_dict in enumerate(tfidf_list):
        for term, tfidf_value in tfidf_dict.items():
            term_idx = term_index.get(term)
            if term_idx is not None:
                tfidf_matrix[doc_index, term_idx] = tfidf_value
                
    return tfidf_matrix.tocsr(), vectorizer.get_feature_names_out()

In [14]:
tfidf_matrix, feature_names = compute_tfidf_matrix(documents)
print(tfidf_matrix)
tfidf_matrix.shape

  (0, 35)	0.03658499118161743
  (0, 112)	0.017812164458165018
  (0, 432)	0.11634490933258558
  (0, 433)	0.11634490933258558
  (0, 447)	0.020395415920158453
  (0, 473)	0.051090687619279304
  (0, 544)	0.02928215017474615
  (0, 669)	0.0620969141854411
  (0, 755)	0.015954927351812507
  (0, 766)	0.008457275248773365
  (0, 815)	0.04953554253162699
  (0, 832)	0.03043046559407537
  (0, 843)	0.026814747612745993
  (0, 946)	0.06652362551641261
  (0, 1259)	0.02238432857509157
  (0, 1336)	0.07487479636653244
  (0, 1372)	0.11096795628886968
  (0, 1432)	0.044936162031737246
  (0, 1494)	0.013653057029370226
  (0, 1499)	0.07487479636653244
  (0, 1571)	0.02079484010469942
  (0, 1671)	0.047132807294315016
  (0, 1691)	0.03169986939718171
  (0, 1723)	0.03169986939718171
  (0, 2102)	0.038304464702118575
  :	:
  (499, 3055)	0.05423546917495592
  (499, 3139)	0.008774423070602366
  (499, 3159)	0.06901826147327808
  (499, 3268)	0.06394994762192603
  (499, 3269)	0.02982458377416371
  (499, 3281)	0.0087744230706

(500, 4938)

Наша функция отрабатывает немногим хуже, поэтому воспользуемся тем, что дал предлагает нам уже готовая написанная библиотека TfidfVectorizer.

# Вычисляем косинусное расстояние:

In [15]:
# Вычисляем попарно косинусное расстояние между векторами в матрице
cosine_sim_matrix = cosine_similarity(tfidf_matrix_1)
# Вывод косинусного сходства между первым и вторым документами
print(cosine_sim_matrix[0, 1])

0.16829656019151024


In [16]:
#уставим порог в 0.8
indexes = np.transpose(np.nonzero(cosine_sim_matrix > 0.8))

In [17]:
# Выводим пары индексов продуктов с косинусным сходством больше 0.8 вместе с их описаниями
print("Product pairs with high cosine similarity:")
for idx1, idx2 in indexes:
    if idx1 < idx2:  # чтобы не дублировать пары
        print(f"Product {idx1} is similar to product {idx2} with cosine similarity {cosine_sim_matrix[idx1, idx2]}")
        print("Description of product", idx1, ":", df['description'].iloc[idx1])
        print("Description of product", idx2, ":", df['description'].iloc[idx2])
        print("\n")

Product pairs with high cosine similarity:
Product 3 is similar to product 158 with cosine similarity 0.9361193477801297
Description of product 3 : alpine guide pants skin climb ice switch rock traverse knifeedge ridge boogie back durable weatherresistant breathable softshell pants keep stride every mountain endeavor midweight stretchwoven polyester wont restrict moves brushed interior maintains nexttoskin comfort way zippered fly keeps things easy harness gusseted zippered cuffs position snaps tiedown loops waterresistant zippered pockets two front two thigh one back hip recyclable common threads recycling program details durable stretchwoven polyester recycled dwr durable water repellent finish water windresistant highly breathable brushed interior nexttoskin comfort waistband belt loops elastic back way zippered fly external pockets two front slash two gluedon thigh one back hip highly waterresistant dwrfinished zippers gusseted zippered cuff position adjustable settings tiedown loo

# Используем W2V для векторизации

In [18]:
# Загружаем предобученную модель
word2vec_model = gensim.downloader.load("word2vec-google-news-300")

In [19]:
# Функция для преобразования каждого текста в вектор
def text_to_vector(text):
    words = text.split()
    word_vectors = [word2vec_model[word] for word in words if word in word2vec_model.key_to_index]
    if word_vectors:
        text_vector = np.mean(word_vectors, axis=0)
    else:
        text_vector = np.zeros((300,))
    return text_vector

# Преобразование всех текстов в векторы из столбца description вашего датафрейма
text_vectors = np.array([text_to_vector(text) for text in df['description'].astype(str)])

In [20]:
text_vectors

array([[-0.0167277 ,  0.0591506 ,  0.00576579, ..., -0.05365387,
         0.01319255,  0.02411219],
       [-0.05131149,  0.06214586, -0.00936322, ..., -0.04542533,
        -0.00452654,  0.02974594],
       [-0.04460508,  0.0756795 , -0.02917298, ..., -0.05203475,
         0.02210657,  0.00766674],
       ...,
       [-0.00969224,  0.10139713, -0.03568929, ..., -0.04014486,
         0.03997396,  0.04012393],
       [-0.00142297,  0.06637006, -0.02559609, ..., -0.05074168,
         0.04202404, -0.00560565],
       [-0.02384196,  0.06167114, -0.02783783, ..., -0.03634623,
         0.04027278, -0.01756093]], dtype=float32)

In [21]:
# Вычисляем попарно косинусное расстояние между векторами в матрице, округляем значения до 5 знака после запятой
cos_matrix_w2v = np.around(cosine_similarity(text_vectors), 5)
cos_matrix_w2v

array([[1.     , 0.924  , 0.87996, ..., 0.87295, 0.87711, 0.89696],
       [0.924  , 1.     , 0.94335, ..., 0.8839 , 0.88246, 0.88969],
       [0.87996, 0.94335, 1.     , ..., 0.86466, 0.83776, 0.85104],
       ...,
       [0.87295, 0.8839 , 0.86466, ..., 1.     , 0.86754, 0.85358],
       [0.87711, 0.88246, 0.83776, ..., 0.86754, 1.     , 0.94413],
       [0.89696, 0.88969, 0.85104, ..., 0.85358, 0.94413, 1.     ]],
      dtype=float32)

In [22]:
#уставим порог в 0.8
indexes_2 = np.transpose(np.nonzero(cos_matrix_w2v > 0.8))

In [23]:
# Выводим пары индексов продуктов с косинусным сходством больше 0.8 вместе с их описаниями
print("Product pairs with high cosine similarity:")
for idx1, idx2 in indexes:
    if idx1 < idx2:  # чтобы не дублировать пары
        print(f"Product {idx1} is similar to product {idx2} with cosine similarity {cos_matrix_w2v[idx1, idx2]}")
        print("Description of product", idx1, ":", df['description'].iloc[idx1])
        print("Description of product", idx2, ":", df['description'].iloc[idx2])
        print("\n")

Product pairs with high cosine similarity:
Product 3 is similar to product 158 with cosine similarity 0.997439980506897
Description of product 3 : alpine guide pants skin climb ice switch rock traverse knifeedge ridge boogie back durable weatherresistant breathable softshell pants keep stride every mountain endeavor midweight stretchwoven polyester wont restrict moves brushed interior maintains nexttoskin comfort way zippered fly keeps things easy harness gusseted zippered cuffs position snaps tiedown loops waterresistant zippered pockets two front two thigh one back hip recyclable common threads recycling program details durable stretchwoven polyester recycled dwr durable water repellent finish water windresistant highly breathable brushed interior nexttoskin comfort waistband belt loops elastic back way zippered fly external pockets two front slash two gluedon thigh one back hip highly waterresistant dwrfinished zippers gusseted zippered cuff position adjustable settings tiedown loop