In [1]:
import pandas as pd
import re
import pymorphy2
from nltk.tokenize import word_tokenize 
from gensim.models import Word2Vec


from tqdm import tqdm
tqdm.pandas()

In [2]:
path = '../TF-IDF/women-clothing-accessories.3-class.balanced.csv'
df = pd.read_csv(path, encoding="utf8", sep='\t')

In [3]:
df = df[df['sentiment'] != 'neautral']

In [4]:
# убрали знаки препинания
df['review_processed'] = df['review'].apply(lambda x: re.sub(r'[^\w\s]', '', x)).values

In [5]:
df['review_processed'] = df['review_processed'].progress_apply(lambda x: word_tokenize(x))

100%|██████████| 60000/60000 [00:06<00:00, 8890.45it/s] 


In [6]:
morph = pymorphy2.MorphAnalyzer()

In [7]:
df['review_lemmatized'] = df['review_processed'].progress_apply(lambda x: [morph.parse(w)[0].normal_form for w in x])

100%|██████████| 60000/60000 [03:02<00:00, 329.63it/s]


In [8]:
w2v_model = Word2Vec(min_count=5,
                     window=2,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)

In [9]:
sent = df['review_lemmatized'].values
w2v_model.build_vocab(sent, progress_per=1000)
w2v_model.train(sent, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

(11524785, 35884950)

In [10]:
w2v_model.wv['качество']

array([-0.5673582 , -0.4235594 ,  0.6574695 , -0.07739633,  0.26882243,
        0.05577067, -0.48489583,  0.1176926 , -0.10880665, -0.23577942,
       -0.06415588, -0.8206807 , -0.7462041 ,  0.2386792 ,  0.53259796,
       -0.0106632 , -0.01083766,  0.38577062,  0.5862279 ,  0.02303083,
        0.1991865 ,  0.05275458, -0.6235246 , -0.52850085,  0.03812258,
        0.4045231 ,  0.08969192,  0.12912506, -0.2005748 , -0.08511876,
       -0.1182347 , -0.00876834,  0.09601021,  0.22756955, -0.6533377 ,
       -0.5079823 ,  0.02339345,  0.40496448, -0.58352417,  0.07745548,
        0.02597705, -0.52209204,  0.329115  ,  0.3435245 , -0.04041715,
        0.22901921, -1.3926383 , -0.00732074,  0.35793933,  0.24236599,
        0.3465896 , -0.37014413,  0.4644588 , -0.2240787 , -0.38615486,
       -0.17923155,  0.08022086,  0.19584914, -0.20136946, -0.47132868,
       -0.2339932 ,  0.5385101 , -0.08685377, -0.09517077, -0.6588447 ,
        0.13643958,  0.986363  , -0.1246378 ,  0.5732709 ,  0.00

In [11]:
w2v_model.wv.most_similar('ужасный')

[('отвратительный', 0.8528346419334412),
 ('плохой', 0.7408738136291504),
 ('ужас', 0.6534412503242493),
 ('ужастный', 0.6230824589729309),
 ('кошмарный', 0.6218550801277161),
 ('отвратный', 0.6166849732398987),
 ('жуткий', 0.5708467960357666),
 ('дешёвый', 0.5623089671134949),
 ('отстой', 0.5610623955726624),
 ('выброс', 0.5546766519546509)]

In [12]:
from scipy.spatial.distance import cosine

In [13]:
cosine(w2v_model.wv['хороший'], w2v_model.wv['ужасный'])

0.7229103744029999

In [14]:
cosine(w2v_model.wv['плохой'], w2v_model.wv['ужасный'])

0.2591262459754944

In [15]:
cosine(w2v_model.wv['отвратительный'], w2v_model.wv['ужасный'])

0.14716529846191406

## CatBoost

Сделать вектор для каждого слова

In [16]:
import numpy as np

In [17]:
# 1 - без TF-IDF (без учета важности слов)
# 2 - TF-IDF
variant = 1

In [18]:
# Вариант 1
if variant == 1:
    def get_text_vector(text):
        vecs = []
        for word in text:
            try:
                vecs.append(w2v_model.wv[word])
            except KeyError: # Для пропуска слов, которых нет в словаре
                continue
        
        return np.array(vecs).mean(axis=0)

In [19]:
# Вариант 2
if variant == 2:
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer()
    # Список слов в строку
    tf_idf_weights = vectorizer.fit_transform([', '.join(x) for x in df['review_lemmatized']])

In [20]:
if variant == 2:
    dictionary = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))
    dictionary['плохой']

In [21]:
if variant == 2:
    def get_text_vector(text):
        vecs = []
        for word in text:
            try:
                vecs.append(w2v_model.wv[word] * dictionary[word])
            except KeyError: # Для пропуска слов, которых нет в словаре
                continue
        
        return np.array(vecs).mean(axis=0)

In [22]:
df['vector'] = df['review_lemmatized'].progress_apply(get_text_vector)

  return np.array(vecs).mean(axis=0)
100%|██████████| 60000/60000 [00:02<00:00, 22993.24it/s]


Применим CatBoost

In [23]:
! pip install catboost




[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
import catboost
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

In [25]:
# пропустить, если не получились вектора
df = df[df['vector'].notna()]

In [26]:
# 2D Array -> 2D list
X = [list(x) for x in df['vector'].values]
# One Hot Encoding
y = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
catboost_model = CatBoostClassifier()

In [29]:
catboost_model.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=50)

Learning rate set to 0.082403
0:	learn: 0.6262545	test: 0.6262949	best: 0.6262949 (0)	total: 158ms	remaining: 2m 37s
50:	learn: 0.2450498	test: 0.2516616	best: 0.2516616 (50)	total: 1.42s	remaining: 26.4s
100:	learn: 0.2180560	test: 0.2308790	best: 0.2308790 (100)	total: 2.64s	remaining: 23.5s
150:	learn: 0.2021515	test: 0.2220587	best: 0.2220587 (150)	total: 3.91s	remaining: 22s
200:	learn: 0.1881478	test: 0.2167707	best: 0.2167707 (200)	total: 5.22s	remaining: 20.7s
250:	learn: 0.1762713	test: 0.2131131	best: 0.2131131 (250)	total: 6.53s	remaining: 19.5s
300:	learn: 0.1669222	test: 0.2112689	best: 0.2112689 (300)	total: 7.76s	remaining: 18s
350:	learn: 0.1578810	test: 0.2095917	best: 0.2095917 (350)	total: 8.99s	remaining: 16.6s
400:	learn: 0.1499806	test: 0.2081083	best: 0.2080756 (399)	total: 10.3s	remaining: 15.3s
450:	learn: 0.1428735	test: 0.2069936	best: 0.2069936 (450)	total: 11.8s	remaining: 14.4s
500:	learn: 0.1359917	test: 0.2061509	best: 0.2061509 (500)	total: 13.2s	remain

<catboost.core.CatBoostClassifier at 0x260c5617040>

In [30]:
y_pred = catboost_model.predict_proba(X_test)

In [31]:
from sklearn.metrics import roc_auc_score

In [32]:
roc_auc_score(y_test, y_pred[:, 1])

0.9743307056957566

roc_auc_score

с первой лабы: 0.9749

Вариант 1: 0.9752

Вариант 2: 0.9743