In [167]:
import requests
import re
import string
import json
import random
import numpy as np
from fake_useragent import UserAgent
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from pymorphy2 import MorphAnalyzer
from bs4 import BeautifulSoup

In [28]:
# One shared Session lets requests reuse connections across the page downloads.
# `requests.session()` is only a backwards-compatibility alias for `requests.Session()`.
session = requests.Session()
# NOTE(review): `verify_ssl` appears to be accepted only by older fake_useragent
# releases — confirm against the installed version.
ua = UserAgent(verify_ssl=False)

In [29]:
# Morphological analyzer, created once and reused by the lemmatizing helpers
# (`tokenizer` and `preprocessing`) defined further down.
morph = MorphAnalyzer()

In [30]:
def find_names(s, db):  # find all the usernames
    """Register an empty entry in ``db`` for every rated review found in ``s``.

    Parameters
    ----------
    s : parsed page exposing ``find_all`` (a BeautifulSoup object in this notebook).
    db : dict mapping a reviewer key to the list of data collected for that review.
        It is modified in place.

    Returns
    -------
    dict
        The same ``db`` object, with one new ``key -> []`` entry per rated review.

    Notes
    -----
    The displayed names are not unique, so a repeated name gets a numeric suffix.
    Uniqueness is checked against ``db`` itself (the dict being filled), and the
    suffix is re-drawn until the key is free, so an existing entry is never
    overwritten.
    """
    no_rating = r'<div class="company-reviews-list-item-ratings">\n </div>'
    for review in s.find_all('div', {'class': 'company-reviews-list-item'}):  # for every review
        # not all the reviews have a star-rating; unrated ones are skipped
        if re.search(no_rating, review.prettify()):
            continue
        name = review.find('div', {'class': 'company-reviews-list-item-name'}).text.strip()
        name = re.sub(r'\t+', ' ', name)
        key = name
        attempts = 0
        while key in db:
            attempts += 1
            if attempts <= 100:
                # usual case: a random suffix, as in the original naming scheme
                key = name + '_' + str(random.randint(1, 1000))
            else:
                # the random range is (almost) exhausted: fall back to a growing
                # counter so the loop is guaranteed to terminate
                key = name + '_' + str(1000 + attempts)
        db[key] = []
    return db

In [31]:
def find_review(s, db):  # find all the text reviews
    """Append the text messages of every rated review to the entries of ``db``.

    Each message goes to the first entry of ``db`` (in dict order) that holds
    fewer than two texts, because a review consists of a "pluses" text and a
    "minuses" text. A message is dropped when no such entry exists.
    ``db`` is modified in place and returned.
    """
    empty_rating = r'<div class="company-reviews-list-item-ratings">\n </div>'
    nothing = object()  # sentinel: no entry is waiting for a text
    for item in s.find_all('div', {'class': 'company-reviews-list-item'}):
        if re.search(empty_rating, item.prettify()):  # unrated review, skip it
            continue
        messages = item.find_all('div', {'class': 'company-reviews-list-item-text-message'})
        for message in messages:
            target = next((key for key in db if len(db[key]) < 2), nothing)
            if target is not nothing:
                db[target].append(message.text.strip())
    return db

In [32]:
def find_labels(s):  # find labels of the star-grades
    """Collect the grading-criterion labels of all rated reviews in ``s``.

    The labels (for example 'Условия труда' or 'Карьерный рост') are returned
    as a NumPy column array of shape ``(n, 1)``, in page order.
    """
    empty_rating = r'<div class="company-reviews-list-item-ratings">\n </div>'
    collected = [
        span.text.strip()
        for item in s.find_all('div', {'class': 'company-reviews-list-item'})
        if not re.search(empty_rating, item.prettify())  # skip reviews without a rating
        for span in item.find_all('span', {'class': 'company-reviews-list-item-ratings-item-label'})
    ]
    return np.array(collected).reshape(-1, 1)


def find_ratings(s):  # find a rating for each label
    """Collect the ``data-rating`` attribute of every star widget in rated reviews.

    The values are returned as a NumPy column array of shape ``(n, 1)``, in
    page order, so they line up row by row with the output of ``find_labels``.
    """
    empty_rating = r'<div class="company-reviews-list-item-ratings">\n </div>'
    grades = [
        stars.get('data-rating')  # the star grade of one criterion
        for item in s.find_all('div', {'class': 'company-reviews-list-item'})
        if not re.search(empty_rating, item.prettify())  # skip reviews without a rating
        for stars in item.find_all('span', {'class': 'company-reviews-list-item-ratings-item-stars'})
    ]
    return np.array(grades).reshape(-1, 1)


def get_stars(lab, rat, db):
    """Attach a ``{criterion: grade}`` dict to the entries of ``db`` that hold two texts.

    ``lab`` and ``rat`` are column arrays of equal length (as returned by
    ``find_labels`` and ``find_ratings``). They are joined row by row and cut
    into groups, one group per review. A group is recognised by its first
    criterion: 'Соц.пакет:' opens a group of 3 rows and 'Коллектив:' opens a
    group of 5 rows. Every complete group is appended to the first entry of
    ``db`` (in dict order) whose list has exactly two elements.

    Rows that do not open a recognised group, and a trailing group that is cut
    short, are skipped. Nothing is attached for them: they neither re-attach the
    previous review's grades to the next user nor raise ``IndexError``.

    ``db`` is modified in place and returned.
    """
    stars = np.concatenate((lab, rat), axis=1)  # rows of [label, rating]
    group_sizes = {'Соц.пакет:': 3, 'Коллектив:': 5}  # first criterion -> rows in the group
    total = len(stars)
    row = 0
    while row < total:
        # only the label column decides where a group starts
        size = group_sizes.get(stars[row][0])
        if size is None or row + size > total:
            row += 1
            continue
        dic = {stars[k][0]: stars[k][1] for k in range(row, row + size)}
        row += size  # the remaining criteria of this group are already consumed
        for name in db:  # add the grades to the data
            if len(db[name]) == 2:
                db[name].append(dic)
                break
    return db

In [46]:
def splitter(db):  # to divide into bad reviews and good ones
    """Split the reviews in ``db`` into equally sized positive and negative lists.

    Each usable entry of ``db`` has the form ``[text_1, text_2, {criterion: grade}]``.
    The grades are averaged: an average above 3 marks the review as good, below 3
    as bad, and exactly 3 is left out. The two texts are joined with a newline.
    The longer list is truncated so that both classes have the same size.

    Entries without a non-empty grade dict in the last position, or with fewer
    than three elements, cannot be classified and are skipped.

    Side effect: writes ``good_reviews.txt`` and ``bad_reviews.txt`` (UTF-8) to the
    current directory.

    Returns
    -------
    (list[str], list[str])
        ``(goods, bads)``.
    """
    goods = []
    bads = []
    for item in db:
        entry = db[item]
        if len(entry) < 3 or not isinstance(entry[-1], dict) or not entry[-1]:
            continue  # incomplete record: texts or grades are missing
        grades = list(map(int, entry[-1].values()))  # grades into integers
        average = sum(grades) / len(grades)  # average grade over all the criteria
        if average == 3:  # neutral reviews are not used
            continue
        # read the texts from the dict that was passed in, not from a global
        text = entry[0] + '\n' + entry[1]
        if average > 3:
            goods.append(text)
        else:
            bads.append(text)
    size = min(len(goods), len(bads))  # make the classes equal
    goods = goods[:size]
    bads = bads[:size]
    with open('good_reviews.txt', 'w', encoding='utf-8') as f_good:
        f_good.write('\n'.join(goods))
    with open('bad_reviews.txt', 'w', encoding='utf-8') as f_bad:
        f_bad.write('\n'.join(bads))
    return goods, bads

In [34]:
def checker(word):  # checker for digits and punctuation
    """Return True when ``word`` contains no ASCII punctuation mark and no ASCII digit.

    Only the characters of ``string.punctuation`` and ``string.digits`` are
    rejected; other symbols are not examined.
    """
    forbidden = string.punctuation + string.digits
    return not any(symbol in word for symbol in forbidden)

In [35]:
def tokenizer(text):  # tokenizer...
    """Turn every string in ``text`` into a list of lower-cased, lemmatized tokens.

    Tokens rejected by ``checker`` (those containing punctuation or digits) are
    dropped. The result has one token list per input string, in the same order.
    """
    def _lemma(raw):
        # lower-case first, then take the normal form of the first parse
        return morph.parse(raw.lower())[0].normal_form

    return [
        [_lemma(token) for token in word_tokenize(chunk) if checker(token)]
        for chunk in text
    ]

In [79]:
def create_dic(tokenized_text, good_or_bad):  # frequency dictionary of the tokens
    """Count token frequencies separately for positive and negative reviews.

    ``tokenized_text[i]`` is the token list of review ``i`` and ``good_or_bad[i]``
    is its label; label ``1`` means positive, anything else negative.

    Returns ``(good_tokens, bad_tokens, good_vals, bad_vals)``: the two frequency
    dicts ordered from the most to the least frequent token, followed by the sets
    of their keys.
    """
    positive = {}
    negative = {}
    for position, tokens in enumerate(tokenized_text):
        for tok in tokens:
            bucket = positive if good_or_bad[position] == 1 else negative
            bucket[tok] = bucket.get(tok, 0) + 1

    def _most_frequent_first(counts):
        ranked = list(counts.items())
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return dict(ranked)

    good_tokens = _most_frequent_first(positive)  # good dictionary sorting
    bad_tokens = _most_frequent_first(negative)  # bad dictionary sorting
    return good_tokens, bad_tokens, set(good_tokens), set(bad_tokens)

In [37]:
def del_small(words, dictionary, threshold=2):  # delete non-frequent tokens
    """Keep only the items of ``words`` that occur more than ``threshold`` times.

    Parameters
    ----------
    words : iterable of keys to filter.
    dictionary : mapping from each key to its frequency; every item of ``words``
        must be present in it (a missing key raises ``KeyError``).
    threshold : items are kept when their frequency is strictly greater than this
        value. The default of 2 keeps the previous behaviour.

    Returns
    -------
    list
        The retained items, in the iteration order of ``words``.
    """
    return [word for word in words if dictionary[word] > threshold]

#### Скачиваем данные:

Используем краулер (всего 70 страниц, так как дальше начинаются выбросы)

In [38]:
# Crawl the paginated review list and accumulate everything in `db_mts`.
db_mts = dict()
# NOTE(review): the range starts at 2, so pages 2..70 (69 pages) are fetched —
# confirm that leaving out page 1 is intended.
for page_number in range(2, 71):
    url = f'https://pravda-sotrudnikov.ru/company/mts-3?page={page_number}'
    # a timeout prevents one stalled connection from hanging the whole crawl
    req = session.get(url, headers={'User-Agent': ua.random}, timeout=30)
    # fail loudly on HTTP errors instead of silently parsing an error page
    req.raise_for_status()
    page = req.text
    soup = BeautifulSoup(page, 'html.parser')
    # the three passes rely on each other's ordering: names first, then the
    # texts, then the grades
    db_mts = find_names(soup, db_mts)
    db_mts = find_review(soup, db_mts)
    db_mts = get_stars(find_labels(soup), find_ratings(soup), db_mts)

Сохраняем полученную базу данных в файл типа JSON

In [42]:
# Persist the scraped data; ensure_ascii=False writes the Cyrillic text as-is
# instead of \u escapes.
with open('result.json', 'w', encoding='utf-8') as file:
    json.dump(db_mts, file, indent=4, ensure_ascii=False)

Делим отзывы на положительные и отрицательные (т.к. оценка колеблется между 1 и 5, 3 мы не учитываем)

In [53]:
# Equal-sized lists of positive and negative review texts
# (splitter also writes good_reviews.txt and bad_reviews.txt).
good_reviews, bad_reviews = splitter(db_mts)

Создаем список всех отзывов и список "правильных" значений

In [62]:
# Raw review texts: every positive review first, then every negative one.
X = [*good_reviews, *bad_reviews]
# Labels aligned with X: 1 for each positive review, 0 for each negative one.
y = np.repeat([1, 0], [len(good_reviews), len(bad_reviews)])

#### Токенизируем слова, приводим их к нижнему регистру и к начальной форме

In [66]:
# NOTE: this overwrites X (raw strings) with lists of lemmas, so the cell is not
# idempotent — re-run the cell that builds X before running this one again.
X = tokenizer(X)

Делим выборку на тренировочную и тестовую

In [68]:
# Hold out 15% of the reviews for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

Создаем словари и множества слов, встречающихся в плохих и хороших отзывах:

In [80]:
# Token frequencies and vocabularies, computed on the training part only.
good_dic, bad_dic, good_words, bad_words = create_dic(X_train, y_train)

#### Составляем 2 множества: 
в одном слова, которые встречаются только в положительных отзывах, а в другом - встречающиеся только в отрицательных

In [82]:
# Tokens seen exclusively in positive (resp. negative) training reviews.
only_good = good_words - bad_words
only_bad = bad_words - good_words

#### Исключаем шум (слишком малочастотные слова):

In [83]:
# Drop the class-exclusive tokens that occur 2 times or fewer in their class.
really_bad = del_small(only_bad, bad_dic)
really_good = del_small(only_good, good_dic)

#### Создаем функцию, которая будет определять, положительный ли отзыв или отрицательный в зависимости от того, какие слова встретились в нём:

In [156]:
def good_or_bad_checker(test, good, bad):
    """Classify each tokenized review by counting its marker tokens.

    Parameters
    ----------
    test : iterable of token lists, one per review.
    good : iterable of tokens that indicate a positive review.
    bad : iterable of tokens that indicate a negative review.

    Returns
    -------
    list[int]
        One prediction per review: 1 when more positive than negative markers
        were found, 0 in the opposite case. A tie (including a review with no
        markers at all) is decided by ``random.randint(0, 1)``, so tied
        predictions vary between runs unless ``random`` is seeded.
    """
    # Build the lookup sets once: membership tests become O(1) instead of
    # scanning a list per token, and one-shot iterables are not exhausted
    # after the first review.
    good_markers = set(good)
    bad_markers = set(bad)
    preds = []
    for review in test:
        for_good = 0
        for_bad = 0
        for token in review:
            if token in good_markers:
                for_good += 1
            elif token in bad_markers:
                for_bad += 1
        if for_good > for_bad:
            preds.append(1)
        elif for_bad > for_good:
            preds.append(0)
        else:
            preds.append(random.randint(0, 1))  # no evidence either way
    return preds

In [157]:
# Predict the held-out reviews from the class-exclusive frequent tokens.
y_preds = good_or_bad_checker(X_test, really_good, really_bad)

#### Считаем качество при помощи *accuracy*:

In [158]:
# Ties are broken at random inside good_or_bad_checker and `random` is not seeded,
# so this score can differ slightly between runs.
accuracy_score(y_test, y_preds)

0.7410714285714286

#### 3 способа улучшить эту программу:
1. посмотреть не на токены, а на их сочетания (биграммы)
2. попробовать отследить все *не* и менять следующие слова на их антонимы
3. можно проставить веса и не убирать те вхождения, которые встречаются как в положительных, так и в отрицательных отзывах, но в одном типе намного чаще

Попробуем первый способ:

In [132]:
def create_dic_bigrams(tokenized_text, good_or_bad):  # frequency dictionary of the bigrams
    """Count bigram frequencies separately for positive and negative reviews.

    A bigram is two neighbouring tokens of one review joined by a single space.
    ``good_or_bad[i]`` is the label of review ``i``; label ``1`` means positive,
    anything else negative.

    Returns ``(good_dic, bad_dic, good_vals_bigrams, bad_vals_bigrams)``: the two
    frequency dicts ordered from the most to the least frequent bigram, followed
    by the sets of their keys.
    """
    positive = {}
    negative = {}
    for position, tokens in enumerate(tokenized_text):
        for left in range(len(tokens) - 1):
            pair = tokens[left] + ' ' + tokens[left + 1]
            bucket = positive if good_or_bad[position] == 1 else negative
            bucket[pair] = bucket.get(pair, 0) + 1

    def _most_frequent_first(counts):
        ranked = list(counts.items())
        ranked.sort(key=lambda entry: entry[1], reverse=True)
        return dict(ranked)

    good_dic = _most_frequent_first(positive)  # good dictionary sorting
    bad_dic = _most_frequent_first(negative)  # bad dictionary sorting
    return good_dic, bad_dic, set(good_dic), set(bad_dic)

In [133]:
# Bigram frequencies and vocabularies, computed on the training part only.
good_dic_bigrams, bad_dic_bigrams, good_bigrams, bad_bigrams = create_dic_bigrams(X_train, y_train)

In [135]:
# Bigrams seen exclusively in positive (resp. negative) training reviews.
only_good_bigrams = good_bigrams - bad_bigrams
only_bad_bigrams = bad_bigrams - good_bigrams

In [138]:
# Drop the class-exclusive bigrams that occur 2 times or fewer in their class.
really_bad_bigrams = del_small(only_bad_bigrams, bad_dic_bigrams)
really_good_bigrams = del_small(only_good_bigrams, good_dic_bigrams)

In [159]:
def good_or_bad_checker_bigrams(test, good, bad):
    """Classify each tokenized review by counting its marker bigrams.

    A bigram is two neighbouring tokens of a review joined by a single space.

    Parameters
    ----------
    test : iterable of token lists, one per review.
    good : iterable of bigrams that indicate a positive review.
    bad : iterable of bigrams that indicate a negative review.

    Returns
    -------
    list[int]
        One prediction per review: 1 when more positive than negative markers
        were found, 0 in the opposite case. A tie (including a review with no
        markers or with fewer than two tokens) is decided by
        ``random.randint(0, 1)``, so tied predictions vary between runs unless
        ``random`` is seeded.
    """
    # Build the lookup sets once: membership tests become O(1) instead of
    # scanning a list per bigram, and one-shot iterables are not exhausted
    # after the first review.
    good_markers = set(good)
    bad_markers = set(bad)
    preds = []
    for review in test:
        for_good = 0
        for_bad = 0
        for token_id in range(len(review) - 1):
            bigram = review[token_id] + ' ' + review[token_id + 1]
            if bigram in good_markers:
                for_good += 1
            elif bigram in bad_markers:
                for_bad += 1
        if for_good > for_bad:
            preds.append(1)
        elif for_bad > for_good:
            preds.append(0)
        else:
            preds.append(random.randint(0, 1))  # no evidence either way
    return preds

In [160]:
# Predict the held-out reviews from the class-exclusive frequent bigrams.
y_preds_bigram = good_or_bad_checker_bigrams(X_test, really_good_bigrams, really_bad_bigrams)

И... результат стал хуже (но немного). Возможно, потому что корпус недостаточно большой для биграмм.

In [161]:
# Ties are broken at random inside good_or_bad_checker_bigrams and `random` is not
# seeded, so this score can differ slightly between runs.
accuracy_score(y_test, y_preds_bigram)

0.7232142857142857

Зато можно сделать лучше при помощи TF-IDF!

In [170]:
def preprocessing(texts):
    """Lemmatize every string in ``texts`` and return the results as strings.

    Each text is tokenized, tokens that are not purely alphabetic are dropped,
    the remaining tokens are replaced by the normal form of their first parse,
    and the lemmas are joined back with single spaces.
    """
    def _lemmatize(raw_text):
        lemmas = []
        for token in word_tokenize(raw_text):  # tokenize
            if token.isalpha():
                lemmas.append(morph.parse(token)[0].normal_form)  # lemmatize
        return ' '.join(lemmas)

    return [_lemmatize(raw_text) for raw_text in texts]

In [171]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [176]:
# Start again from the raw review strings: X was overwritten with token lists earlier.
X_tfidf = good_reviews + bad_reviews

In [177]:
# Same test_size and random_state as the split used for the token-based classifiers.
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y, test_size=0.15, random_state=0)

In [178]:
# The vectorizer is fitted on the training texts only and then applied unchanged
# to the test texts.
# NOTE: X_train_tfidf / X_test_tfidf are overwritten (raw strings -> matrices), so
# re-running this cell alone would pass matrices to preprocessing; re-run the split first.
tfidf_vec = TfidfVectorizer()
X_train_tfidf = tfidf_vec.fit_transform(preprocessing(X_train_tfidf))
X_test_tfidf = tfidf_vec.transform(preprocessing(X_test_tfidf))

In [180]:
# Logistic regression with default settings, trained on the TF-IDF features
# and evaluated on the held-out reviews.
clf = LogisticRegression()
clf.fit(X_train_tfidf, y_train_tfidf)
y_preds_tfidf = clf.predict(X_test_tfidf)

In [181]:
# Accuracy of the TF-IDF + logistic regression model on the held-out reviews.
accuracy_score(y_test_tfidf, y_preds_tfidf)

0.8660714285714286