In [1]:
import joblib

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.base import TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression

from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import f1_score, accuracy_score, ConfusionMatrixDisplay

from collections import Counter

import re
import string

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# from nltk.stem.snowball import SnowballStemmer
# stemmer = SnowballStemmer("russian") 

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nikitabaramiya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# len('абвгдежзиклмнопрстухцчшщэюя')

In [3]:
# Xy_train_val.loc[Xy_train_val.target == '5', 'negative']

In [4]:
# [word if word.islower() else '' for word in word_tokenize(X_train_val.positive[0], 'russian')[1:]]

In [5]:
Xy_train_val = pd.read_csv('data/train.csv', index_col='review_id').fillna('Unknown')
X_train_val, y_train_val = Xy_train_val.iloc[:, :-1], Xy_train_val.iloc[:, -1] # .apply(lambda x: int(x[0]))

mb = MultiLabelBinarizer(classes=[str(i) for i in range(9)])
y_train_val = mb.fit_transform(y_train_val)

X_test = pd.read_csv('data/test.csv', index_col='review_id').fillna('Unknown')

for data in [X_train_val, X_test]:
    
    # class 0: special symbol
    data['xa_symbol_pos'] = (data['positive'].str.find('\xa0') != -1).astype(int)
    data['xa_symbol_neg'] = (data['negative'].str.find('\xa0') != -1).astype(int)
    
    # small preprocessing
    data['positive'] = data['positive'].str.replace(',', ', ').str.replace('.', '. ').apply(lambda x: re.sub(' +', ' ', x))
    data['negative'] = data['negative'].str.replace(',', ', ').str.replace('.', '. ').apply(lambda x: re.sub(' +', ' ', x))
    
    # class 8: length (woith round -1)
    data['length_pos'] = data['positive'].apply(lambda x: round(len(x), -1)) # .str.len() also works
    data.loc[data['length_pos'] > 1000, 'length_pos'] = 1000
    data['length_neg'] = data['negative'].apply(lambda x: round(len(x), -1))
    data.loc[data['length_neg'] > 1000, 'length_neg'] = 1000
    
    # class 
    data['max_pos'] = data['positive'].apply(lambda x: np.max([len(w) for w in x.split(' ')]))
    data.loc[data['max_pos'] > 25, 'max_pos'] = 25
    data['max_neg'] = data['negative'].apply(lambda x: np.max([len(w) for w in x.split(' ')]))
    data.loc[data['max_neg'] > 25, 'max_neg'] = 25
    
    #
    data['most_common_pos'] = data['positive'].apply(
        lambda x: Counter([w for w in x.split(' ')]).most_common(1)[0][1]
    )
    data.loc[data['most_common_pos'] > 25, 'most_common_pos'] = 25
    data['most_common_neg'] = data['negative'].apply(
        lambda x: Counter([w for w in x.split(' ')]).most_common(1)[0][1]
    )
    data.loc[data['most_common_neg'] > 25, 'most_common_neg'] = 25
    
    for col in ['city', 'position']:
        counts = data[col].value_counts()
        data.loc[data[col].isin(counts[counts < 5].index), col] = 'Прочее'
        
    cols = ['salary_rating', 'team_rating', 'managment_rating', 
            'career_rating', 'workplace_rating', 'rest_recovery_rating']
    
    for i in range(1, 5+1):
        data[f'count_{i}'] = (data.loc[:, cols] == i).sum(axis=1)
    
#     rus_alph = 'абвгдежзиклмнопрстуфхцчшщэюя'.upper()
#     initials = [i + '.' + j for i in rus_alph for j in rus_alph]
    
#     data['initials_pos'] = data['positive'].apply(lambda x: any(x.find(i) != -1 for i in initials))
#     data['initials_neg'] = data['negative'].apply(lambda x: any(x.find(i) != -1 for i in initials))
    
    data['rating_mean'] = data.loc[:, cols].mean(axis=1)
    data['rating_std'] = data.loc[:, cols].std(axis=1)
    
#     data['positive'] = data['positive'].apply(
#         lambda x: re.sub(' +', ' ', ' '.join(
#             [stemmer.stem(word) if word not in string.punctuation else '' for word in word_tokenize(x)]
#         ))
#     )
#     data['negative'] = data['negative'].apply(
#         lambda x: re.sub(' +', ' ', ' '.join(
#             [stemmer.stem(word) if word not in string.punctuation else '' for word in word_tokenize(x)]
#         ))
#     )

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

X_train_val.shape, y_train_val.shape, X_test.shape

  data['positive'] = data['positive'].str.replace(',', ', ').str.replace('.', '. ').apply(lambda x: re.sub(' +', ' ', x))
  data['negative'] = data['negative'].str.replace(',', ', ').str.replace('.', '. ').apply(lambda x: re.sub(' +', ' ', x))


((50876, 25), (50876, 9), (50651, 25))

In [6]:
class DummyTransformer(TransformerMixin):
    """
    Mini class to return initial features without transformation
    
    """
    def __init__(self, value=None):
        TransformerMixin.__init__(self)
        self.value = value
    
    def fit(self, *_):
        return self

    def transform(self, X):
        return X
    
    def get_params(self, deep=True):
        return {'value': self.value}

In [10]:
pipeline = Pipeline([
    ('transforms', ColumnTransformer([
        ('ohe', OneHotEncoder(handle_unknown='ignore'), [0, 1] + [*range(4, X_val.shape[1]-2)]),
        ('two_features', DummyTransformer(), [X_val.shape[1]-2, X_val.shape[1]-1]),
        ('tfidf1', TfidfVectorizer(ngram_range=(1, 4), max_df=0.999, min_df=0.001, 
                                   analyzer='char_wb'), 2),
        ('tfidf2', TfidfVectorizer(ngram_range=(1, 4), max_df=0.999, min_df=0.001, 
                                   analyzer='char_wb'), 3),
        ('count1', CountVectorizer(ngram_range=(1, 4), max_df=0.999, min_df=0.001, binary=True,
                                   analyzer='char_wb'), 2),
        ('count2', CountVectorizer(ngram_range=(1, 4), max_df=0.999, min_df=0.001, binary=True,
                                   analyzer='char_wb'), 3),
    ])),
    ('lr', OneVsRestClassifier(LogisticRegression(C=0.01, max_iter=500, n_jobs=-1, random_state=42)))
])
pipeline.fit(X_train_val, y_train_val)

Pipeline(steps=[('transforms',
                 ColumnTransformer(transformers=[('ohe',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  [0, 1, 4, 5, 6, 7, 8, 9, 10,
                                                   11, 12, 13, 14, 15, 16, 17,
                                                   18, 19, 20, 21, 22]),
                                                 ('two_features',
                                                  <__main__.DummyTransformer object at 0x7fcd91a50940>,
                                                  [23, 24]),
                                                 ('tfidf1',
                                                  TfidfVectorizer(analyzer='char_wb',
                                                                  max_df=0.999,
                                                                  min_df=0.001,
                                                      

In [11]:
joblib.dump(pipeline, 'model_18022022')

['model_18022022']

In [12]:
def predict_multilabel(model, X):
    y_pred = list(map(lambda x: ','.join(x), mb.inverse_transform(model.predict(X))))
    y_pred_top1 = model.predict_proba(X).argmax(axis=1)
    
    return np.where([len(x) > 0 for x in y_pred], y_pred, y_pred_top1)

In [14]:
model = joblib.load('model_18022022')

pd.DataFrame({
    'review_id': X_test.index, 
    'target': predict_multilabel(model, X_test) # model.predict(X_test).flatten()
}).to_csv('answers.csv', index=False)

In [15]:
# 0.811
# Pipeline(steps=[('transforms',
#                  ColumnTransformer(transformers=[('ohe',
#                                                   OneHotEncoder(handle_unknown='ignore'),
#                                                   [0, 1, 4, 5, 6, 7, 8, 9, 10,
#                                                    11, 12, 13, 14, 15, 16, 17,
#                                                    18, 19, 20, 21, 22]),
#                                                  ('two_features',
#                                                   <__main__.DummyTransformer object at 0x7fcd91a50940>,
#                                                   [23, 24]),
#                                                  ('tfidf1',
#                                                   TfidfVectorizer(analyzer='char_wb',
#                                                                   max_df=0.999,
#                                                                   min_df=0.001,
#                                                                   ngram_range=(1,
#                                                                                4)),
#                                                   2),
#                                                  ('tfidf2',
#                                                   TfidfV...
#                                                                   min_df=0.001,
#                                                                   ngram_range=(1,
#                                                                                4)),
#                                                   3),
#                                                  ('count1',
#                                                   CountVectorizer(analyzer='char_wb',
#                                                                   binary=True,
#                                                                   max_df=0.999,
#                                                                   min_df=0.001,
#                                                                   ngram_range=(1,
#                                                                                4)),
#                                                   2),
#                                                  ('count2',
#                                                   CountVectorizer(analyzer='char_wb',
#                                                                   binary=True,
#                                                                   max_df=0.999,
#                                                                   min_df=0.001,
#                                                                   ngram_range=(1,
#                                                                                4)),
#                                                   3)])),
#                 ('lr',
#                  OneVsRestClassifier(estimator=LogisticRegression(C=0.01,
#                                                                   max_iter=500,
#                                                                   n_jobs=-1,
#                                                                   random_state=42)))])
# 0.769645557967