In [15]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegressionCV

from sklearn.metrics import f1_score

In [16]:
# Hand-written list of Russian stop words (high-frequency function words).
# NOTE(review): no cell visible in this notebook references `stopwords`; presumably it was
# meant to be passed as TfidfVectorizer(stop_words=...) — confirm, or drop the cell.
stopwords = [
    'и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 
    'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 
    'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 
    'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 
    'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 
    'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 
    'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 
    'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'можно', 'при', 'наконец', 'два', 
    'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про', 'всего', 'них', 'какая',
    'много', 'разве', 'три', 'эту', 'моя', 'впрочем', 'хорошо', 'свою', 'этой', 'перед', 'иногда', 'лучше', 'чуть',
    'том', 'нельзя', 'такой', 'им', 'более', 'всегда', 'конечно', 'всю', 'между'
]

In [17]:
# Load the labelled reviews, indexed by review_id. Missing values in any column are
# replaced with the literal string 'Unknown' so downstream encoders never see NaN.
Xy_train_val = pd.read_csv('data/train.csv', index_col='review_id').fillna('Unknown')

# Features are every column except the last one. The explicit .copy() makes
# X_train_val an independent frame: later `.loc[...] = ...` assignments on it would
# otherwise raise SettingWithCopyWarning, because an iloc column slice is flagged as
# a possible view of Xy_train_val.
X_train_val = Xy_train_val.iloc[:, :-1].copy()

# The last column is the target; only its first character is kept, converted to int.
y_train_val = Xy_train_val.iloc[:, -1].apply(lambda x: int(x[0]))

# 80/20 hold-out split with a fixed seed.
# NOTE(review): no cell visible in this notebook uses X_train / X_val / y_train / y_val.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Unlabelled test reviews, loaded and NaN-filled the same way as the training data.
X_test = pd.read_csv('data/test.csv', index_col='review_id').fillna('Unknown')

# Displayed as the cell output: quick sanity check of the loaded shapes.
X_train_val.shape, y_train_val.shape, X_test.shape

((50876, 10), (50876,), (50651, 10))

In [4]:
# Print a summary of the feature frame: column names, non-null counts and dtypes.
X_train_val.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50876 entries, 0 to 192828
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   city                  50876 non-null  object
 1   position              50876 non-null  object
 2   positive              50876 non-null  object
 3   negative              50876 non-null  object
 4   salary_rating         50876 non-null  int64 
 5   team_rating           50876 non-null  int64 
 6   managment_rating      50876 non-null  int64 
 7   career_rating         50876 non-null  int64 
 8   workplace_rating      50876 non-null  int64 
 9   rest_recovery_rating  50876 non-null  int64 
dtypes: int64(6), object(4)
memory usage: 4.3+ MB


In [5]:
# Merge rare categories into a single 'Others' bucket. Frequencies are taken from
# X_train_val only; labels seen fewer than 3 times there are replaced in both the
# training frame and the test frame.
city_counts = X_train_val['city'].value_counts()
rare_cities = city_counts[city_counts < 3].index
for frame in (X_train_val, X_test):
    frame.loc[frame['city'].isin(rare_cities), 'city'] = 'Others'

# Same treatment for the job position column.
position_counts = X_train_val['position'].value_counts()
rare_positions = position_counts[position_counts < 3].index
for frame in (X_train_val, X_test):
    frame.loc[frame['position'].isin(rare_positions), 'position'] = 'Others'

In [6]:
# Xy_train_val.target.str.split(',', expand=True)

# Experiment with word 1- to 3-gram TF-IDF features and strong regularisation.
# Column positions follow the X_train_val layout:
#   0, 1 -> city, position; 4..9 -> the six *_rating columns (all one-hot encoded)
#   2    -> 'positive' review text; 3 -> 'negative' review text
onehot_columns = [0, 1, 4, 5, 6, 7, 8, 9]
ngram_tfidf_options = dict(ngram_range=(1, 3), max_df=0.999, min_df=0.001)

feature_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), onehot_columns),
    ('tfidf1', TfidfVectorizer(**ngram_tfidf_options), 2),
    ('tfidf2', TfidfVectorizer(**ngram_tfidf_options), 3),
])
classifier = LogisticRegressionCV(Cs=[0.0001, 0.001, 0.01], max_iter=200, n_jobs=-1, random_state=42)

pipeline = Pipeline([
    ('transforms', feature_transformer),
    ('lr', classifier),
])
# Last expression of the cell: fits on the full labelled set and displays the pipeline.
pipeline.fit(X_train_val, y_train_val)

In [7]:
def _unigram_tfidf():
    """Build a fresh default-ngram TF-IDF vectorizer with the 1%-99% document-frequency cut-offs."""
    return TfidfVectorizer(max_df=0.99, min_df=0.01)


# One-hot encode city, position and the six rating columns (positions 0, 1, 4..9);
# vectorize the 'positive' (2) and 'negative' (3) review texts separately.
feature_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), [0, 1, 4, 5, 6, 7, 8, 9]),
    ('tfidf1', _unigram_tfidf(), 2),
    ('tfidf2', _unigram_tfidf(), 3),
])

# Cross-validated logistic regression choosing C from {0.1, 1, 10}.
classifier = LogisticRegressionCV(Cs=[0.1, 1, 10], max_iter=200, n_jobs=-1, random_state=42)

pipeline = Pipeline([
    ('transforms', feature_transformer),
    ('lr', classifier),
])

# Pipeline.fit returns the pipeline itself, so model_lr and pipeline are the same object.
model_lr = pipeline.fit(X_train_val, y_train_val)

Pipeline(steps=[('transforms',
                 ColumnTransformer(transformers=[('ohe',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  [0, 1, 4, 5, 6, 7, 8, 9]),
                                                 ('tfidf1',
                                                  TfidfVectorizer(max_df=0.999,
                                                                  min_df=0.001,
                                                                  ngram_range=(1,
                                                                               3)),
                                                  2),
                                                 ('tfidf2',
                                                  TfidfVectorizer(max_df=0.999,
                                                                  min_df=0.001,
                                                                  ngram_range=(1,
   

In [8]:
import joblib

# Serialise the whole pipeline (feature transforms + classifier) to a file in the
# working directory; joblib.dump returns the list of written file names, which is
# what the cell displays.
model_path = 'model_12022022'
joblib.dump(pipeline, model_path)

['model_12022022']

In [9]:
# Reload the pipeline saved by the joblib.dump cell. The original loaded
# 'model_12022021', which does not match the file name written by the dump
# ('model_12022022'), so predictions were not guaranteed to come from the model
# trained in this notebook.
# NOTE: joblib.load unpickles the file and can execute arbitrary code — only load
# artifacts you produced yourself.
model = joblib.load('model_12022022')

# Build the submission: one row per test review with its predicted class.
submission = pd.DataFrame({
    'review_id': X_test.index,
    'target': model.predict(X_test).flatten()
})
submission.to_csv('answers.csv', index=False)

In [10]:
# Rebinds `model` to a pipeline loaded from a different file than the one dumped
# earlier in this notebook ('model_12022022').
# NOTE(review): no visible cell creates 'model_11022022 2' (the name looks like a manual
# copy of an earlier run) — confirm this is intentional; it breaks Restart & Run All
# unless the file is already present.
# joblib.load unpickles the file, so it must only be used on trusted artifacts.
model = joblib.load('model_11022022 2')

In [14]:
# Show the regularisation strength(s) C chosen by the cross-validated logistic
# regression step of the loaded pipeline.
model.named_steps['lr'].C_

array([1., 1., 1., 1., 1., 1., 1., 1., 1.])