In [1]:
!pip install pandas scikit-learn tqdm ipywidgets sentence-transformers xgboost catboost lightgbm>pip.log

In [None]:
import numpy as np
import pandas as pd
import joblib
import locale
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier

from sentence_transformers import SentenceTransformer

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

locale.getpreferredencoding = lambda: "UTF-8"

tqdm.pandas()


In [None]:
random_state = 121

In [None]:
from google.colab import drive
drive.mount('/content/drive', True)

DATA_DIR = '/content/drive/MyDrive/aeroclub/track1/data'
SUBM_DIR = '/content/drive/MyDrive/aeroclub/track1/submission'
MODELS_DIR = '/content/drive/MyDrive/aeroclub/track1/models'

# !ls {DATA_DIR}

Mounted at /content/drive


In [None]:
def mark_label_not_request(df):
    '''Функция отметки автоматически генерируемых сообщений
    
    Параметры:
    ----------
    df_train : DataFrame
        датафрейм с исходными сообщениями
    
    Возращаемые значения:
    ---------------------
    df_train : DataFrame
        датафрейм с заполненным столбцом 0\1 в label

    '''
    df.loc[
        df['title'].str.contains(
            '^Электронный билет:', na=False,
        ), 'label'
    ] = 0

    df.loc[
        (
            df['title'].str.contains('^Ваучер к заказу', na=False,)
        ) & (
            df['text'].str.contains('^Уважаемый \w+! Благодарим Вас за обращение в компанию Аэроклуб', na=False,)
        ), 'label'
    ] = 0

    df.loc[
        df['title'].str.contains(
            '^Подтверждение бронирования №', na=False,
        ), 'label'
    ] = 0

    df.loc[
        (
            df['title'].str.contains('^i’way для', na=False,)
        ), 'label'
    ] = 0

    df.loc[
        (
            df['title'].str.contains('^Оповещение ASIM по заказу', na=False,)
        ), 'label'
    ] = 0

    df.loc[
        (
            df['title'].str.contains('^Поставщик внес изменения в стоимость брон', na=False,)
        ), 'label'
    ] = 0

    df.loc[
        df['title'].str.contains(
            '^Please purchase tickets according to the booking', na=False,
        ), 'label'
    ] = 1

    df.loc[
        (
            df['title'].str.contains('^Voucher for order', na=False,)
        ), 'label'
    ] = 0

    df.loc[
        df['title'].str.contains(
            '^Прошу оформить билеты по бронированию', na=False,
        ), 'label'
    ] = 1

    df.loc[
        (
            df['title'].str.contains('^\[ Aeroclub.*Сообщение от Аэроклуб АО', na=False,)
        ), 'label'
    ] = 0

    df.loc[
        (
            df['title'].str.contains('^\[ Aeroclub', na=False,)
        ) & (
            df['text'].str.contains('оформил услугу', na=False,)
        ), 'label'
    ] = 0

    df.loc[
        (
            df['title'].str.contains('^\[ Aeroclub', na=False,)
        ) & (
            df['text'].str.contains('пытался забронировать услугу', na=False,)
        ), 'label'
    ] = 0

    df.loc[
        (
            df['title'].str.contains('^\[ Aeroclub', na=False,)
        ) & (
            df['text'].str.contains('просит оформить услугу', na=False,)
        ), 'label'
    ] = 1

    df.loc[
        (
            df['title'].str.contains('^\[ Aeroclub', na=False,)
        ) & (
            df['text'].str.contains('пытался произвести изменение услуги', na=False,)
        ), 'label'
    ] = 0

    df.loc[
        (
            df['title'].str.contains('^\[ Aeroclub', na=False,)
        ) & (
            df['text'].str.contains('пытался произвести отмена услуги', na=False,)
        ), 'label'
    ] = 0
    
    df.loc[
        (
            df['title'].str.contains('^\[ Aeroclub', na=False,)
        ) & (
            df['text'].str.contains('пытался произвести бронирование услуги', na=False,)
        ), 'label'
    ] = 0
    
    df.loc[
        df['text'].str.contains(
            '^Здравствуйте, aeroclubXML', na=False,
        ), 'label'
    ] = 0
    return df


In [None]:
# df_score = pd.DataFrame([], columns=['model', 'vectorizer', 'preprocessing', 'threshold', 'f1_macro', 'f1_weighted'])
# df_score.to_csv(f'{SUBM_DIR}/score.csv', index=False)

df_score = pd.read_csv(f'{SUBM_DIR}/score.csv')

df_score.sort_values(by='f1_macro', ascending=False)


Unnamed: 0,model,vectorizer,preprocessing,threshold,f1_macro,f1_weighted
48,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
47,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.8,0.735023,0.733562
45,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
44,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.735023,0.733562
52,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
...,...,...,...,...,...,...
35,catb,minilm,title_payload_lemmas,0.7,0.378789,0.395296
71,catb,distilbert,title_payload_lemmas,0.8,0.344692,0.362616
73,catb_embs,distilbert,title_payload_lemmas,0.8,0.344692,0.362616
39,catb_embs,minilm,title_payload_lemmas,0.8,0.344692,0.362616


### Raw Data

In [None]:
usecols = [
    'id',
    'title',
    'text',
    'payload',
    'title_lemmas',
    'payload_lemmas'
]

In [None]:
df_raw = pd.read_csv(f'{DATA_DIR}/traind_data_preprocessed.csv', usecols=usecols)
df_raw.head(2)

Unnamed: 0,id,title,text,payload,title_lemmas,payload_lemmas
0,0,[ Aeroclub NAME NAME NAME PASSPORT/409285. Соо...,Здравствуйте! NAME Командировка PASSPORT/4092...,здравствуйте! name командировка passport/4092...,[ aeroclub name name name passport/409285 . со...,здравствуйте ! name командировка passport/40...
1,1,[ Aeroclub NAME NAME NAME PASSPORT/409285. Соо...,Здравствуйте! NAME Командировка PASSPORT/4092...,здравствуйте! name командировка passport/4092...,[ aeroclub name name name passport/409285 . со...,здравствуйте ! name командировка passport/40...


In [None]:
df_raw['label'] = np.nan
df_raw = mark_label_not_request(df_raw)
df_raw['label'].value_counts()

0.0    1519
1.0     511
Name: label, dtype: int64

### Train Data

In [None]:
usecols = [
    'id',
    'title',
    'text',
    'payload',
    'title_lemmas',
    'payload_lemmas',
    'label'
]

In [None]:
df_labeled = pd.read_csv(f'{DATA_DIR}/labeled_data_preprocessed_v2.csv', usecols=usecols)
df_labeled.head(2)

Unnamed: 0,id,label,title,text,payload,title_lemmas,payload_lemmas
0,27,0,[ Aeroclub NAME NAME NAME PASSPORT. Выписаны б...,"Здравствуйте! Труб NAME NAME ( "" КИВИ "" ) офо...","здравствуйте! труб name name ( "" киви "" ) офо...",[ aeroclub name name name passport . выписать ...,"здравствуйте ! труба name name ( "" киви "" ) ..."
1,139,1,[ Aeroclub NAME NAME NAME PASSPORT. Оформление...,"Good day! Игнатьева NAME NAME ( "" КИВИ БАНК ""...","good day! игнатьева name name ( "" киви банк ""...",[ aeroclub name name name passport . оформлени...,"good day ! игнатьев name name ( "" киви банк ..."


In [None]:
df_supervised = pd.read_csv(f'{DATA_DIR}/supervised_data_preprocessed.csv', usecols=usecols)
df_supervised.head(2)

Unnamed: 0,id,label,title,text,payload,title_lemmas,payload_lemmas
0,9,0,Re: RE: NAME NAME,"PASSPORT С уважением/ NAME regards, NAME NAME ...",passport,re : re : name name,passport
1,11,1,RE: Подтверждение бронирования: 09.10. NAME NA...,Добрый день! NAME Подтверждаю! ---------------...,добрый день! name подтверждаю! ---------------...,re : подтверждение бронирование : 09.10 . name...,добрый день ! name подтверждать ! ------------...


In [None]:
df_heuristic = df_raw[df_raw['label'].notna()].copy()
df_heuristic.head(2)

Unnamed: 0,id,title,text,payload,title_lemmas,payload_lemmas,label
1,1,[ Aeroclub NAME NAME NAME PASSPORT/409285. Соо...,Здравствуйте! NAME Командировка PASSPORT/4092...,здравствуйте! name командировка passport/4092...,[ aeroclub name name name passport/409285 . со...,здравствуйте ! name командировка passport/40...,0.0
2,2,[ Aeroclub NAME ] Командировка PASSPORT/339029...,Здравствуйте! NAME Командировка PASSPORT/3390...,здравствуйте! name командировка passport/3390...,[ aeroclub name ] командировка passport/339029...,здравствуйте ! name командировка passport/33...,0.0


In [None]:
df_train = pd.concat((df_supervised, df_labeled, df_heuristic), axis=0)
df_train.shape

(6176, 7)

In [None]:
df_dupl = df_train \
    .loc[:, 'id'] \
    .value_counts() \
    .rename('count') \
    .rename_axis('id') \
    .reset_index() \
    .loc[lambda df: df['count'] > 1, ['id']]

df_train = df_train.drop_duplicates()
df_train = df_train[~df_train['id'].isin(df_dupl['id'].unique())].copy()

df_train.shape

(3356, 7)

In [None]:
df_validate = pd.read_csv(f'{DATA_DIR}/manual_check_data.csv', usecols=['id', 'label'])
df_validate.head(2)

Unnamed: 0,id,label
0,11000,0
1,11001,0


In [None]:
usecols=[
    'id',
    'title',
    'text',
    'payload',
    'title_lemmas',
    'payload_lemmas'
  ]

df_subm = pd.read_csv(f'{DATA_DIR}/test_data_preprocessed.csv', usecols=usecols)
df_subm.head(2)

Unnamed: 0,id,title,text,payload,title_lemmas,payload_lemmas
0,11000,"RE: 555-PASSPORT, NAME NAME","NAME NAME. Для дополнительных консультаций, п...","name name. для дополнительных консультаций, п...","re : 555-passport , name name",name name . для дополнительный консультация ...
1,11001,Анапа,# # # write your NAME NAME NAME line # # #:: 5...,# # # write your name name name line # # #:: 5...,анапа,# # # write your name name name line # # # : :...


In [None]:
vectorizer_name = 'distilbert'

# vectorizer = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

vectorizer = SentenceTransformer('multi-qa-distilbert-cos-v1')

vectorizer

Downloading (…)6be28/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)ed8ac6be28/README.md:   0%|          | 0.00/9.46k [00:00<?, ?B/s]

Downloading (…)8ac6be28/config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e28/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)6be28/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)be28/train_script.py:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading (…)ed8ac6be28/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)ac6be28/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [None]:
preprocessing_name = 'title_payload'

docs = df_raw['title'].fillna('')
title_embs = vectorizer.encode(docs, device='cuda', show_progress_bar=True)

docs = df_raw['payload'].fillna('')
text_embs = vectorizer.encode(docs, device='cuda', show_progress_bar=True)

embs = np.hstack((title_embs, text_embs))
embs


RuntimeError: ignored

In [None]:
joblib.dump(embs, f'{MODELS_DIR}/embs_{preprocessing_name}_{vectorizer_name}.pkl')

['/content/drive/MyDrive/aeroclub/track1/models/embs_title_payload_lemmas_distilbert.pkl']

In [None]:
# embs = joblib.load(f'{MODELS_DIR}/embs_{preprocessing_name}_{vectorizer_name}.pkl')

df_embs = pd.DataFrame(embs, index=df_raw.index).reset_index().rename(columns={'index':'id'})
df_embs

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,0,-0.000660,0.010459,-0.024979,0.044324,0.045259,0.046740,0.027080,-0.019950,0.040545,...,-0.008098,0.002827,0.048593,-0.005414,-0.013393,-0.041510,-0.006512,0.001261,0.018385,0.019423
1,1,-0.023946,0.043490,-0.034092,0.052313,0.047276,0.054504,0.048971,-0.023558,0.028155,...,-0.001482,-0.004101,0.062340,-0.015668,-0.022232,-0.052404,-0.008568,0.004709,0.029622,-0.000775
2,2,-0.016393,0.009827,-0.021851,0.067195,0.042506,0.065299,0.019617,-0.029777,0.025360,...,-0.005994,-0.012913,0.049828,-0.022795,-0.016050,-0.061005,-0.007540,0.004093,0.027477,-0.005406
3,3,0.023516,-0.021061,-0.017825,0.057704,0.048553,0.047859,0.018190,-0.013023,0.026128,...,0.031579,0.010917,0.029122,-0.028626,-0.019451,-0.042499,-0.017027,-0.008394,0.016380,0.011629
4,4,-0.008371,0.000508,-0.032135,0.072341,0.041259,0.030425,-0.023947,-0.005447,0.028713,...,0.002766,-0.005955,0.023868,-0.005443,-0.022547,-0.029730,-0.006018,0.017183,-0.019345,0.010175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14029,14029,0.052169,0.011750,0.034000,-0.047898,0.000551,0.081423,-0.046695,0.027360,0.003819,...,0.002651,-0.000558,0.008088,-0.039416,0.044329,-0.008072,0.016731,0.002279,-0.030133,0.016689
14030,14030,0.033787,0.053115,0.043550,-0.018385,-0.012992,0.049283,-0.031594,0.058446,0.041454,...,0.016041,0.010653,0.007400,-0.049316,0.053615,0.015312,0.017015,0.011115,-0.002205,0.064096
14031,14031,-0.010684,0.009177,-0.027292,-0.016646,0.058095,0.045803,0.018236,0.037594,-0.071650,...,0.036025,-0.061648,0.020888,-0.038054,0.014169,-0.018636,-0.023726,-0.019464,0.023394,0.053365
14032,14032,-0.010684,0.009176,-0.027292,-0.016646,0.058095,0.045803,0.018236,0.037594,-0.071650,...,0.020922,-0.065769,0.028144,-0.051006,-0.002445,0.003706,-0.005280,-0.052211,0.014604,0.040542


In [None]:
subm_docs = df_subm['title'].fillna('')
subm_title_embs = vectorizer.encode(subm_docs, device='cuda', show_progress_bar=True)

subm_docs = df_subm['payload'].fillna('')
subm_text_embs = vectorizer.encode(subm_docs, device='cuda', show_progress_bar=True)

subm_embs = np.hstack((subm_title_embs, subm_text_embs))
subm_embs

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

array([[-0.00407336, -0.01220315,  0.05067679, ...,  0.02323276,
         0.03284496,  0.04215266],
       [-0.03697329,  0.02320122, -0.05575564, ...,  0.00872342,
         0.01617701,  0.03785551],
       [-0.02865267,  0.00691325, -0.06292005, ...,  0.00815995,
         0.04410923,  0.0074641 ],
       ...,
       [-0.00064111,  0.00175678, -0.03590047, ..., -0.00482143,
         0.0435864 ,  0.01513114],
       [ 0.00819043,  0.00453465,  0.00768435, ..., -0.0001066 ,
         0.04533621, -0.01164281],
       [-0.01663622,  0.01340159, -0.00188521, ...,  0.0113745 ,
        -0.00996696, -0.00249654]], dtype=float32)

In [None]:
joblib.dump(subm_embs, f'{MODELS_DIR}/subm_embs_{preprocessing_name}_{vectorizer_name}.pkl')

['/content/drive/MyDrive/aeroclub/track1/models/subm_embs_title_payload_lemmas_distilbert.pkl']

In [None]:
# subm_embs = joblib.load(f'{MODELS_DIR}/subm_embs_{preprocessing_name}_{vectorizer_name}.pkl')

df_subm_embs = pd.DataFrame(subm_embs, index=df_subm.index)
df_subm_embs['id'] = df_subm['id']
df_subm_embs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1527,1528,1529,1530,1531,1532,1533,1534,1535,id
0,-0.004073,-0.012203,0.050677,0.025576,0.059241,0.008467,-0.010143,-0.008856,-0.026572,0.028668,...,-0.061996,0.034201,-0.043009,0.001445,-0.041360,0.035499,0.023233,0.032845,0.042153,11000
1,-0.036973,0.023201,-0.055756,-0.002733,0.013325,0.082250,0.051447,0.030558,-0.046615,0.014341,...,-0.031399,0.020885,-0.026360,0.005862,-0.028339,0.009574,0.008723,0.016177,0.037856,11001
2,-0.028653,0.006913,-0.062920,-0.002585,0.016764,0.105523,0.062984,0.062277,-0.069757,0.010407,...,-0.005695,0.012455,-0.035297,-0.008144,-0.033897,0.008590,0.008160,0.044109,0.007464,11002
3,-0.008553,0.012899,-0.055113,0.005189,0.021667,0.095209,0.038985,0.063014,-0.050502,0.026928,...,0.011266,-0.002303,-0.010164,-0.025666,-0.075283,-0.022537,-0.002051,0.000769,0.030497,11003
4,0.043682,-0.040096,0.007180,-0.039543,0.010707,0.077056,-0.018370,0.013206,-0.029942,0.069438,...,-0.012798,0.002564,0.031288,-0.009479,-0.030530,0.024517,0.006772,0.001391,0.041823,11004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.008090,-0.001957,-0.006487,-0.017166,0.025428,0.025543,0.053870,0.011056,-0.065764,0.013010,...,-0.044741,0.013540,-0.020910,-0.021629,-0.018361,-0.022590,-0.000774,0.035897,0.030763,11995
996,-0.049575,-0.001624,-0.024191,0.003019,0.026374,0.073921,-0.011280,0.017816,-0.077440,0.055900,...,-0.062220,0.015022,-0.043470,-0.007632,-0.033293,0.019948,-0.028322,0.044345,0.024597,11996
997,-0.000641,0.001757,-0.035900,-0.007081,0.023613,0.057346,0.044859,0.043070,-0.005091,0.015902,...,-0.038701,0.001543,-0.011135,0.009779,-0.066958,0.015948,-0.004821,0.043586,0.015131,11997
998,0.008190,0.004535,0.007684,0.017114,0.071184,0.073524,-0.053281,-0.006419,-0.040330,0.041474,...,-0.015691,0.047763,0.006816,-0.036505,-0.068316,-0.022675,-0.000107,0.045336,-0.011643,11998


In [None]:
X_train = df_train[['id']].merge(df_embs, how='inner', on='id').drop(columns='id')
y_train = df_train['label']

(X_train.shape, y_train.shape)

((3356, 1536), (3356,))

In [None]:
X_val = df_subm[['id']].merge(df_subm_embs, how='inner', on='id').drop(columns='id')
y_val = df_validate['label']

X_val.shape

(1000, 1536)

In [None]:
def validate_model_preds(X_val, y_val, model, model_name, mark_labels=False, df_subm=df_subm):
    global df_score

    preds = model.predict(X_val)
    df_subm[f'label_{model_name}'] = preds

    score_macro = f1_score(y_val, preds, average='macro')
    score_weighted = f1_score(y_val, preds, average='weighted')

    row = [model_name, vectorizer_name, preprocessing_name, 0.0, score_macro, score_weighted]
    df_score = pd.concat((df_score, pd.DataFrame([row], columns=df_score.columns)), axis=0)
    df_score = df_score.drop_duplicates()

    df_result= df_subm[['id', 'title', 'text', f'label_{model_name}']] \
        .rename(columns={f'label_{model_name}': 'label'})

    if mark_labels:
        df_result = mark_label_not_request(df_result)

    df_result.to_csv(f'{SUBM_DIR}/{vectorizer_name}_{model_name}_{preprocessing_name}_{score_macro:0.3}.csv', index=False)

    print(classification_report(y_val, df_result['label']))


def validate_model_proba(X_val, y_val, model, model_name, threshold, mark_labels=False, df_subm=df_subm):
    global df_score

    proba = model.predict_proba(X_val)[:,1]
    preds = np.where(proba > threshold, 1, 0)
    df_subm[f'label_{model_name}'] = preds

    score_macro = f1_score(y_val, preds, average='macro')
    score_weighted = f1_score(y_val, preds, average='weighted')

    row = [model_name, vectorizer_name, preprocessing_name, threshold, score_macro, score_weighted]
    df_score = pd.concat((df_score, pd.DataFrame([row], columns=df_score.columns)), axis=0)
    df_score = df_score.drop_duplicates()

    df_result= df_subm[['id', 'title', 'text', f'label_{model_name}']] \
        .rename(columns={f'label_{model_name}': 'label'})

    if mark_labels:
        df_result = mark_label_not_request(df_result)

    df_result.to_csv(f'{SUBM_DIR}/{vectorizer_name}_{model_name}_{preprocessing_name}_{score_macro:0.3}.csv', index=False)

    print(classification_report(y_val, df_result['label']))



In [None]:
class_weights = y_train.value_counts().to_dict()
class_weights

{1.0: 2134, 0.0: 1222}

## SVM - Support Vectors Machine

In [None]:
# train
hinge = SGDClassifier(random_state=random_state, max_iter=2000)
hinge.fit(X_train, y_train)


In [None]:
validate_model_preds(X_val, y_val, hinge, 'hinge')

df_score.sort_values(by='f1_macro', ascending=False).head(20)

              precision    recall  f1-score   support

           0       0.90      0.37      0.52       526
           1       0.58      0.96      0.72       474

    accuracy                           0.65      1000
   macro avg       0.74      0.66      0.62      1000
weighted avg       0.75      0.65      0.62      1000



Unnamed: 0,model,vectorizer,preprocessing,threshold,f1_macro,f1_weighted
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.8,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.735023,0.733562
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.732593,0.733136
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
21,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload,0.5,0.730248,0.728832
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.728192,0.726312
0,ranfor,minilm,title_payload_lemmas,0.7,0.718527,0.717927


### Logistic Regression

In [None]:
logreg = SGDClassifier(loss='log_loss', random_state=random_state)
logreg.fit(X_train, y_train)

In [None]:
validate_model_proba(X_val, y_val, logreg, 'logreg', threshold=0.6)

df_score.sort_values(by='f1_macro', ascending=False).head(20)

              precision    recall  f1-score   support

           0       0.86      0.50      0.63       526
           1       0.62      0.91      0.74       474

    accuracy                           0.69      1000
   macro avg       0.74      0.70      0.68      1000
weighted avg       0.75      0.69      0.68      1000



Unnamed: 0,model,vectorizer,preprocessing,threshold,f1_macro,f1_weighted
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.8,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.732593,0.733136
21,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload,0.5,0.730248,0.728832
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.728192,0.726312
0,ranfor,minilm,title_payload_lemmas,0.7,0.718527,0.717927


### Random Forest

In [None]:
ranfor = RandomForestClassifier(random_state=random_state)
ranfor.fit(X_train, y_train)

In [None]:
validate_model_proba(X_val, y_val, ranfor, 'ranfor', threshold=0.8)

df_score.sort_values(by='f1_macro', ascending=False).head(20)

              precision    recall  f1-score   support

           0       0.65      0.78      0.71       526
           1       0.68      0.53      0.60       474

    accuracy                           0.66      1000
   macro avg       0.67      0.65      0.65      1000
weighted avg       0.67      0.66      0.65      1000



Unnamed: 0,model,vectorizer,preprocessing,threshold,f1_macro,f1_weighted
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.8,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.735023,0.733562
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.732593,0.733136
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
21,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload,0.5,0.730248,0.728832
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.728192,0.726312
0,catb_embs,distilbert,title_payload_lemmas,0.6,0.718993,0.71892


### XGBoost

In [None]:
xgb = XGBClassifier(random_state=random_state)
xgb.fit(X_train, y_train)

In [None]:
validate_model_proba(X_val, y_val, xgb, 'xgb', threshold=0.5)

df_score.sort_values(by='f1_macro', ascending=False).head(20)

              precision    recall  f1-score   support

           0       0.94      0.33      0.49       526
           1       0.57      0.97      0.72       474

    accuracy                           0.64      1000
   macro avg       0.75      0.65      0.60      1000
weighted avg       0.76      0.64      0.60      1000



Unnamed: 0,model,vectorizer,preprocessing,threshold,f1_macro,f1_weighted
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.8,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.735023,0.733562
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.732593,0.733136
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
21,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload,0.5,0.730248,0.728832
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.728192,0.726312
0,catb_embs,distilbert,title_payload_lemmas,0.6,0.718993,0.71892


### CatBoost

In [None]:
catb = CatBoostClassifier(random_state=random_state, task_type='GPU', verbose=False)
catb.fit(X_train, y_train, eval_set=(X_val, y_val))


<catboost.core.CatBoostClassifier at 0x7fb075501030>

In [None]:
validate_model_proba(X_val, y_val, catb, 'catb', threshold=0.8)

df_score.sort_values(by='f1_macro', ascending=False).head(20)

              precision    recall  f1-score   support

           0       0.53      1.00      0.69       526
           1       0.00      0.00      0.00       474

    accuracy                           0.53      1000
   macro avg       0.26      0.50      0.34      1000
weighted avg       0.28      0.53      0.36      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,model,vectorizer,preprocessing,threshold,f1_macro,f1_weighted
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.8,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.732593,0.733136
21,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload,0.5,0.730248,0.728832
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.728192,0.726312
0,catb_embs,distilbert,title_payload_lemmas,0.6,0.718993,0.71892


### CatBoost with Embeddings

In [None]:
X_train_emb = X_train.apply(list, axis=1).rename('embs').to_frame()
X_val_emb = X_val.apply(list, axis=1).rename('embs').to_frame()

In [None]:
catb_embs = CatBoostClassifier(random_state=random_state, embedding_features=['embs'], task_type='GPU', verbose=False)
catb_embs.fit(X_train_emb, y_train, eval_set=(X_val_emb, y_val))


<catboost.core.CatBoostClassifier at 0x7fb075501ab0>

In [None]:
validate_model_proba(X_val_emb, y_val, catb_embs, 'catb_embs', threshold=0.8)

df_score.sort_values(by='f1_macro', ascending=False).head(20)

              precision    recall  f1-score   support

           0       0.53      1.00      0.69       526
           1       0.00      0.00      0.00       474

    accuracy                           0.53      1000
   macro avg       0.26      0.50      0.34      1000
weighted avg       0.28      0.53      0.36      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,model,vectorizer,preprocessing,threshold,f1_macro,f1_weighted
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.8,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.735023,0.733562
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.732593,0.733136
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
21,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload,0.5,0.730248,0.728832
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.728192,0.726312
0,catb_embs,distilbert,title_payload_lemmas,0.6,0.718993,0.71892


### LightGBM

In [None]:
lgbm = LGBMClassifier(random_state=random_state)
lgbm.fit(X_train, y_train)

In [None]:
validate_model_proba(X_val, y_val, lgbm, 'lgbm', threshold=0.8)

df_score.sort_values(by='f1_macro', ascending=False).head(20)


              precision    recall  f1-score   support

           0       0.90      0.49      0.64       526
           1       0.62      0.94      0.75       474

    accuracy                           0.70      1000
   macro avg       0.76      0.72      0.69      1000
weighted avg       0.77      0.70      0.69      1000



Unnamed: 0,model,vectorizer,preprocessing,threshold,f1_macro,f1_weighted
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.8,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.735023,0.733562
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.732593,0.733136
21,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload,0.5,0.730248,0.728832
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.728192,0.726312
0,catb_embs,distilbert,title_payload_lemmas,0.6,0.718993,0.71892


### Ensemble

In [None]:
mean_columns = [
    'label_hinge',
    'label_logreg',
    'label_ranfor',
    'label_xgb',
    'label_lgbm',
    'label_catb',
    'label_catb_embs'
]
df_results = df_subm[mean_columns]

df_results.corr()

Unnamed: 0,label_hinge,label_logreg,label_ranfor,label_xgb,label_lgbm,label_catb,label_catb_embs
label_hinge,1.0,0.777067,0.38547,0.671725,0.648122,,
label_logreg,0.777067,1.0,0.470666,0.609944,0.686662,,
label_ranfor,0.38547,0.470666,1.0,0.362413,0.4776,,
label_xgb,0.671725,0.609944,0.362413,1.0,0.711874,,
label_lgbm,0.648122,0.686662,0.4776,0.711874,1.0,,
label_catb,,,,,,,
label_catb_embs,,,,,,,


In [None]:
threshold_ensemble = 0.7


mean_col = df_results.mean(axis=1)
mean_col[mean_col < threshold_ensemble] = 0
mean_col[mean_col >= threshold_ensemble] = 1
mean_col = mean_col.astype(int)

df_subm['label_ensebmle'] = mean_col

score_macro = f1_score(df_validate['label'], df_subm['label_ensebmle'], average='macro')
score_weighted = f1_score(df_validate['label'], df_subm['label_ensebmle'], average='weighted')

model_name = ' '.join([col[6:] for col in mean_columns])
row = [model_name, vectorizer_name, preprocessing_name, threshold_ensemble, score_macro, score_weighted]
df_score = pd.concat((df_score, pd.DataFrame([row], columns=df_score.columns)), axis=0)
df_score = df_score.drop_duplicates()

print(classification_report(df_validate['label'], df_subm['label_ensebmle']))

df_score.sort_values(by='f1_macro', ascending=False).head(20)

              precision    recall  f1-score   support

           0       0.64      0.79      0.71       526
           1       0.69      0.52      0.59       474

    accuracy                           0.66      1000
   macro avg       0.67      0.65      0.65      1000
weighted avg       0.66      0.66      0.65      1000



Unnamed: 0,model,vectorizer,preprocessing,threshold,f1_macro,f1_weighted
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.8,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.735023,0.733562
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.732593,0.733136
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
21,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload,0.5,0.730248,0.728832
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.728192,0.726312
0,catb_embs,distilbert,title_payload_lemmas,0.6,0.718993,0.71892


### Score

In [None]:
df_score.to_csv(f'{SUBM_DIR}/score.csv', index=False)

df_score.sort_values(by='f1_macro', ascending=False).head(20)

Unnamed: 0,model,vectorizer,preprocessing,threshold,f1_macro,f1_weighted
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb_embs,minilm,title_payload_lemmas,0.8,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.735023,0.733562
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.735023,0.733562
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.7,0.732593,0.733136
0,hinge logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.8,0.732593,0.733136
21,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload,0.5,0.730248,0.728832
0,logreg ranfor xgb lgbm catb catb_embs,minilm,title_payload_lemmas,0.6,0.728192,0.726312
0,catb_embs,distilbert,title_payload_lemmas,0.6,0.718993,0.71892


In [None]:
df_subm[['id', 'title', 'text', 'label_ensebmle']] \
    .rename(columns={'label_ensebmle':'label'}) \
    .to_csv(f'{SUBM_DIR}/subm_ensemble_minilm_title_payload_lemmas.csv', index=False)

In [None]:
df_subm.columns

Index(['id', 'title', 'text', 'payload', 'title_lemmas', 'payload_lemmas',
       'label_hinge', 'label_logreg', 'label_ranfor', 'label_xgb',
       'label_catb', 'label_catb_embs', 'label_lgbm'],
      dtype='object')

In [None]:
mean_col = df_submission[['label_hinge', 'label_logreg', 'label_ranfor', 'label_xgb', 'label_catb', 'label_lgbm']].mean(axis=1)
mean_col[mean_col < 0.5] = 0
mean_col[mean_col > 0.5] = 1
mean_col = mean_col.astype(np.int)
mean_col

df_submission['label_mean_all'] = mean_col

# df_submission[['id', 'title', 'text', 'label_mean_all']] \
#     .rename(columns={'label_mean_all': 'label'}) \
#     .to_csv(f'{SUBM_DIR}/spacy_mean_all.csv', index=False)

print(classification_report(df_validate['label'], df_submission['label_mean_all']))

#               precision    recall  f1-score   support

#            0       0.82      0.40      0.54       526
#            1       0.58      0.91      0.70       474

#     accuracy                           0.64      1000
#    macro avg       0.70      0.65      0.62      1000
# weighted avg       0.71      0.64      0.62      1000

              precision    recall  f1-score   support

           0       0.93      0.44      0.60       526
           1       0.61      0.96      0.75       474

    accuracy                           0.69      1000
   macro avg       0.77      0.70      0.67      1000
weighted avg       0.78      0.69      0.67      1000



Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  mean_col = mean_col.astype(np.int)
