In [7]:
import pandas as pd
import numpy as np1
import re
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier, Pool
import warnings
warnings.filterwarnings("ignore")



In [8]:
def preprocess_text(text: str):
    text = re.sub("\‚Ç¨+", " –µ–≤—Ä–æ ", text)
    text = re.sub("\@\w+", " <tag> ", text)
    text = re.sub("\+\d+", " <phone_number> ", text)
    text = re.sub("\[.*?\]\(.*?\)", "", text)
    text = re.sub("[\*\!\.\,]+", " ", text)
    text = re.sub("\d+", " 1 ", text)
    text = re.sub("\s+", " ", text)
    
    return text

In [9]:
raw_df = pd.read_csv('labeled_data_corpus.csv')
train_df = raw_df.loc[raw_df['subset'] == 'train']
test_df = raw_df.loc[raw_df['subset'] == 'test']
train_df['text'] = train_df['msg'].apply(preprocess_text)
test_df['text'] = test_df['msg'].apply(preprocess_text)

In [10]:
train_df['msg'].apply(preprocess_text).sample(n=20).head(20)

772     –≤–∞–∂–Ω–æ: –µ—Å–ª–∏ —Ç—ã –Ω–µ –±–æ—Ç –∏ –Ω–µ —Å–ø–∞–º–µ—Ä –ø—Ä–æ–π–¥–∏ –ø—Ä–æ–≤–µ...
1011    –≤–∞–∂–Ω–æ: –µ—Å–ª–∏ —Ç—ã –Ω–µ –±–æ—Ç –∏ –Ω–µ —Å–ø–∞–º–µ—Ä –ø—Ä–æ–π–¥–∏ –ø—Ä–æ–≤–µ...
3182     hi my name is ekaterina me my husband and my ...
5133                              you were banned (spam) 
3587    –∏—â—É 1 –±–µ–¥—Ä—É–º –∫–≤–∞—Ä—Ç–∏—Ä—É –≤ –ø–∞—Ñ–æ—Å–µ –¥–æ 1 –µ–≤—Ä–æ –º–æ–∂–Ω–æ...
1518    –∑–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ –∏—â—É –∫–≤–∞—Ä—Ç–∏—Ä—É –≤ –ø–∞—Ñ–æ—Å–µ —Å–µ–º—å—è –∏–∑ –¥–≤...
2849    #–∞—Ä–µ–Ω–¥–∞ –∫–æ—Ä–∞–ª –±—ç–π/coral bay (–ø–∞—Ñ–æ—Å) 1 –µ–≤—Ä–æ 1 —Å...
3175     –ø—Ä–∏–≤–µ—Ç —ç—Ç–æ –±–µ—Å–ø–ª–∞—Ç–Ω–∞—è –≥—Ä—É–ø–ø–∞ –ø–æ –∞—Ä–µ–Ω–¥–µ –∏ –ø—Ä–æ–¥...
811     –¥–æ–±—Ä—ã–π –¥–µ–Ω—å –∏—â—É –æ–¥–Ω–æ–∫–æ–º–Ω–∞—Ç–Ω—É—é –∫–≤–∞—Ä—Ç–∏—Ä—É –≤ –ª–∞—Ä–Ω–∞...
2297     –∞—Ä–µ–Ω–¥–∞ | 1 -—Å–ø –∫–≤–∞—Ä—Ç–∏—Ä–∞ | 1 –º 1 | agios tycho...
1045    –∞—Ä–µ–Ω–¥–∞ 1 -—Ö –∫–æ–º–Ω–∞—Ç–Ω–æ–π –∫–≤–∞—Ä—Ç–∏—Ä—ã –ª–∏–º–∞—Å—Å–æ–ª —Ä–∞–π–æ–Ω ...
4207    –∏—â—É 1 —Å–ø –∫–≤–∞—Ä—Ç–∏—Ä—É –¥–ª—è –¥–æ–ª–≥–æ—Å—Ä

In [11]:
feature_names = ['text']
train_pool = Pool(
    train_df[feature_names], 
    train_df['label'], 
    text_features=["text"], 
    feature_names=feature_names
)

val_pool = Pool(
    test_df[feature_names], 
    test_df['label'], 
    text_features=["text"], 
    feature_names=feature_names
)


In [12]:
args = {
    "iterations" : 1000,
    "learning_rate" : 0.01,
    "loss_function" : 'Logloss',
    "eval_metric" : "F1",
    "verbose" : 50,
    "random_seed" : 42,
}
model = CatBoostClassifier(**args)

model.fit(train_pool, eval_set=val_pool)
preds_class = model.predict(val_pool, prediction_type='Class')
print("Final F1 Score", f1_score(test_df['label'], preds_class))

0:	learn: 0.7735164	test: 0.8054146	best: 0.8054146 (0)	total: 309ms	remaining: 5m 9s
50:	learn: 0.7960985	test: 0.8121442	best: 0.8183556 (4)	total: 8.26s	remaining: 2m 33s
100:	learn: 0.8101266	test: 0.8199234	best: 0.8199234 (74)	total: 14.3s	remaining: 2m 6s
150:	learn: 0.8207283	test: 0.8288973	best: 0.8288973 (143)	total: 20.9s	remaining: 1m 57s
200:	learn: 0.8300469	test: 0.8377358	best: 0.8377358 (192)	total: 26.5s	remaining: 1m 45s
250:	learn: 0.8380414	test: 0.8474576	best: 0.8490566 (243)	total: 32.1s	remaining: 1m 35s
300:	learn: 0.8455361	test: 0.8462998	best: 0.8490566 (243)	total: 38.9s	remaining: 1m 30s
350:	learn: 0.8522727	test: 0.8500949	best: 0.8500949 (335)	total: 45s	remaining: 1m 23s
400:	learn: 0.8599905	test: 0.8571429	best: 0.8571429 (394)	total: 51.4s	remaining: 1m 16s
450:	learn: 0.8613485	test: 0.8517110	best: 0.8571429 (394)	total: 57.5s	remaining: 1m 10s
500:	learn: 0.8673324	test: 0.8538899	best: 0.8571429 (394)	total: 1m 3s	remaining: 1m 3s
550:	learn: 

In [13]:
test_df['prediction'] = preds_class
test_df.loc[test_df['prediction'] != test_df['label'], ['text', 'label']].sample(20)

Unnamed: 0,text,label
5406,—Å–¥–∞–º –ø–æ—Ä—è–¥–æ—á–Ω–æ–π –¥–µ–≤—É—à–∫–µ –∫–æ–º–Ω–∞—Ç—É –≤ –¥–æ–º–µ –≤ —Ä–∞–π–æ–Ω...,1
6119,–∞—Ä–µ–Ω–¥–∞ –ø–µ–π—è —Ä–∞–π–æ–Ω –ø–∞—Ñ–æ—Å–∞ –≤ –¥–æ–ª–≥–æ—Å—Ä–æ—á–Ω—É—é –∞—Ä–µ–Ω–¥—É...,0
5929,- 1 adults 1 child 1 years old without animals...,0
5994,__ tommy‚Äôs estate agency 1 / 1 __ Ô∏è–∞—Ä–µ–Ω–¥–∞‚Ä¢–ø–æ—Ç–∞...,0
6231,agents|arto estates ltd | reg 1 | lic 1 sale: ...,0
5423,hi we_re a married couple with no children no ...,1
6239,office for rent office building rental agios n...,0
6535,#–∞—Ä–µ–Ω–¥–∞ –ø–µ–π—è —Ä–∞–π–æ–Ω –ø–∞—Ñ–æ—Å–∞ –≤ –¥–æ–ª–≥–æ—Å—Ä–æ—á–Ω—É—é –∞—Ä–µ–Ω–¥...,0
5620,–ª–∏–º–∞—Å–æ–ª –∞—Ä–µ–Ω–¥–∞ –ø–µ–Ω—Ç—Ö–∞—É—Å–∞ 1 –µ–≤—Ä–æ –≥–æ—Ç–æ–≤–∞ –∫ –∑–∞—Å–µ–ª...,1
6005,#–∞—Ä–µ–Ω–¥–∞–ø–∞—Ñ–æ—Å #–∞—Ä–µ–Ω–¥–∞ —Å—Ç—É–¥–∏–∏ –∏ 1 -—Ö —Å–ø–∞–ª—å–Ω–æ–π –∫–≤...,0
