In [1]:
from __future__ import division
from html.parser import HTMLParser
import re
import base64
import csv
import gzip
import zlib
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
%matplotlib inline
import matplotlib.pyplot as plt
from base64 import b64decode
import warnings
warnings.filterwarnings('ignore')

In [2]:
def to_utf8(text):
    if isinstance(text, str): text = text.encode('utf8')
    return text

def convert2unicode(f):
    def tmp(text):
        if not isinstance(text, str): text = text.decode('utf8')
        return f(text)
    return tmp

def convert2lower(f):
    def tmp(text):        
        return f(text.lower())
    return tmp

In [3]:
def html2text_bs(raw_html):
    from bs4 import BeautifulSoup
    """
    Тут производится извлечения из html текста
    """
    soup = BeautifulSoup(raw_html, "html.parser")
    [s.extract() for s in soup(['script', 'style'])]
    return soup.get_text()

def html2text_bs_visible(raw_html):
    from bs4 import BeautifulSoup
    """
    Тут производится извлечения из html текста, который видим пользователю
    """
    soup = BeautifulSoup(raw_html, "html.parser")    
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    return soup.get_text()

def html2text_boilerpipe(raw_html):
    import boilerpipe
    """
    еще одна библиотека очень хорошо извлекающая именно видимый пользователю текст,
    но она завязана на java
    """
    pass

In [4]:
@convert2lower
@convert2unicode
def easy_tokenizer(text):
    word = ""
    for symbol in text:
        if symbol.isalnum(): word += symbol
        elif word:
            yield word
            word = ""
    if word: yield word

PYMORPHY_CACHE = {}
MORPH = None
#hint, чтобы установка pymorphy2 не была бы обязательной
def get_lemmatizer():
    import pymorphy2
    global MORPH
    if MORPH is None: MORPH = pymorphy2.MorphAnalyzer()
    return MORPH

@convert2lower
@convert2unicode
def pymorphy_tokenizer(text):
    global PYMORPHY_CACHE
    for word in easy_tokenizer(text):
        word_hash = hash(word)
        if word_hash not in PYMORPHY_CACHE:
            PYMORPHY_CACHE[word_hash] = get_lemmatizer().parse(word)[0].normal_form            
        yield PYMORPHY_CACHE[word_hash]

In [5]:
class TextHTMLParser(HTMLParser):
    def __init__(self, tokenizer=easy_tokenizer):
        HTMLParser.__init__(self)
        self.text = []
        self.tokenizer = tokenizer
        self.anchor = []
        self.title = ""
        self.num_anchors = 0
        self.num_links = 0
        self.num_img = 0
        self.num_noscripts = 0
        self.num_scripts = 0
        self.num_meta = 0
        self.num_obj = 0
        self.num_div = 0
        self.num_span = 0
        self.num_styles = 0
        self.num_forms = 0
        self.num_fonts = 0
        self.num_audio = 0
        self.num_canvas = 0
        self.num_videos = 0
        self.num_input = 0
        self.num_noindex = 0
        self.num_output = 0
        self.num_progress = 0
        self.num_datalist = 0
        self.num_embed = 0
        self.num_articles = 0
        self.is_title = False
        self.is_anchor = False

    def handle_data(self, data):
        text = data.strip()
        if len(text) > 0:
            text = re.sub('[ \t\r\n]+', ' ', text)
            if self.is_anchor == True:
                self.anchor.append(text + ' ')
            if self.is_title == True:
                self.title = text
            if (not self.is_title) and (not self.is_anchor):
                self.text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            self.is_title = True
            return
        if tag == 'a':
            self.is_anchor = True
            self.num_anchors +=1
            return
        if tag == 'script':
            self.num_scripts += 1
        if tag == 'img':
            self.num_img += 1
        if tag == 'link':
            self.num_links += 1
        if tag == 'noscript':
            self.num_noscripts +=1
        if tag == 'meta':
            self.num_meta +=1
        if tag == 'object':
            self.num_obj +=1
        if tag == 'div':
            self.num_div +=1
        if tag == 'span':
            self.num_span +=1
        if tag == 'style':
            self.num_styles +=1
        if tag == 'form':
            self.num_forms +=1
        if tag == 'font':
            self.num_fonts +=1
        if tag == 'audio':
            self.num_audio +=1
        if tag == 'canvas':
            self.num_canvas +=1
        if tag == 'video':
            self.num_videos +=1
        if tag == 'input':
            self.num_input +=1
        if tag == 'noindex':
            self.num_noindex +=1
        if tag == 'output':
            self.num_output +=1
        if tag == 'progress':
            self.num_progress +=1
        if tag == 'datalist':
            self.num_datalist +=1
        if tag == 'embed':
            self.num_embed +=1
        if tag == 'article':
            self.num_articles +=1 
        if tag == 'p':
            self.text.append('\n\n')
        if tag == 'br':
            self.text.append('\n')
            
    def handle_endtag(self, tag):
        if tag == 'title':
            self.is_title = False 
        if tag == 'a':
            self.is_anchor = False

    def get_text(self):
        return ''.join(self.text).strip()
    def get_title(self):
        return self.title
    def get_anchor(self):
        return ''.join(self.anchor).strip()
    def get_num_img(self):
        return self.num_img
    def get_num_scripts(self):
        return self.num_scripts
    def get_num_scripts(self):
        return self.num_noscripts
    def get_num_links(self):
        return self.num_links
    def get_num_anchors(self):
        return self.num_anchors
    def get_num_meta(self):
        return self.num_meta
    def get_num_obj(self):
        return self.num_obj
    def get_num_div(self):
        return self.num_div
    def get_num_span(self):
        return self.num_span
    def get_num_styles(self):
        return self.num_styles
    def get_num_forms(self):
        return self.num_forms
    def get_num_fonts(self):
        return self.num_forms
    def get_num_audio(self):
        return self.num_audio
    def get_num_canvas(self):
        return self.num_canvas
    def get_num_videos(self):
        return self.num_videos
    def get_num_input(self):
        return self.num_input
    def get_num_noindex(self):
        return self.num_noindex
    def get_num_output(self):
        return self.num_output
    def get_num_progress(self):
        return self.num_progress
    def get_num_dalalist(self):
        return self.num_datalist
    def get_num_embed(self):
        return self.num_embed
    def get_num_articles(self):
        return self.num_articles

def html2text_parser(text):
    parser = TextHTMLParser()
    parser.feed(text)
    txt = [word for word in easy_tokenizer(parser.get_text())]
    title = [word for word in easy_tokenizer(parser.get_title())]
    anchor = [word for word in easy_tokenizer(parser.get_anchor())]
    compr_level = len(text) / len(zlib.compress(text.encode('utf-8')))
    return {'text_words_num': len(txt),
            'text_word_avg_len':np.mean(np.array([len(word) for word in txt])),
            'titke_words_num': len(title),
            'title_word_avg_len':np.mean(np.array([len(word) for word in title])),
            'anchor_words_num': len(anchor),
            'anchor_word_avg_len':np.mean(np.array([len(word) for word in anchor])),
            'compression_level': compr_level,
            'num_scripts': parser.get_num_scripts(),
            'num_noscripts': parser.get_num_scripts(),
            'num_img': parser.get_num_img(),
            'num_anchors': parser.get_num_anchors(),
            'num_links': parser.get_num_links(),
            'num_meta': parser.get_num_meta(),
            'num_objects': parser.get_num_obj(),
            'num_span': parser.get_num_div(),
            'num_div': parser.get_num_span(),
            'num_styles': parser.get_num_styles(),
            'num_forms': parser.get_num_forms(),
            'num_fonts': parser.get_num_fonts(),
            'num_audio': parser.get_num_audio(),
            'num_canvas': parser.get_num_canvas(),
            'num_videos': parser.get_num_videos(),
            'num_input': parser.get_num_input(),
            'num_noindex': parser.get_num_noindex(),
            'num_output': parser.get_num_output(),
            'num_progress': parser.get_num_progress(),
            'num_datalist': parser.get_num_dalalist(),
            'num_embed': parser.get_num_embed(),
            'num_articles': parser.get_num_articles()}

def html2anchor_easy_parser(text):
    parser = TextHTMLParser()
    parser.feed(text)
    anchor = [word for word in easy_tokenizer(parser.get_anchor())]
    return anchor

def html2anchor_morf_parser(text):
    parser = TextHTMLParser()
    parser.feed(text)
    anchor = [word for word in pymorphy_tokenizer(parser.get_anchor())]
    return anchor

def html2title_easy_parser(text):
    parser = TextHTMLParser()
    parser.feed(text)
    title = [word for word in easy_tokenizer(parser.get_title())]
    return title

def html2title_morf_parser(text):
    parser = TextHTMLParser()
    parser.feed(text)
    title = [word for word in pymorphy_tokenizer(parser.get_title())]
    return title

# Модель на фичах

In [6]:
def extract_features(file):
    tmp = open(file, "r", encoding="utf-8")
    num_str = -1
    for elem in tmp:
        num_str+=1
    tmp.close()
    X = []
    y = []
    raw_data = open(file, "r", encoding="utf-8")
    raw_data.readline()
    for elem in tqdm(range(num_str)):
        elem = raw_data.readline()
        raw_info = elem.strip().split('\t')
        cur_ans = int(raw_info[1])
        cur_url = raw_info[2]
        len_url = len(cur_url)
        dots_url = cur_url.count('.')
        slash_url = cur_url.count('/')
        html_url = cur_url.count('html')
        cur_txt = (b64decode(raw_info[3]).decode("utf-8", errors="replace"))
        obj_feats = []
        visibility = len(html2text_bs_visible(cur_txt))/len(html2text_bs(cur_txt))
        obj_dct = html2text_parser(cur_txt)
        for key in obj_dct:
            obj_feats.append(obj_dct[key])
        obj_feats.append(visibility)
        obj_feats.append(len_url)
        obj_feats.append(dots_url)
        obj_feats.append(slash_url)
        obj_feats.append(html_url)
        X.append(obj_feats)
        y.append(cur_ans)
    raw_data.close()
    return X, y

In [7]:
def extract_ids(file):
    ids = []
    raw_data = open(file, "r", encoding="utf-8")
    raw_data.readline()
    for elem in raw_data:
        raw_info = elem.strip().split('\t')
        ids.append(raw_info[0])
    raw_data.close()
    return ids

In [8]:
X_train_feats, y_train = extract_features('kaggle_train_data_tab.csv')
X_test_feats, y_test = extract_features('kaggle_test_data_tab.csv')
ids = extract_ids('kaggle_test_data_tab.csv')

100%|██████████| 7044/7044 [07:50<00:00, 14.96it/s]
100%|██████████| 16039/16039 [24:32<00:00, 10.89it/s]  


In [9]:
X_train_feats1 = np.nan_to_num(X_train_feats)
X_test_feats1 = np.nan_to_num(X_test_feats)

In [10]:
y_train1 = np.array(y_train)

In [11]:
X_test_feats1.shape

(16039, 34)

In [13]:
best_params = dict()
best_params['lr'] = -1
best_params['n_est'] = -1
best_params['max_depth'] = -1
best_params['subsample'] = -1
Folds = KFold(n_splits = 5)
lr = [0.01, 0.1,  0.5, 0.75, 1]
n_est = [200, 300, 400, 500]
max_depth = [5, 6, 7, 8, 9]
subsample = [0.5, 0.75, 1]
score = []
res = 0
best_score = 0
print(len(lr)*len(n_est)*len(max_depth)*len(subsample))
for i in range(len(lr)):
    for j in range(len(n_est)):
        for k in range(len(max_depth)):
            for s in range(len(subsample)):
                for train_id , test_id in Folds.split(X_train_feats1):
                    X_tr = X_train_feats1[train_id]
                    X_tst  = X_train_feats1[test_id]
                    y_tr = y_train1[train_id]
                    y_tst = y_train1[test_id]
                    my_clf = XGBClassifier(learning_rate = lr[i],
                                           n_estimators = n_est[j] , 
                                           max_depth = max_depth[k], 
                                           subsample = subsample[s], 
                                           verbosity = 0)
                    my_clf.fit(X_tr, y_tr)
                    res += f1_score(y_pred=my_clf.predict(X_tst), y_true=y_tst)
                res = res/5
                score.append(res)
                print('learning_rate = ', lr[i], 
                                  ', num_est = ', n_est[j], 
                                  ', max_depth = ', max_depth[k],
                                  ', sub', subsample[s],
                                  ', Score = ', res)
                if res > best_score:
                    best_score = res
                    best_params['lr'] = lr[i]
                    best_params['n_est'] = n_est[j]
                    best_params['max_depth'] = max_depth[k]
                    best_params['subsample'] = subsample[s]
                res = 0

300
learning_rate =  0.01 , num_est =  200 , max_depth =  5 , sub 0.5 , Score =  0.9440076018152507
learning_rate =  0.01 , num_est =  200 , max_depth =  5 , sub 0.75 , Score =  0.9426091151144067
learning_rate =  0.01 , num_est =  200 , max_depth =  5 , sub 1 , Score =  0.9395384177628797
learning_rate =  0.01 , num_est =  200 , max_depth =  6 , sub 0.5 , Score =  0.9508703222290121
learning_rate =  0.01 , num_est =  200 , max_depth =  6 , sub 0.75 , Score =  0.9513261724042181
learning_rate =  0.01 , num_est =  200 , max_depth =  6 , sub 1 , Score =  0.9490377110432512
learning_rate =  0.01 , num_est =  200 , max_depth =  7 , sub 0.5 , Score =  0.9546536567044681
learning_rate =  0.01 , num_est =  200 , max_depth =  7 , sub 0.75 , Score =  0.9552867377274525
learning_rate =  0.01 , num_est =  200 , max_depth =  7 , sub 1 , Score =  0.9519160100711641
learning_rate =  0.01 , num_est =  200 , max_depth =  8 , sub 0.5 , Score =  0.9552056875808775
learning_rate =  0.01 , num_est =  200 

learning_rate =  0.1 , num_est =  300 , max_depth =  8 , sub 1 , Score =  0.9658530042838143
learning_rate =  0.1 , num_est =  300 , max_depth =  9 , sub 0.5 , Score =  0.9660788953180619
learning_rate =  0.1 , num_est =  300 , max_depth =  9 , sub 0.75 , Score =  0.9680612927653132
learning_rate =  0.1 , num_est =  300 , max_depth =  9 , sub 1 , Score =  0.9657465750864572
learning_rate =  0.1 , num_est =  400 , max_depth =  5 , sub 0.5 , Score =  0.9669473504221496
learning_rate =  0.1 , num_est =  400 , max_depth =  5 , sub 0.75 , Score =  0.9681747694248936
learning_rate =  0.1 , num_est =  400 , max_depth =  5 , sub 1 , Score =  0.9658420930016345
learning_rate =  0.1 , num_est =  400 , max_depth =  6 , sub 0.5 , Score =  0.9680428183788387
learning_rate =  0.1 , num_est =  400 , max_depth =  6 , sub 0.75 , Score =  0.9677180050159697
learning_rate =  0.1 , num_est =  400 , max_depth =  6 , sub 1 , Score =  0.96749105372553
learning_rate =  0.1 , num_est =  400 , max_depth =  7 , 

learning_rate =  0.5 , num_est =  500 , max_depth =  7 , sub 1 , Score =  0.9643459532631615
learning_rate =  0.5 , num_est =  500 , max_depth =  8 , sub 0.5 , Score =  0.963077290347643
learning_rate =  0.5 , num_est =  500 , max_depth =  8 , sub 0.75 , Score =  0.9663562109633046
learning_rate =  0.5 , num_est =  500 , max_depth =  8 , sub 1 , Score =  0.9664677542271729
learning_rate =  0.5 , num_est =  500 , max_depth =  9 , sub 0.5 , Score =  0.963998850722881
learning_rate =  0.5 , num_est =  500 , max_depth =  9 , sub 0.75 , Score =  0.9673104874225977
learning_rate =  0.5 , num_est =  500 , max_depth =  9 , sub 1 , Score =  0.9654357430023328
learning_rate =  0.75 , num_est =  200 , max_depth =  5 , sub 0.5 , Score =  0.9573880983651157
learning_rate =  0.75 , num_est =  200 , max_depth =  5 , sub 0.75 , Score =  0.9633113438945434
learning_rate =  0.75 , num_est =  200 , max_depth =  5 , sub 1 , Score =  0.9643148965639284
learning_rate =  0.75 , num_est =  200 , max_depth =  

learning_rate =  1 , num_est =  300 , max_depth =  6 , sub 1 , Score =  0.9642991683143544
learning_rate =  1 , num_est =  300 , max_depth =  7 , sub 0.5 , Score =  0.9577171288775566
learning_rate =  1 , num_est =  300 , max_depth =  7 , sub 0.75 , Score =  0.9613570598987117
learning_rate =  1 , num_est =  300 , max_depth =  7 , sub 1 , Score =  0.9645899073002241
learning_rate =  1 , num_est =  300 , max_depth =  8 , sub 0.5 , Score =  0.9553718244429522
learning_rate =  1 , num_est =  300 , max_depth =  8 , sub 0.75 , Score =  0.9621973375034342
learning_rate =  1 , num_est =  300 , max_depth =  8 , sub 1 , Score =  0.9647974037217921
learning_rate =  1 , num_est =  300 , max_depth =  9 , sub 0.5 , Score =  0.9552639316839997
learning_rate =  1 , num_est =  300 , max_depth =  9 , sub 0.75 , Score =  0.9620523802914045
learning_rate =  1 , num_est =  300 , max_depth =  9 , sub 1 , Score =  0.964718721555279
learning_rate =  1 , num_est =  400 , max_depth =  5 , sub 0.5 , Score =  0.

In [14]:
best_params

{'lr': 0.1, 'n_est': 400, 'max_depth': 9, 'subsample': 0.75}

In [15]:
best_score

0.9691317090716793

In [16]:
model = XGBClassifier(learning_rate = best_params['lr'], n_estimators=best_params['n_est'], max_depth = best_params['max_depth'], subsample = best_params['subsample'])

In [17]:
model.fit(X_train_feats1, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=9,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=400, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.75,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [18]:
res = model.predict_proba(X_test_feats1)

# Модель на сырых урлах:

In [41]:
def extract_raw_url(file):
    tmp = open(file, "r", encoding="utf-8")
    num_str = -1
    for elem in tmp:
        num_str+=1
    tmp.close()
    X = []
    y = []
    raw_data = open(file, "r", encoding="utf-8")
    raw_data.readline()
    for elem in tqdm(range(num_str)):
        elem = raw_data.readline()
        raw_info = elem.strip().split('\t')
        cur_ans = int(raw_info[1])
        cur_url = raw_info[2]
        parsed_url = ' '.join(re.split(r'[ /:.?!=%]+', cur_url))
        X.append(parsed_url)
        y.append(cur_ans)
    raw_data.close()
    return X, y

In [42]:
X_train_raw_url, y_train = extract_raw_url('kaggle_train_data_tab.csv')
X_test_raw_url, y_test = extract_raw_url('kaggle_test_data_tab.csv')

100%|██████████| 7044/7044 [00:01<00:00, 6958.70it/s]
100%|██████████| 16039/16039 [00:02<00:00, 6087.83it/s]


In [43]:
y_train = np.array(y_train)

In [44]:
#max_df
Folds = KFold(n_splits = 5)
log_params = dict()
log_params['score'] = 0
log_params['ngram_range'] = None
log_params['C'] = None
log_params['max_df'] = None
log_params['dec_err'] = None
errs = ['ignore', 'replace', 'strict']
ngrams = [1, 2, 3]
coeff = [1, 10, 100, 1000, 10000]
max_df = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
for ngr in ngrams:
    for mdf in max_df:
        for err in errs:
            vectorizer = TfidfVectorizer(ngram_range = (1, ngr), max_df = mdf, decode_error=err, norm = 'l2', sublinear_tf = True)
            vectorizer.fit(X_train_raw_url)
            X_train_raw_url1 = vectorizer.transform(X_train_raw_url)
            for coeff1 in coeff:
                total_res = 0
                for train_id , test_id in Folds.split(X_train_raw_url1):
                    X1 = X_train_raw_url1[train_id]
                    X2  = X_train_raw_url1[test_id]
                    y1 = y_train[train_id]
                    y2 = y_train[test_id]
                    clf = LogisticRegression(C = coeff1)
                    clf.fit(X1, y1)
                    y_pred = clf.predict(X2)
                    total_res += f1_score(y_pred, y2)
                total_res /= 5
                print('ngram = ', ngr, ' C = ', coeff1, ' dec_err = ', err, ' max_df = ',mdf, ', score = ', total_res)
                if total_res > log_params['score']:
                    log_params['score'] = total_res
                    log_params['ngram_range'] = ngr
                    log_params['C'] = coeff1
                    log_params['dec_err'] = err
                    log_params['max_df'] = mdf

ngram =  1  C =  1  dec_err =  ignore  max_df =  0.7 , score =  0.9451705473861894
ngram =  1  C =  10  dec_err =  ignore  max_df =  0.7 , score =  0.9614956260086378
ngram =  1  C =  100  dec_err =  ignore  max_df =  0.7 , score =  0.9654146061023642
ngram =  1  C =  1000  dec_err =  ignore  max_df =  0.7 , score =  0.9658995666380392
ngram =  1  C =  10000  dec_err =  ignore  max_df =  0.7 , score =  0.9663671591363954
ngram =  1  C =  1  dec_err =  replace  max_df =  0.7 , score =  0.9451705473861894
ngram =  1  C =  10  dec_err =  replace  max_df =  0.7 , score =  0.9614956260086378
ngram =  1  C =  100  dec_err =  replace  max_df =  0.7 , score =  0.9654146061023642
ngram =  1  C =  1000  dec_err =  replace  max_df =  0.7 , score =  0.9658995666380392
ngram =  1  C =  10000  dec_err =  replace  max_df =  0.7 , score =  0.9663671591363954
ngram =  1  C =  1  dec_err =  strict  max_df =  0.7 , score =  0.9451705473861894
ngram =  1  C =  10  dec_err =  strict  max_df =  0.7 , score 

ngram =  1  C =  100  dec_err =  replace  max_df =  1 , score =  0.0
ngram =  1  C =  1000  dec_err =  replace  max_df =  1 , score =  0.0
ngram =  1  C =  10000  dec_err =  replace  max_df =  1 , score =  0.0
ngram =  1  C =  1  dec_err =  strict  max_df =  1 , score =  0.7367924226855553
ngram =  1  C =  10  dec_err =  strict  max_df =  1 , score =  0.7367924226855553
ngram =  1  C =  100  dec_err =  strict  max_df =  1 , score =  0.0
ngram =  1  C =  1000  dec_err =  strict  max_df =  1 , score =  0.0
ngram =  1  C =  10000  dec_err =  strict  max_df =  1 , score =  0.0
ngram =  2  C =  1  dec_err =  ignore  max_df =  0.7 , score =  0.9459260165292989
ngram =  2  C =  10  dec_err =  ignore  max_df =  0.7 , score =  0.9612750813742743
ngram =  2  C =  100  dec_err =  ignore  max_df =  0.7 , score =  0.9658071648055186
ngram =  2  C =  1000  dec_err =  ignore  max_df =  0.7 , score =  0.9668693906006615
ngram =  2  C =  10000  dec_err =  ignore  max_df =  0.7 , score =  0.966854401542

ngram =  2  C =  10000  dec_err =  strict  max_df =  0.95 , score =  0.966854401542319
ngram =  2  C =  1  dec_err =  ignore  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  10  dec_err =  ignore  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  100  dec_err =  ignore  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  1000  dec_err =  ignore  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  10000  dec_err =  ignore  max_df =  1 , score =  0.4416069617688884
ngram =  2  C =  1  dec_err =  replace  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  10  dec_err =  replace  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  100  dec_err =  replace  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  1000  dec_err =  replace  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  10000  dec_err =  replace  max_df =  1 , score =  0.4416069617688884
ngram =  2  C =  1  dec_err =  strict  max_df =  1 , score =  0.73679242268555

ngram =  3  C =  1  dec_err =  replace  max_df =  0.95 , score =  0.9355378920334816
ngram =  3  C =  10  dec_err =  replace  max_df =  0.95 , score =  0.9581565941970875
ngram =  3  C =  100  dec_err =  replace  max_df =  0.95 , score =  0.9618014182483826
ngram =  3  C =  1000  dec_err =  replace  max_df =  0.95 , score =  0.9640377661941552
ngram =  3  C =  10000  dec_err =  replace  max_df =  0.95 , score =  0.9648872019810245
ngram =  3  C =  1  dec_err =  strict  max_df =  0.95 , score =  0.9355378920334816
ngram =  3  C =  10  dec_err =  strict  max_df =  0.95 , score =  0.9581565941970875
ngram =  3  C =  100  dec_err =  strict  max_df =  0.95 , score =  0.9618014182483826
ngram =  3  C =  1000  dec_err =  strict  max_df =  0.95 , score =  0.9640377661941552
ngram =  3  C =  10000  dec_err =  strict  max_df =  0.95 , score =  0.9648872019810245
ngram =  3  C =  1  dec_err =  ignore  max_df =  1 , score =  0.7367924226855553
ngram =  3  C =  10  dec_err =  ignore  max_df =  1 , 

In [45]:
log_params

{'score': 0.9668693906006615,
 'ngram_range': 2,
 'C': 1000,
 'max_df': 0.7,
 'dec_err': 'ignore'}

In [46]:
vectorizer_raw_url = TfidfVectorizer(ngram_range = (1, 2), max_df = 0.7, decode_error = 'ignore', sublinear_tf = True)
vectorizer_raw_url.fit(X_train_raw_url)
X_train_raw_url1 = vectorizer_raw_url.transform(X_train_raw_url)
X_test_raw_url1 = vectorizer_raw_url.transform(X_test_raw_url)

In [47]:
X_train_raw_url1.shape

(7044, 49083)

In [48]:
url_model = LogisticRegression(C = 1000)
url_model.fit(X_train_raw_url1, np.array(y_train))
url_pred = url_model.predict_proba(X_test_raw_url1)

# Модель на очищенных урлах:

In [49]:
def extract_cleaned_url(file):
    tmp = open(file, "r", encoding="utf-8")
    num_str = -1
    for elem in tmp:
        num_str+=1
    tmp.close()
    X = []
    y = []
    raw_data = open(file, "r", encoding="utf-8")
    raw_data.readline()
    for elem in tqdm(range(num_str)):
        elem = raw_data.readline()
        raw_info = elem.strip().split('\t')
        cur_ans = int(raw_info[1])
        cur_url = raw_info[2]
        tokens = []
        parsed_url = ' '.join(re.split(r'[ /:.?!=%]+', cur_url))
        for word in pymorphy_tokenizer(parsed_url):
            tokens.append(word)
        morfed_url = ' '.join(tokens)
        X.append(morfed_url)
        y.append(cur_ans)
    raw_data.close()
    return X, y

In [50]:
X_train_clean_url, y_train = extract_cleaned_url('kaggle_train_data_tab.csv')
X_test_clean_url, y_test = extract_cleaned_url('kaggle_test_data_tab.csv')

100%|██████████| 7044/7044 [00:01<00:00, 6294.01it/s]
100%|██████████| 16039/16039 [00:02<00:00, 5406.24it/s]


In [51]:
y_train = np.array(y_train)

In [52]:
Folds = KFold(n_splits = 5)
log_params = dict()
log_params['score'] = 0
log_params['ngram_range'] = None
log_params['C'] = None
log_params['max_df'] = None
log_params['dec_err'] = None
errs = ['ignore', 'replace', 'strict']
ngrams = [1, 2, 3]
coeff = [1, 10, 100, 1000, 10000]
max_df = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
for ngr in ngrams:
    for mdf in max_df:
        for err in errs:
            vectorizer = TfidfVectorizer(ngram_range = (1, ngr), max_df = mdf, decode_error=err, norm = 'l2', sublinear_tf = True)
            vectorizer.fit(X_train_clean_url)
            X_train_clean_url1 = vectorizer.transform(X_train_clean_url)
            for coeff1 in coeff:
                total_res = 0
                for train_id , test_id in Folds.split(X_train_clean_url1):
                    X1 = X_train_clean_url1[train_id]
                    X2  = X_train_clean_url1[test_id]
                    y1 = y_train[train_id]
                    y2 = y_train[test_id]
                    clf = LogisticRegression(C = coeff1)
                    clf.fit(X1, y1)
                    y_pred = clf.predict(X2)
                    total_res += f1_score(y_pred, y2)
                total_res /= 5
                print('ngram = ', ngr, ' C = ', coeff1, ' dec_err = ', err, ' max_df = ',mdf, ', score = ', total_res)
                if total_res > log_params['score']:
                    log_params['score'] = total_res
                    log_params['ngram_range'] = ngr
                    log_params['C'] = coeff1
                    log_params['dec_err'] = err
                    log_params['max_df'] = mdf

ngram =  1  C =  1  dec_err =  ignore  max_df =  0.7 , score =  0.9460671180600027
ngram =  1  C =  10  dec_err =  ignore  max_df =  0.7 , score =  0.962195850962409
ngram =  1  C =  100  dec_err =  ignore  max_df =  0.7 , score =  0.9665152180010969
ngram =  1  C =  1000  dec_err =  ignore  max_df =  0.7 , score =  0.9675999299382797
ngram =  1  C =  10000  dec_err =  ignore  max_df =  0.7 , score =  0.9675968847506373
ngram =  1  C =  1  dec_err =  replace  max_df =  0.7 , score =  0.9460671180600027
ngram =  1  C =  10  dec_err =  replace  max_df =  0.7 , score =  0.962195850962409
ngram =  1  C =  100  dec_err =  replace  max_df =  0.7 , score =  0.9665152180010969
ngram =  1  C =  1000  dec_err =  replace  max_df =  0.7 , score =  0.9675999299382797
ngram =  1  C =  10000  dec_err =  replace  max_df =  0.7 , score =  0.9675968847506373
ngram =  1  C =  1  dec_err =  strict  max_df =  0.7 , score =  0.9460671180600027
ngram =  1  C =  10  dec_err =  strict  max_df =  0.7 , score = 

ngram =  1  C =  100  dec_err =  replace  max_df =  1 , score =  0.0
ngram =  1  C =  1000  dec_err =  replace  max_df =  1 , score =  0.0
ngram =  1  C =  10000  dec_err =  replace  max_df =  1 , score =  0.0
ngram =  1  C =  1  dec_err =  strict  max_df =  1 , score =  0.7367924226855553
ngram =  1  C =  10  dec_err =  strict  max_df =  1 , score =  0.7367924226855553
ngram =  1  C =  100  dec_err =  strict  max_df =  1 , score =  0.0
ngram =  1  C =  1000  dec_err =  strict  max_df =  1 , score =  0.0
ngram =  1  C =  10000  dec_err =  strict  max_df =  1 , score =  0.0
ngram =  2  C =  1  dec_err =  ignore  max_df =  0.7 , score =  0.946564111322623
ngram =  2  C =  10  dec_err =  ignore  max_df =  0.7 , score =  0.9621415590279676
ngram =  2  C =  100  dec_err =  ignore  max_df =  0.7 , score =  0.9658562064236793
ngram =  2  C =  1000  dec_err =  ignore  max_df =  0.7 , score =  0.9670441567809199
ngram =  2  C =  10000  dec_err =  ignore  max_df =  0.7 , score =  0.9673717592193

ngram =  2  C =  10000  dec_err =  strict  max_df =  0.95 , score =  0.9673717592193102
ngram =  2  C =  1  dec_err =  ignore  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  10  dec_err =  ignore  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  100  dec_err =  ignore  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  1000  dec_err =  ignore  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  10000  dec_err =  ignore  max_df =  1 , score =  0.4416069617688884
ngram =  2  C =  1  dec_err =  replace  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  10  dec_err =  replace  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  100  dec_err =  replace  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  1000  dec_err =  replace  max_df =  1 , score =  0.7367924226855553
ngram =  2  C =  10000  dec_err =  replace  max_df =  1 , score =  0.4416069617688884
ngram =  2  C =  1  dec_err =  strict  max_df =  1 , score =  0.7367924226855

ngram =  3  C =  10  dec_err =  replace  max_df =  0.95 , score =  0.959714144219323
ngram =  3  C =  100  dec_err =  replace  max_df =  0.95 , score =  0.9628050622358841
ngram =  3  C =  1000  dec_err =  replace  max_df =  0.95 , score =  0.9642996666415538
ngram =  3  C =  10000  dec_err =  replace  max_df =  0.95 , score =  0.9654051909503953
ngram =  3  C =  1  dec_err =  strict  max_df =  0.95 , score =  0.9360585911881125
ngram =  3  C =  10  dec_err =  strict  max_df =  0.95 , score =  0.959714144219323
ngram =  3  C =  100  dec_err =  strict  max_df =  0.95 , score =  0.9628050622358841
ngram =  3  C =  1000  dec_err =  strict  max_df =  0.95 , score =  0.9642996666415538
ngram =  3  C =  10000  dec_err =  strict  max_df =  0.95 , score =  0.9654051909503953
ngram =  3  C =  1  dec_err =  ignore  max_df =  1 , score =  0.7367924226855553
ngram =  3  C =  10  dec_err =  ignore  max_df =  1 , score =  0.7367924226855553
ngram =  3  C =  100  dec_err =  ignore  max_df =  1 , scor

In [53]:
log_params

{'score': 0.9675999299382797,
 'ngram_range': 1,
 'C': 1000,
 'max_df': 0.7,
 'dec_err': 'ignore'}

In [54]:
vectorizer_clean_url = TfidfVectorizer(ngram_range = (1, 1), max_df = 0.7, decode_error = 'ignore', sublinear_tf = True)
vectorizer_clean_url.fit(X_train_clean_url)
X_train_clean_url1 = vectorizer_clean_url.transform(X_train_clean_url)
X_test_clean_url1 = vectorizer_clean_url.transform(X_test_clean_url)

In [55]:
X_train_clean_url1.shape

(7044, 16442)

In [56]:
url_clean_model = LogisticRegression(C = 1000)
url_clean_model.fit(X_train_clean_url1, np.array(y_train))
url_clean_pred = url_clean_model.predict_proba(X_test_clean_url1)

# Модель на сырых анкорах:

In [62]:
def extract_raw_anchor(file):
    tmp = open(file, "r", encoding="utf-8")
    num_str = -1
    for elem in tmp:
        num_str+=1
    tmp.close()
    X = []
    y = []
    raw_data = open(file, "r", encoding="utf-8")
    raw_data.readline()
    for elem in tqdm(range(num_str)):
        elem = raw_data.readline()
        raw_info = elem.strip().split('\t')
        cur_ans = int(raw_info[1])
        cur_txt = raw_info[3]
        cur_txt = b64decode(cur_txt).decode("utf-8", errors="replace")
        cur_anchor = html2anchor_easy_parser(cur_txt)
        res = ' '.join(cur_anchor)
        X.append(res)
        y.append(cur_ans)
    raw_data.close()
    return X, y

In [63]:
X_train_raw_anchor, y_train = extract_raw_anchor('kaggle_train_data_tab.csv')
X_test_raw_anchor, y_test = extract_raw_anchor('kaggle_test_data_tab.csv')

100%|██████████| 7044/7044 [01:07<00:00, 104.13it/s]
100%|██████████| 16039/16039 [03:07<00:00, 85.54it/s] 


In [64]:
y_train = np.array(y_train)

In [66]:
Folds = KFold(n_splits = 3)
log_params = dict()
log_params['score'] = 0
log_params['ngram_range'] = None
log_params['C'] = None
log_params['max_df'] = None
log_params['dec_err'] = None
errs = ['ignore', 'strict']
ngrams = [1, 2, 3]
coeff = [1000, 10000]
max_df = [0.7, 0.85, 1]
for ngr in ngrams:
    for mdf in max_df:
        for err in errs:
            vectorizer = TfidfVectorizer(ngram_range = (1, ngr), max_df = mdf, decode_error=err, norm = 'l2', sublinear_tf = True)
            vectorizer.fit(X_train_raw_anchor)
            X_train_raw_anchor1 = vectorizer.transform(X_train_raw_anchor)
            for coeff1 in coeff:
                total_res = 0
                for train_id , test_id in Folds.split(X_train_raw_anchor1):
                    X1 = X_train_raw_anchor1[train_id]
                    X2  = X_train_raw_anchor1[test_id]
                    y1 = y_train[train_id]
                    y2 = y_train[test_id]
                    clf = LogisticRegression(C = coeff1)
                    clf.fit(X1, y1)
                    y_pred = clf.predict(X2)
                    total_res += f1_score(y_pred, y2)
                total_res /= 3
                print('ngram = ', ngr, ' C = ', coeff1, ' dec_err = ', err, ' max_df = ',mdf, ', score = ', total_res)
                if total_res > log_params['score']:
                    log_params['score'] = total_res
                    log_params['ngram_range'] = ngr
                    log_params['C'] = coeff1
                    log_params['dec_err'] = err
                    log_params['max_df'] = mdf

ngram =  1  C =  1000  dec_err =  ignore  max_df =  0.7 , score =  0.9712187634701447
ngram =  1  C =  10000  dec_err =  ignore  max_df =  0.7 , score =  0.9712386775164551
ngram =  1  C =  1000  dec_err =  strict  max_df =  0.7 , score =  0.9712187634701447
ngram =  1  C =  10000  dec_err =  strict  max_df =  0.7 , score =  0.9712386775164551
ngram =  1  C =  1000  dec_err =  ignore  max_df =  0.85 , score =  0.9712187634701447
ngram =  1  C =  10000  dec_err =  ignore  max_df =  0.85 , score =  0.9712386775164551
ngram =  1  C =  1000  dec_err =  strict  max_df =  0.85 , score =  0.9712187634701447
ngram =  1  C =  10000  dec_err =  strict  max_df =  0.85 , score =  0.9712386775164551
ngram =  1  C =  1000  dec_err =  ignore  max_df =  1 , score =  0.7367999032840951
ngram =  1  C =  10000  dec_err =  ignore  max_df =  1 , score =  0.7367999032840951
ngram =  1  C =  1000  dec_err =  strict  max_df =  1 , score =  0.7367999032840951
ngram =  1  C =  10000  dec_err =  strict  max_df =

In [67]:
log_params

{'score': 0.9712386775164551,
 'ngram_range': 1,
 'C': 10000,
 'max_df': 0.7,
 'dec_err': 'ignore'}

In [68]:
vectorizer_raw_anchor = TfidfVectorizer(ngram_range = (1, 1), max_df = 0.7, decode_error = 'ignore', sublinear_tf = True)
vectorizer_raw_anchor.fit(X_train_raw_anchor)
X_train_raw_anchor1 = vectorizer_raw_anchor.transform(X_train_raw_anchor)
X_test_raw_anchor1 = vectorizer_raw_anchor.transform(X_test_raw_anchor)

In [69]:
X_train_raw_anchor1.shape

(7044, 170097)

In [70]:
anchor_raw_model = LogisticRegression(C = 10000)
anchor_raw_model.fit(X_train_raw_anchor1, np.array(y_train))
anchor_raw_pred = anchor_raw_model.predict_proba(X_test_raw_anchor1)

# Модель на очищенных анкорах:

In [71]:
def extract_cleaned_anchor(file):
    tmp = open(file, "r", encoding="utf-8")
    num_str = -1
    for elem in tmp:
        num_str+=1
    tmp.close()
    X = []
    y = []
    raw_data = open(file, "r", encoding="utf-8")
    raw_data.readline()
    for elem in tqdm(range(num_str)):
        elem = raw_data.readline()
        raw_info = elem.strip().split('\t')
        cur_ans = int(raw_info[1])
        cur_txt = raw_info[3]
        cur_txt = b64decode(cur_txt).decode("utf-8", errors="replace")
        cur_anchor = html2anchor_morf_parser(cur_txt)
        res = ' '.join(cur_anchor)
        X.append(res)
        y.append(cur_ans)
    raw_data.close()
    return X, y

In [72]:
X_train_clean_anchor, y_train = extract_cleaned_anchor('kaggle_train_data_tab.csv')
X_test_clean_anchor, y_test = extract_cleaned_anchor('kaggle_test_data_tab.csv')

100%|██████████| 7044/7044 [01:31<00:00, 77.21it/s] 
100%|██████████| 16039/16039 [03:29<00:00, 76.44it/s] 


In [73]:
y_train = np.array(y_train)

In [74]:
Folds = KFold(n_splits = 3)
log_params = dict()
log_params['score'] = 0
log_params['ngram_range'] = None
log_params['C'] = None
log_params['max_df'] = None
log_params['dec_err'] = None
errs = ['ignore', 'strict']
ngrams = [1, 2, 3]
coeff = [1000, 10000]
max_df = [0.7, 0.85, 1]
for ngr in ngrams:
    for mdf in max_df:
        for err in errs:
            vectorizer = TfidfVectorizer(ngram_range = (1, ngr), max_df = mdf, decode_error=err, norm = 'l2', sublinear_tf = True)
            vectorizer.fit(X_train_raw_anchor)
            X_train_clean_anchor1 = vectorizer.transform(X_train_clean_anchor)
            for coeff1 in coeff:
                total_res = 0
                for train_id , test_id in Folds.split(X_train_raw_anchor1):
                    X1 = X_train_clean_anchor1[train_id]
                    X2  = X_train_clean_anchor1[test_id]
                    y1 = y_train[train_id]
                    y2 = y_train[test_id]
                    clf = LogisticRegression(C = coeff1)
                    clf.fit(X1, y1)
                    y_pred = clf.predict(X2)
                    total_res += f1_score(y_pred, y2)
                total_res /= 3
                print('ngram = ', ngr, ' C = ', coeff1, ' dec_err = ', err, ' max_df = ',mdf, ', score = ', total_res)
                if total_res > log_params['score']:
                    log_params['score'] = total_res
                    log_params['ngram_range'] = ngr
                    log_params['C'] = coeff1
                    log_params['dec_err'] = err
                    log_params['max_df'] = mdf

ngram =  1  C =  1000  dec_err =  ignore  max_df =  0.7 , score =  0.967685962927917
ngram =  1  C =  10000  dec_err =  ignore  max_df =  0.7 , score =  0.9678180057335166
ngram =  1  C =  1000  dec_err =  strict  max_df =  0.7 , score =  0.967685962927917
ngram =  1  C =  10000  dec_err =  strict  max_df =  0.7 , score =  0.9678180057335166
ngram =  1  C =  1000  dec_err =  ignore  max_df =  0.85 , score =  0.967685962927917
ngram =  1  C =  10000  dec_err =  ignore  max_df =  0.85 , score =  0.9678180057335166
ngram =  1  C =  1000  dec_err =  strict  max_df =  0.85 , score =  0.967685962927917
ngram =  1  C =  10000  dec_err =  strict  max_df =  0.85 , score =  0.9678180057335166
ngram =  1  C =  1000  dec_err =  ignore  max_df =  1 , score =  0.7984722112936486
ngram =  1  C =  10000  dec_err =  ignore  max_df =  1 , score =  0.7997217095726795
ngram =  1  C =  1000  dec_err =  strict  max_df =  1 , score =  0.7984722112936486
ngram =  1  C =  10000  dec_err =  strict  max_df =  1 

In [75]:
log_params

{'score': 0.970431225621541,
 'ngram_range': 2,
 'C': 1000,
 'max_df': 0.7,
 'dec_err': 'ignore'}

In [76]:
vectorizer_clean_anchor = TfidfVectorizer(ngram_range = (1, 2), max_df = 0.7, decode_error = 'ignore', sublinear_tf = True)
vectorizer_clean_anchor.fit(X_train_clean_anchor)
X_train_clean_anchor1 = vectorizer_clean_anchor.transform(X_train_clean_anchor)
X_test_clean_anchor1 = vectorizer_clean_anchor.transform(X_test_clean_anchor)

In [77]:
X_train_clean_anchor1.shape

(7044, 849745)

In [78]:
anchor_clean_model = LogisticRegression(C = 1000)
anchor_clean_model.fit(X_train_clean_anchor1, np.array(y_train))
anchor_clean_pred = anchor_clean_model.predict_proba(X_test_clean_anchor1)

# Модель на очищенных заголовках

In [79]:
def extract_cleaned_title(file):
    tmp = open(file, "r", encoding="utf-8")
    num_str = -1
    for elem in tmp:
        num_str+=1
    tmp.close()
    X = []
    y = []
    raw_data = open(file, "r", encoding="utf-8")
    raw_data.readline()
    for elem in tqdm(range(num_str)):
        elem = raw_data.readline()
        raw_info = elem.strip().split('\t')
        cur_ans = int(raw_info[1])
        cur_txt = raw_info[3]
        cur_txt = b64decode(cur_txt).decode("utf-8", errors="replace")
        cur_title = html2title_morf_parser(cur_txt)
        res = ' '.join(cur_title)
        X.append(res)
        y.append(cur_ans)
    raw_data.close()
    return X, y

In [80]:
X_train_clean_title, y_train = extract_cleaned_title('kaggle_train_data_tab.csv')
X_test_clean_title, y_test = extract_cleaned_title('kaggle_test_data_tab.csv')

100%|██████████| 7044/7044 [01:07<00:00, 104.05it/s]
100%|██████████| 16039/16039 [03:04<00:00, 86.76it/s] 


In [85]:
y_train = np.array(y_train)

In [88]:
Folds = KFold(n_splits = 6)
log_params = dict()
log_params['score'] = 0
log_params['ngram_range'] = None
log_params['C'] = None
log_params['max_df'] = None
log_params['dec_err'] = None
errs = ['ignore', 'replace', 'strict']
ngrams = [1, 2, 3]
coeff = [1000, 5000, 10000]
max_df = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]
for ngr in ngrams:
    for mdf in max_df:
        for err in errs:
            vectorizer = TfidfVectorizer(ngram_range = (1, ngr), max_df = mdf, decode_error=err, norm = 'l2', sublinear_tf = True)
            vectorizer.fit(X_train_clean_url)
            X_train_clean_title1 = vectorizer.transform(X_train_clean_title)
            for coeff1 in coeff:
                total_res = 0
                for train_id , test_id in Folds.split(X_train_clean_title1):
                    X1 = X_train_clean_title1[train_id]
                    X2  = X_train_clean_title1[test_id]
                    y1 = y_train[train_id]
                    y2 = y_train[test_id]
                    clf = LogisticRegression(C = coeff1)
                    clf.fit(X1, y1)
                    y_pred = clf.predict(X2)
                    total_res += f1_score(y_pred, y2)
                total_res /= 6
                print('ngram = ', ngr, ' C = ', coeff1, ' dec_err = ', err, ' max_df = ',mdf, ', score = ', total_res)
                if total_res > log_params['score']:
                    log_params['score'] = total_res
                    log_params['ngram_range'] = ngr
                    log_params['C'] = coeff1
                    log_params['dec_err'] = err
                    log_params['max_df'] = mdf

ngram =  1  C =  1000  dec_err =  ignore  max_df =  0.7 , score =  0.7696100047741248
ngram =  1  C =  5000  dec_err =  ignore  max_df =  0.7 , score =  0.7685313704210457
ngram =  1  C =  10000  dec_err =  ignore  max_df =  0.7 , score =  0.7678233652448938
ngram =  1  C =  1000  dec_err =  replace  max_df =  0.7 , score =  0.7696100047741248
ngram =  1  C =  5000  dec_err =  replace  max_df =  0.7 , score =  0.7685313704210457
ngram =  1  C =  10000  dec_err =  replace  max_df =  0.7 , score =  0.7678233652448938
ngram =  1  C =  1000  dec_err =  strict  max_df =  0.7 , score =  0.7696100047741248
ngram =  1  C =  5000  dec_err =  strict  max_df =  0.7 , score =  0.7685313704210457
ngram =  1  C =  10000  dec_err =  strict  max_df =  0.7 , score =  0.7678233652448938
ngram =  1  C =  1000  dec_err =  ignore  max_df =  0.75 , score =  0.7696100047741248
ngram =  1  C =  5000  dec_err =  ignore  max_df =  0.75 , score =  0.7685313704210457
ngram =  1  C =  10000  dec_err =  ignore  max

ngram =  2  C =  10000  dec_err =  replace  max_df =  0.85 , score =  0.7678814586887491
ngram =  2  C =  1000  dec_err =  strict  max_df =  0.85 , score =  0.7684190271815917
ngram =  2  C =  5000  dec_err =  strict  max_df =  0.85 , score =  0.7679088370744059
ngram =  2  C =  10000  dec_err =  strict  max_df =  0.85 , score =  0.7678814586887491
ngram =  2  C =  1000  dec_err =  ignore  max_df =  0.9 , score =  0.7684190271815917
ngram =  2  C =  5000  dec_err =  ignore  max_df =  0.9 , score =  0.7679088370744059
ngram =  2  C =  10000  dec_err =  ignore  max_df =  0.9 , score =  0.7678814586887491
ngram =  2  C =  1000  dec_err =  replace  max_df =  0.9 , score =  0.7684190271815917
ngram =  2  C =  5000  dec_err =  replace  max_df =  0.9 , score =  0.7679088370744059
ngram =  2  C =  10000  dec_err =  replace  max_df =  0.9 , score =  0.7678814586887491
ngram =  2  C =  1000  dec_err =  strict  max_df =  0.9 , score =  0.7684190271815917
ngram =  2  C =  5000  dec_err =  strict  

In [89]:
log_params

{'score': 0.7696100047741248,
 'ngram_range': 1,
 'C': 1000,
 'max_df': 0.7,
 'dec_err': 'ignore'}

In [90]:
vectorizer_clean_title = TfidfVectorizer(ngram_range = (1, 1), max_df = 0.7, decode_error = 'ignore', sublinear_tf = True)
vectorizer_clean_title.fit(X_train_clean_title)
X_train_clean_title1 = vectorizer_clean_title.transform(X_train_clean_title)
X_test_clean_title1 = vectorizer_clean_title.transform(X_test_clean_title)

In [91]:
X_train_clean_title1.shape

(7044, 13770)

In [92]:
title_clean_model = LogisticRegression(C = 1000)
title_clean_model.fit(X_train_clean_title1, np.array(y_train))
title_clean_pred = title_clean_model.predict_proba(X_test_clean_title1)

# Модель на сыром тексте:

In [93]:
def extract_raw_info(file):
    tmp = open(file, "r", encoding="utf-8")
    num_str = -1
    for elem in tmp:
        num_str+=1
    tmp.close()
    X = []
    y = []
    raw_data = open(file, "r", encoding="utf-8")
    raw_data.readline()
    for elem in tqdm(range(num_str)):
        elem = raw_data.readline()
        raw_info = elem.strip().split('\t')
        cur_ans = int(raw_info[1])
        cur_txt = raw_info[3]
        X.append(b64decode(cur_txt).decode("utf-8", errors="replace"))
        y.append(cur_ans)
    raw_data.close()
    return X, y

In [94]:
X_train_raw, y_train = extract_raw_info('kaggle_train_data_tab.csv')
X_test_raw, y_test = extract_raw_info('kaggle_test_data_tab.csv')

100%|██████████| 7044/7044 [00:03<00:00, 2338.84it/s]
100%|██████████| 16039/16039 [00:07<00:00, 2005.28it/s]


In [95]:
y_train = np.array(y_train)

In [96]:
vectorizer = TfidfVectorizer(ngram_range = (1, 4), sublinear_tf = True)
vectorizer.fit(X_train_raw)
X_train_raw1 = vectorizer.transform(X_train_raw)
X_test_raw1 = vectorizer.transform(X_test_raw)

In [112]:
X_train_raw1.shape

(7044, 33461229)

In [120]:
Folds = KFold(n_splits = 7)
sgd_params = dict()
sgd_params['loss'] = None
sgd_params['penalty'] = None
sgd_params['alpha'] = None
sgd_params['score'] = 0
loss = ['log', 'modified_huber']
penal = ['elasticnet', 'l2']
alpha = [0.0001, 0.001, 0.01]
for ls in loss:
    for pt in penal:
        for alph in alpha:
            total_res = 0
            for train_id , test_id in Folds.split(X_train_raw1):
                print(1)
                X1 = X_train_raw1[train_id]
                X2  = X_train_raw1[test_id]
                y1 = y_train[train_id]
                y2 = y_train[test_id]
                clf = SGDClassifier(loss = ls, penalty = pt, alpha = alph)
                clf.fit(X1, y1)
                y_pred = clf.predict(X2)
                total_res += f1_score(y_pred, y2)
            total_res /= 7
            print('loss = ', ls, ' penalty = ', pt, ' alpha = ', alph, ', score = ', total_res)
            if total_res > sgd_params['score']:
                sgd_params['score'] = total_res
                sgd_params['loss'] = ls
                sgd_params['penalty'] = pt
                sgd_params['alpha'] = alph

1
1
1
1
1
1
1
loss =  log  penalty =  elasticnet  alpha =  0.0001 , score =  0.9643466036742877
1
1
1
1
1
1
1
loss =  log  penalty =  elasticnet  alpha =  0.001 , score =  0.7496626158470496
1
1
1
1
1
1
1
loss =  log  penalty =  elasticnet  alpha =  0.01 , score =  0.7366815589878054
1
1
1
1
1
1
1
loss =  log  penalty =  l2  alpha =  0.0001 , score =  0.9771749519353821
1
1
1
1
1
1
1
loss =  log  penalty =  l2  alpha =  0.001 , score =  0.9323295861769937
1
1
1
1
1
1
1
loss =  log  penalty =  l2  alpha =  0.01 , score =  0.7412697880049721
1
1
1
1
1
1
1
loss =  modified_huber  penalty =  elasticnet  alpha =  0.0001 , score =  0.9828205384115307
1
1
1
1
1
1
1
loss =  modified_huber  penalty =  elasticnet  alpha =  0.001 , score =  0.9558897445196104
1
1
1
1
1
1
1
loss =  modified_huber  penalty =  elasticnet  alpha =  0.01 , score =  0.7366815589878054
1
1
1
1
1
1
1
loss =  modified_huber  penalty =  l2  alpha =  0.0001 , score =  0.9851054833847284
1
1
1
1
1
1
1
loss =  modified_huber 

In [121]:
sgd_params

{'loss': 'modified_huber',
 'penalty': 'l2',
 'alpha': 0.0001,
 'score': 0.9851054833847284}

In [122]:
clf_alt = SGDClassifier(loss = 'modified_huber', penalty = 'l2', alpha = 0.0001)
clf_alt.fit(X_train_raw1, y_train)
res_alt = clf_alt.predict_proba(X_test_raw1)

# Финальное голосование

In [129]:
#res_alt - результаты фичей
#res - сырой текст
#url_pred - сырые урлы
#url_clean_pred - очищенные урлы
#title_clean_pred очищенные заголовки
#сырые анкоры
#anchor_clean_predочищенные анкоры

In [131]:
predictions = np.array(((res + res_alt + url_clean_pred + anchor_clean_pred+title_clean_pred)/5)[:,1] >= 0.519, dtype=int)##best

In [132]:
with open('ensemble_clean_last.csv', 'wt') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Id','Prediction'])
    for i in range(len(ids)):
        prediction = predictions[i]
        writer.writerow([ids[i], int(prediction)])