In [3]:
from pymystem3 import Mystem
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import html2text
import os
import bs4
import pickle

In [6]:
doc_to_title = {}
stem = Mystem()
with open('docs_titles.tsv') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        if len(data) > 1:
            # print(data[1])
            data[1] = ''.join(stem.lemmatize(data[1])).replace('\n', '')
            # print(data[1])
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = data[1]
        doc_to_title[doc_id] = title
        
print (len(doc_to_title))

28026


In [136]:
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = doc_to_title[doc_id]
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))

y_train_lin = []
X_train_lin = []
groups_train = []
mean = []
median = []
std = []

for new_group in traingroups_titledata:
    docs = traingroups_titledata[new_group]
    for k, (doc_id, title, target_id) in enumerate(docs):
        y_train_lin.append(target_id)
        groups_train.append(new_group)
        all_dist = []
        words = set(title.strip().split())
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j, target_j = docs[j]
            words_j = set(title_j.strip().split())
            all_dist.append(len(words.intersection(words_j)))
            
        all_dist = sorted(all_dist, reverse=True)
        mean.append(np.mean(np.array(all_dist)))
        median.append(np.median(np.array(all_dist)))
        std.append(np.std(np.array(all_dist)))
        X_train_lin.append(all_dist[0:15])
        
X_train_lin = np.array(X_train_lin)
X_train_lin = np.hstack((X_train_lin, np.array(mean)[:, np.newaxis]))
X_train_lin = np.hstack((X_train_lin, np.array(median)[:, np.newaxis]))
X_train_lin = np.hstack((X_train_lin, np.array(std)[:, np.newaxis]))
y_train_lin = np.array(y_train_lin)
groups_train = np.array(groups_train)
print (X_train_lin.shape, y_train_lin.shape, groups_train.shape)


(11690, 18) (11690,) (11690,)


In [137]:
test_data = pd.read_csv('test_groups.csv')
testgroups_titledata = {}
doc_id_test = []
for i in range(len(test_data)):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    doc_id_test.append(new_doc['pair_id'])
    title = doc_to_title[doc_id]
    if doc_group not in testgroups_titledata:
        testgroups_titledata[doc_group] = []
    testgroups_titledata[doc_group].append((doc_id, title))

X_test_lin = []
groups_test = []
mean = []
median = []
std = []

for new_group in testgroups_titledata:
    docs = testgroups_titledata[new_group]
    for k, (doc_id, title) in enumerate(docs):
        groups_test.append(new_group)
        all_dist = []
        words = set(title.strip().split())
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j = docs[j]
            words_j = set(title_j.strip().split())
            all_dist.append(len(words.intersection(words_j)))
        
        all_dist = sorted(all_dist, reverse=True)
        mean.append(np.mean(np.array(all_dist)))
        median.append(np.median(np.array(all_dist)))
        std.append(np.std(np.array(all_dist)))
        X_test_lin.append(all_dist[0:15])
        
X_test_lin = np.array(X_test_lin)
X_test_lin = np.hstack((X_test_lin, np.array(mean)[:, np.newaxis]))
X_test_lin = np.hstack((X_test_lin, np.array(median)[:, np.newaxis]))
X_test_lin = np.hstack((X_test_lin, np.array(std)[:, np.newaxis]))
groups_test = np.array(groups_test)
print (X_test_lin.shape, groups_test.shape, len(doc_id_test))

'''
scaler = StandardScaler()
X_train_lin = scaler.fit_transform(X_train_lin)
scaler = StandardScaler()
X_test_lin = scaler.fit_transform(X_test_lin)
'''

(16627, 18) (16627,) 16627


'\nscaler = StandardScaler()\nX_train_lin = scaler.fit_transform(X_train_lin)\nscaler = StandardScaler()\nX_test_lin = scaler.fit_transform(X_test_lin)\n'

In [9]:
from bs4 import BeautifulSoup
from bs4.element import Comment


m = Mystem()

def get_normalized(text):
    norm = list()
    lemmas = m.lemmatize(text)
    for lemma in lemmas:
        if lemma.isalnum():
            norm.append(lemma)
    return norm


def write_normalized(directory, name, normalized):
    if not os.path.exists(directory):
        os.mkdir(directory)
    if not directory.endswith('/'):
        directory += '/'
    with open(directory + name, 'w') as f:
        for line in normalized:
            f.write(line + '\n')


def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)

def normalize_html_files(directory):
    for r, d, f in os.walk(directory):
        for file in f:
            with open(directory + file) as html:
                link = html.readline()[:-1]
                name = os.path.basename(html.name)
                html_data = html.read()
            text = text_from_html(html_data)
            text = text.encode('utf-8', errors='surrogatepass').decode('utf-8', 'replace')
            normalized = get_normalized(text)
            # normalized.insert(0, link)
            write_normalized('normalized/', name, normalized)            

In [None]:
# normalize_html_files('content/')
# do not touch!!!

In [211]:
def idf(directory, traingroups_titledata):
    df_dict = {}
    
    for key in traingroups_titledata:
        df_dict[key] = {}
        count = 0
        for data in traingroups_titledata[key]:
            count += 1
            with open(directory + str(data[0]) + '.dat', 'r') as file:
                s = set()
                for line in file:
                    line = line[:-1]
                    s.update([line])

                for line in s:
                    if len(line) <= 2:
                        continue
                    elif line in df_dict[key]:
                        df_dict[key][line] += 1

                    else:
                        df_dict[key][line] = 1
                            
        for elem in df_dict[key]:            
            df_dict[key][elem] = count / df_dict[key][elem]
            df_dict[key][elem] = np.log(df_dict[key][elem])
            
                        
    return df_dict

In [212]:
def vector_components(directory, traingroups_titledata):
    vector_set = {}
    the_dict = {}
    
    for key in traingroups_titledata:
        the_dict[key] = {}
        for data in traingroups_titledata[key]:
           # if data[2] == 1:
            with open(directory + str(data[0]) + '.dat', 'r') as file:
                for line in file:
                    line = line[:-1]
                    if len(line) <= 2:
                        continue
                    elif line in the_dict[key]:
                        the_dict[key][line] += 1
                    else:
                        the_dict[key][line] = 1
        #print(the_dict[key])
                            
    for key in the_dict:
        l = []
        for elem in the_dict[key]:
            l.append((the_dict[key][elem], elem))
            
        l = list(reversed(sorted(l)))
        length = len(l)
        first = 0 #length // 100
        second = length - length // 5
        
        l = l[first:second]
        ll = []
        for el in l:
            if el[0] <= 5:
                break
            ll.append(el[1])
        
        s = set(ll)
        vector_set[key] = {a: i for a, i in zip(ll, range(len(ll)))}
                            
    return vector_set

In [259]:
def count_tfidf_by_groups(directory, traingroups_titledata, idf_dictionary, vector_for_tfidf):
    tfidf_dict = {} 
    words_lin = {}
    
    for key in traingroups_titledata: # by groups
        dim1 = len(traingroups_titledata[key])
        dim2 = len(vector_for_tfidf[key])
        l = np.zeros(dim1)
        matr = np.zeros((dim1, dim2))
        words = {}
        words_lin[key] = {}
        group_len = 0
        
        for i, elem in enumerate(traingroups_titledata[key]): # by tuple (doc_id, title, target)
            l[i] = elem[0]
            words_doc = set()
            all_dists = []
            count = 0
            
            with open(directory + str(elem[0]) + '.dat', "r") as file: # by matr string ~ document tf-idf
                for line in file:
                    count += 1
                    line = line[:-1]
                    if line in vector_for_tfidf[key]:
                        ind = vector_for_tfidf[key][line]
                        matr[i][ind] += 1
                        words_doc.update([line])
                        
            
            group_len += count            
            words[elem[0]] = words_doc

            if count != 0:
                matr[i] /= count

            for j, word in enumerate(vector_for_tfidf[key]):
                matr[i][j] *= idf_dictionary[key][word]
            
        for doc_id in words:
            all_dist = []
            for doc_id_second in words:
                if doc_id == doc_id_second:
                    continue
                all_dist.append(len(words[doc_id].intersection(words[doc_id_second])))
            all_dist = np.array(all_dist)
            all_dist = all_dist / group_len
            words_lin[key][doc_id] = sorted(all_dist, reverse=True)[0:20]
        
        
                    
        tfidf_dict[key] = (l, matr)
    
    return tfidf_dict, words_lin

In [214]:
from sklearn import metrics

def cos_metric(tfidf_dict, traingroups_titledata):
    ans = {}
    
    for key in tfidf_dict:
        group_sum = np.zeros(tfidf_dict[key][1].shape[1])
        for i, elem in enumerate(traingroups_titledata[key]):
            vec = tfidf_dict[key][1][np.argwhere(tfidf_dict[key][0] == elem[0])[0][0]]
            group_sum += vec
        
        cos_closest = metrics.pairwise.cosine_distances(tfidf_dict[key][1], tfidf_dict[key][1])
        
        ans[key] = {}
        for elem in tfidf_dict[key][0]:
            doc_id = elem
            ans1 = 0
            ans2 = 0
            ans3 = 0
            ans4 = 0
            ans5 = 0
            ans6 = 0
            
            vec = tfidf_dict[key][1][np.argwhere(tfidf_dict[key][0] == elem)[0][0]]
            
            ans1 = metrics.pairwise.cosine_distances([group_sum], [vec])[0][0]
                
            ans2 = np.min(cos_closest[np.nonzero(cos_closest)])
            ans3 = np.max(cos_closest[np.nonzero(cos_closest)])
            ans4 = np.mean(cos_closest[np.nonzero(cos_closest)])
            ans5 = np.median(cos_closest[np.nonzero(cos_closest)])
            
            ans[key][doc_id] = (ans1, ans2, ans3, ans4, ans5)
        
    return ans

In [215]:
def count_tfidf_by_groups_for_headers(traingroups_titledata):
    df_dict = {}
    
    for key in traingroups_titledata:
        df_dict[key] = {}
        count = 0
        for data in traingroups_titledata[key]:
            count += 1
            s = set()
            for line in data[1].split():
                s.update([line])

            for line in s:
                if len(line) <= 2:
                    continue
                elif line in df_dict[key]:
                    df_dict[key][line] += 1

                else:
                    df_dict[key][line] = 1
                            
        for elem in df_dict[key]:            
            df_dict[key][elem] = count / df_dict[key][elem]
            df_dict[key][elem] = np.log(df_dict[key][elem])
            
    print("df finished")
    
    vector_set = {}
    the_dict = {}
    
    for key in traingroups_titledata:
        the_dict[key] = {}
        for data in traingroups_titledata[key]:
            for line in data[1].split():
                if len(line) <= 2:
                    continue
                elif line in the_dict[key]:
                    the_dict[key][line] += 1
                else:
                    the_dict[key][line] = 1
                            
    for key in the_dict:
        l = []
        for elem in the_dict[key]:
            l.append((the_dict[key][elem], elem))
            
        l = list(reversed(sorted(l)))
        length = len(l)
        first = 0 #length // 100
        second = length
        
        l = l[first:second]
        ll = []
        for el in l:
            if el[0] <= 5:
                break
            ll.append(el[1])
        
        s = set(ll)
        vector_set[key] = {a: i for a, i in zip(ll, range(len(ll)))}
        
    print("vector set finished")
               
         
    tfidf_dict = {}
    
    for key in traingroups_titledata: # by groups
        dim1 = len(traingroups_titledata[key])
        dim2 = len(vector_set[key])
        l = np.zeros(dim1)
        matr = np.zeros((dim1, dim2))
        
        for i, elem in enumerate(traingroups_titledata[key]): # by tuple (doc_id, title, target)
            l[i] = elem[0]
            file = elem[1]# by matr string ~ document tf-idf
            count = 0
            for line in file:
                count += 1
                line = line[:-1]
                if line in vector_set[key]:
                    ind = vector_set[key][line]
                    matr[i][ind] += 1
                       
            if count != 0:
                matr[i] /= count

            for j, word in enumerate(vector_set[key]):
                matr[i][j] *= df_dict[key][word]
                            
        tfidf_dict[key] = (l, matr)
    
    print("tfidf finished")
    
    ans = {}
    
    for key in tfidf_dict:
        group_sum_headers = np.zeros(tfidf_dict[key][1].shape[1])
        for i, elem in enumerate(traingroups_titledata[key]):
            vec = tfidf_dict[key][1][np.argwhere(tfidf_dict[key][0] == elem[0])[0][0]]
            group_sum_headers += vec
        
        cos_closest = metrics.pairwise.cosine_distances(tfidf_dict[key][1], tfidf_dict[key][1])
        
        ans[key] = {}
        for i, elem in enumerate(tfidf_dict[key][0]):
            doc_id = elem
            ans1 = 0
            ans2 = 0
            ans3 = 0
            ans4 = 0
            ans5 = 0
            ans6 = 0
            vec = tfidf_dict[key][1][np.argwhere(tfidf_dict[key][0] == elem)[0][0]]

            ans1 = metrics.pairwise.cosine_distances([group_sum_headers], [vec])[0][0]
                
            ans2 = np.min(cos_closest[np.nonzero(cos_closest)])
            ans3 = np.max(cos_closest[np.nonzero(cos_closest)])
            ans4 = np.mean(cos_closest[np.nonzero(cos_closest)])
            ans5 = np.median(cos_closest[np.nonzero(cos_closest)])
            
            ans[key][doc_id] = (ans1, ans2, ans3, ans4, ans5)
        
    return ans

In [216]:
idf_dictionary = idf("normalized/", traingroups_titledata)
print('idf finished')

idf finished


In [217]:
vector_for_tfidf = vector_components("normalized/", traingroups_titledata)
print('vector finished')

vector finished


In [244]:
tfidf_dict, words_lin = count_tfidf_by_groups("normalized/", traingroups_titledata, idf_dictionary, vector_for_tfidf)
print('tfidf finished')

tfidf finished


In [245]:
dict1 = cos_metric(tfidf_dict, traingroups_titledata)
print('dict1 finished')

dict1 finished


In [220]:
dict1_headers = count_tfidf_by_groups_for_headers(traingroups_titledata)
print('dict2 finished')

df finished
vector set finished
tfidf finished
dict2 finished


In [248]:
train_data = pd.read_csv('train_groups.csv')
pair_id_mapping = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    pair_id = new_doc['pair_id']
    pair_id_mapping[pair_id] = (doc_group, doc_id, target)

In [249]:
N = 5
M = 5
K = 20
X_train = np.zeros((len(pair_id_mapping), N + M + K))
y_train = np.zeros(len(pair_id_mapping))
for i, key in enumerate(pair_id_mapping):
    a, b, c, d, e = dict1[pair_id_mapping[key][0]][pair_id_mapping[key][1]]
    f, g, h, j, k = dict1_headers[pair_id_mapping[key][0]][pair_id_mapping[key][1]]
    X_train[i][0:N] = [a, b, c, d, e]
    X_train[i][N:N+M] = [f, g, h, j, k]
    X_train[i][N+M:N+M+K] = words_lin[pair_id_mapping[key][0]][pair_id_mapping[key][1]]
    y_train[i] = pair_id_mapping[key][2]

In [223]:
X_train

array([[7.00821138e-01, 3.90644029e-03, 9.99558864e-01, ...,
        3.97296882e-04, 3.89800714e-04, 3.82304547e-04],
       [6.55728611e-01, 3.90644029e-03, 9.99558864e-01, ...,
        6.40922328e-04, 6.03441491e-04, 5.39724066e-04],
       [7.47949984e-01, 3.90644029e-03, 9.99558864e-01, ...,
        3.18587122e-04, 3.18587122e-04, 3.14839039e-04],
       ...,
       [7.78696003e-01, 3.94707860e-04, 9.99760166e-01, ...,
        1.17632951e-03, 1.17265118e-03, 1.15867354e-03],
       [5.27740920e-01, 3.94707860e-04, 9.99760166e-01, ...,
        5.52484969e-04, 5.51749303e-04, 5.45863977e-04],
       [8.49868818e-01, 3.94707860e-04, 9.99760166e-01, ...,
        2.30263376e-04, 2.30263376e-04, 2.26585047e-04]])

In [224]:
y_train

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
глядь

In [35]:
from sklearn.ensemble import GradientBoostingClassifier

In [225]:
idf_dictionary_test = idf("normalized/", testgroups_titledata)
print('idf finished')

idf finished


In [226]:
vector_for_tfidf_test = vector_components("normalized/", testgroups_titledata)
print('vector finished')

vector finished


In [246]:
tfidf_dict_test, words_lin_test = count_tfidf_by_groups("normalized/", testgroups_titledata, idf_dictionary_test, vector_for_tfidf_test)
print('tfidf finished')

tfidf finished


In [247]:
dict1_test = cos_metric(tfidf_dict_test, testgroups_titledata)
print('dict1 finished')

dict1 finished


In [229]:
dict1_headers_test = count_tfidf_by_groups_for_headers(testgroups_titledata)
print('dict2 finished')

df finished
vector set finished
tfidf finished
dict2 finished


In [250]:
test_data = pd.read_csv('test_groups.csv')
pair_id_mapping_test = {}
for i in range(len(test_data)):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    pair_id = new_doc['pair_id']
    pair_id_mapping_test[pair_id] = (doc_group, doc_id)

In [251]:
X_test = np.zeros((len(pair_id_mapping_test), N + M + K))
for i, key in enumerate(doc_id_test):
    a, b, c, d, e = dict1_test[pair_id_mapping_test[key][0]][pair_id_mapping_test[key][1]]
    f, g, h, j, k = dict1_headers_test[pair_id_mapping_test[key][0]][pair_id_mapping_test[key][1]]
    X_test[i][0:N] = [a, b, c, d, e]
    X_test[i][N:N+M] = [f, g, h, j, k]
    X_test[i][N+M:N+M+K] = words_lin_test[pair_id_mapping_test[key][0]][pair_id_mapping_test[key][1]]

In [252]:
X_train_final = np.hstack((X_train, X_train_lin))
y_train_final = y_train

In [253]:
X_test_final = np.hstack((X_test, X_test_lin))

In [254]:
scaler = StandardScaler()
X_train_final = scaler.fit_transform(X_train_final)
scaler = StandardScaler()
X_test_final = scaler.fit_transform(X_test_final)

In [255]:
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(X_train_final, y_train_final, test_size=0.35, shuffle=False)#, random_state=0, stratify=y_train)

In [256]:
clf = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=2)
clf.fit(X_train_val, y_train_val)
y_pred = clf.predict(X_test_val)
f1_score(y_test_val, y_pred)

0.6838046272493573

In [257]:
reg = []
for learning_rate in (0.005, 0.01, 0.1, 0.2): #, 0.3, 0.4, 0.5):
    for n_estimators in(50, 100, 200, 300, 400):
        for max_depth in (2, 3, 4):
            clf = GradientBoostingClassifier(learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth)
            clf.fit(X_train_val, y_train_val)
            y_pred = clf.predict(X_test_val)
            f1 = f1_score(y_test_val, y_pred)
            reg.append((learning_rate, n_estimators, max_depth, f1))

In [258]:
with open('reg.csv', 'w') as f:
    f.write('learning_rate,n_estimators,max_depth,f1\n')
    for tup in reg:
        f.write('{},{},{},{}\n'.format(tup[0],tup[1],tup[2],tup[3]))
    

In [240]:
clf = GradientBoostingClassifier(learning_rate=0.01, max_depth=3, n_estimators=400)
clf.fit(X_train_final, y_train_final)
y_pred = clf.predict(X_test_final).astype(int)

In [241]:
with open('submission.csv', 'w') as f:
    f.write('pair_id,target\n')
    for doc_id, y in zip(doc_id_test, y_pred):
        f.write('{},{}\n'.format(doc_id, y))
    

EOF