In [None]:
! pip install pyod

In [8]:
import pandas as pd
import numpy as np
from pyod.models.copod import COPOD
from sklearn.metrics import f1_score, pairwise_distances

In [3]:
def ngrams(text):
  """Split a text into lowercase words, word halves and adjacent-word pairs.

  Returns a tuple ``(words, halves, pairs)``:
    * words  - the lowercased text split on whitespace;
    * halves - for every word whose half-length (``len(word) // 2``) is at
      least 2, its leading part followed by its trailing part;
    * pairs  - every two neighbouring words concatenated without a separator.
  """
  words = text.lower().split()

  # add word pairs
  pairs = [left + right for left, right in zip(words, words[1:])]

  # add word halves (words whose half would be shorter than 2 chars are skipped)
  halves = []
  for token in words:
    mid = len(token) // 2
    if mid >= 2:
      halves.extend((token[:mid], token[mid:]))

  return words, halves, pairs


def pairwise_similarity(text1, text2, w=1, w_half=4, w_pair=4):
  """Score the overlap between two texts using words, word halves and word pairs.

  Both texts are decomposed with ``ngrams``. The score is the weighted count
  of distinct shared items, scaled by 200 and divided by the total number of
  words in both texts, then truncated to an int.

  Args:
    text1, text2: the texts to compare.
    w: weight of each shared whole word.
    w_half: weight of each shared word half.
    w_pair: weight of each shared adjacent-word pair.
      (The original author annotated the default weights with "0.67597" --
      presumably a score obtained with them; verify before relying on it.)

  Returns:
    ``0.`` if either text has no words (nothing can be shared, and this also
    guards the division below); otherwise an int score.
  """
  words1, halves1, pairs1 = ngrams(text1)
  words2, halves2, pairs2 = ngrams(text2)

  # The original guard tested `len(text1) == 0` twice; the second operand was
  # meant to be text2. An empty side always yields a zero score, so
  # short-circuit when either side is empty.
  if not words1 or not words2:
    return 0.

  shared_words = len(set(words1) & set(words2))
  shared_halves = len(set(halves1) & set(halves2))
  shared_pairs = len(set(pairs1) & set(pairs2))

  weighted = w_half * shared_halves + w_pair * shared_pairs + w * shared_words
  return int(200 * weighted / (len(words1) + len(words2)))


In [5]:
# Build a doc_id -> title lookup from the tab-separated titles file.
# NOTE(review): the file is opened with the platform default encoding;
# consider passing an explicit encoding once the file's encoding is confirmed.
doc_to_title = {}
with open('docs_titles.tsv') as f:
    next(f, None)  # skip the header line (no-op on an empty file)
    for line in f:
        # Everything after the first tab is the title; a line without a tab
        # (after stripping) has an empty title.
        raw_id, _, title = line.strip().partition('\t')
        doc_to_title[int(raw_id)] = title
len(doc_to_title)

28026

In [6]:
# Group the training rows by group_id: each group maps to a list of
# (doc_id, title, target) tuples in the order the rows appear in the CSV.
train_data = pd.read_csv('train_groups.csv')
traingroups_titledata = {}
# Walk the three columns in lockstep instead of materialising a Series per
# row with .iloc[i], which is needlessly slow for a plain row scan.
for doc_group, doc_id, target in zip(
        train_data['group_id'], train_data['doc_id'], train_data['target']):
    traingroups_titledata.setdefault(doc_group, []).append(
        (doc_id, doc_to_title[doc_id], target))

In [7]:
# Group the test rows by group_id: each group maps to a list of
# (doc_id, title) tuples in the order the rows appear in the CSV.
test_data = pd.read_csv('test_groups.csv')
testgroups_titledata = {}
# Walk the two columns in lockstep instead of materialising a Series per
# row with .iloc[i], which is needlessly slow for a plain row scan.
for doc_group, doc_id in zip(test_data['group_id'], test_data['doc_id']):
    testgroups_titledata.setdefault(doc_group, []).append(
        (doc_id, doc_to_title[doc_id]))

In [15]:
# Number of top similarity scores kept as features for each document.
N = 25

y_train = []
X_train = []
groups_train = []

# For every document, the feature vector is its N highest title-similarity
# scores against the other documents of the same group, in descending order.
for new_group, docs in traingroups_titledata.items():
    for k, (doc_id, title, target_id) in enumerate(docs):
        y_train.append(target_id)
        # groups_train was displayed below but never defined in the notebook
        # (NameError on a fresh kernel); record the group of each row here.
        groups_train.append(new_group)

        all_dist = [
            pairwise_similarity(title, title_j)
            for j, (_, title_j, _) in enumerate(docs)
            if j != k
        ]

        top = sorted(all_dist, reverse=True)[:N]
        # Groups with fewer than N + 1 documents would produce ragged rows
        # that cannot be stacked; pad missing neighbours with zero similarity.
        top.extend([0] * (N - len(top)))
        X_train.append(top)

X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)
X_train.shape, y_train.shape, groups_train.shape

((11690, 25), (11690,), (11690,))

In [16]:
X_test_id = []
X_test = []

# Same features as for training: for every test document, its N highest
# title-similarity scores against the other documents of its group.
for new_group, docs in testgroups_titledata.items():
    for k, (doc_id, title) in enumerate(docs):
        all_dist = [
            pairwise_similarity(title, title_j)
            for j, (_, title_j) in enumerate(docs)
            if j != k
        ]

        top = sorted(all_dist, reverse=True)[:N]
        # Groups with fewer than N + 1 documents would produce ragged rows
        # that cannot be stacked; pad missing neighbours with zero similarity.
        top.extend([0] * (N - len(top)))
        X_test.append(top)
        X_test_id.append(doc_id)

X_test = np.array(X_test)
X_test_id = np.array(X_test_id)
X_test.shape

(16627, 25)


In [17]:
# COPOD outlier detector; `contamination` is passed through to pyod
# (see the pyod documentation for how it sets the decision threshold).
cpd = COPOD(contamination=0.25)

# Fit only on the rows labelled 0, then label every training row.
cpd.fit(X_train[y_train == 0])
ans_cpd = cpd.predict(X_train)

# f1_score's signature is (y_true, y_pred); the original call passed them in
# the opposite order. Binary F1 is symmetric under that swap, so the value is
# unchanged, but the call now matches the documented API.
# Note: this score is computed on the training data itself.
f1_score(y_train, ans_cpd)

0.679080265199605

In [18]:
# Predict a label for every row of the test feature matrix with the fitted detector.
test_ans = cpd.predict(X_test)
# Display both shapes: there should be exactly one prediction per collected test doc_id.
test_ans.shape, X_test_id.shape

((16627,), (16627,))

In [19]:
# Map each test doc_id to its predicted label.
# The original loop hard-coded the test-set size (16627) and would break or
# silently truncate on any other dataset; derive the pairs from the arrays.
assert len(X_test_id) == len(test_ans), "one prediction per test doc_id expected"
d = dict(zip(X_test_id, test_ans))

In [20]:
# Predictions re-ordered to match the row order of test_data.
# Iterates the doc_id column directly (no per-row .iloc) and avoids binding
# the name `id`, which shadowed the builtin for the rest of the notebook.
# A doc_id missing from `d` still raises KeyError, as before.
test_target = [d[doc_id] for doc_id in test_data['doc_id']]

In [21]:
# Attach the predictions and keep only the two columns written to the
# submission file. The name `test_data` is rebound on purpose: the export
# cell writes `test_data` to disk.
test_data = test_data.assign(target=test_target)[['pair_id', 'target']]

In [22]:
# Write the (pair_id, target) frame to CSV without the DataFrame index column.
test_data.to_csv('cpd_new.csv', index=False)