In [125]:
import json
from glob import glob
from pathlib import Path
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from nltk import jaccard_distance
from pymystem3.mystem import Mystem
import numpy as np
from tqdm.notebook import tqdm

In [126]:
def save_jsonl(json_list, output_file_path):
    """Write an iterable of JSON-serialisable records to a JSON Lines file.

    Each record is serialised with ``json.dumps(..., ensure_ascii=False)`` and
    written as one line of UTF-8 text. The target file is opened in ``'w'``
    mode, so existing content is replaced.

    Args:
        json_list: Iterable of objects accepted by ``json.dumps``.
        output_file_path: Destination path passed to ``open``.
    """
    # Lazily serialise one record per line; nothing is built up in memory.
    serialized_rows = (
        json.dumps(record, ensure_ascii=False) + '\n' for record in json_list
    )
    with open(output_file_path, 'w', encoding="utf-8") as sink:
        sink.writelines(serialized_rows)

def read_jsonl(read_file_path):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        read_file_path: Path of a UTF-8 encoded file holding one JSON
            document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(read_file_path, encoding="utf-8") as f:
        # Blank lines (e.g. a trailing empty line) carry no record; passing
        # them to json.loads would raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]

In [127]:
# Base directory for the notebook, relative to the current working directory.
# The input glob and the CSV export path below are both derived from it.
work_path = Path('.')

In [128]:
# Collect every ``data/<subdir>/*.jsonl`` file as a list of path strings.
# glob() returns matches in arbitrary, OS-dependent order; sorting makes the
# concatenation order - and therefore which copy drop_duplicates keeps -
# reproducible between runs.
all_df_path = sorted(glob(str(work_path / 'data' / '*/*.jsonl')))

In [129]:
# Load each JSONL file into its own frame, stack them, then drop rows whose
# URL repeats and rows whose article body repeats, and renumber the index.
source_frames = [pd.DataFrame(read_jsonl(_path)) for _path in all_df_path]
total_df = (
    pd.concat(source_frames)
    .drop_duplicates(subset=['url'])
    .drop_duplicates(subset=['content'])
    .reset_index(drop=True)
)

In [130]:
# Count of non-null ids per topic, before the topic labels are merged.
topic_counts_raw = total_df.groupby('topic').agg({'id': 'count'})
topic_counts_raw

Unnamed: 0_level_0,id
topic,Unnamed: 1_level_1
Бывший СССР,634
Забота о себе,4
Наука и техника,2706
Общество,625
Общество/Россия,5361
Силовые структуры,5443
Спорт,103
Туризм,121
Туризм/Путешествия,2892
Экономика,8026


In [131]:
# Fold small or overlapping topic labels into the label they should share.
# Keys are the labels to retire; values are the labels that replace them.
topic_aliases = {
    'Общество': 'Общество/Россия',
    'Забота о себе': 'Спорт',
    'Туризм': 'Туризм/Путешествия',
}
for old_label, new_label in topic_aliases.items():
    total_df.loc[total_df['topic'] == old_label, 'topic'] = new_label

In [132]:
# Count of non-null ids per topic, using the current topic labels.
topic_counts_merged = total_df.groupby('topic').agg({'id': 'count'})
topic_counts_merged

Unnamed: 0_level_0,id
topic,Unnamed: 1_level_1
Бывший СССР,634
Наука и техника,2706
Общество/Россия,5986
Силовые структуры,5443
Спорт,107
Туризм/Путешествия,3013
Экономика,8026


In [133]:
# Per-topic id counts divided by the total number of rows, i.e. each topic's
# share of the corpus (the column keeps the name 'id').
topic_counts = total_df.groupby('topic').agg({'id': 'count'})
topic_share = topic_counts / len(total_df)
topic_share

Unnamed: 0_level_0,id
topic,Unnamed: 1_level_1
Бывший СССР,0.024465
Наука и техника,0.104418
Общество/Россия,0.230986
Силовые структуры,0.210033
Спорт,0.004129
Туризм/Путешествия,0.116265
Экономика,0.309705


In [134]:
# Row count of the de-duplicated frame.
n_articles = len(total_df)
n_articles

25915

In [145]:
# Development cap: keep only the first N_SAMPLES rows so that lemmatisation
# and the pairwise duplicate search below finish quickly.
# NOTE(review): every later cell, including the exported CSV, is built from
# this subset - set N_SAMPLES to None to process the whole corpus.
N_SAMPLES = 1001  # same rows as the former inclusive label slice .loc[:1000]
total_df = total_df if N_SAMPLES is None else total_df.iloc[:N_SAMPLES]

In [146]:
# One Mystem instance, created once and reused for every article in the
# lemmatisation loop below.
lemmatizer = Mystem()

In [147]:
def list_jaccard_score(target, texts):
    """Return the Jaccard distance between ``target`` and each item of ``texts``.

    Despite the function name, the values are distances as computed by
    ``jaccard_distance`` on the *sets* of tokens: 0.0 for identical sets,
    1.0 for sets with no common element.

    Args:
        target: Iterable of hashable tokens for the reference text.
        texts: Iterable of token iterables to compare against ``target``.

    Returns:
        list: One distance per element of ``texts``, in the same order.
    """
    # Build the reference set once instead of once per comparison.
    target_tokens = set(target)
    distances = []
    for text in texts:
        text_tokens = set(text)
        if not target_tokens and not text_tokens:
            # jaccard_distance divides by the size of the union, so two empty
            # token sets would raise ZeroDivisionError; treat them as identical.
            distances.append(0.0)
        else:
            distances.append(jaccard_distance(target_tokens, text_tokens))
    return distances

In [148]:
def jaccard_duplicate(seq_of_lem_text, threshold=0.15):
    """Flag texts that are near-duplicates of an earlier text in the sequence.

    The sequence is scanned front to back. Each text that has not itself been
    flagged is compared (via ``list_jaccard_score``) with every later text,
    and later texts whose distance is at most ``threshold`` are flagged.
    Texts already flagged are skipped and never act as a reference.

    Args:
        seq_of_lem_text: Sequence (supports ``len`` and slicing) of token
            collections, one per text.
        threshold: Largest Jaccard distance still treated as a duplicate.
            Defaults to 0.15.

    Returns:
        numpy.ndarray: Boolean array of length ``len(seq_of_lem_text)``;
        ``True`` marks a text flagged as a duplicate of an earlier one.
    """
    n_texts = len(seq_of_lem_text)
    # Explicit bool dtype: building the mask from an empty list would
    # otherwise produce a float64 array.
    is_duplicate = np.zeros(n_texts, dtype=bool)
    iterator = tqdm(enumerate(seq_of_lem_text), total=n_texts)
    for n, lem_text in iterator:
        if is_duplicate[n]:
            # Already matched to an earlier text; it does not seed new matches.
            continue
        jacc_scores = np.asarray(
            list_jaccard_score(lem_text, seq_of_lem_text[n + 1:]), dtype=float
        )
        # In-place OR keeps flags that earlier reference texts already set.
        is_duplicate[n + 1:] |= jacc_scores <= threshold
    return is_duplicate
        

In [149]:
# Run the lemmatizer over every article body; lem_texts[i] is the result of
# lemmatizer.lemmatize for the i-th entry of the content column.
texts = total_df['content']
lem_texts = [lemmatizer.lemmatize(text) for text in tqdm(texts, total=len(texts))]

  0%|          | 0/1001 [00:00<?, ?it/s]

In [150]:
# Mask aligned with lem_texts: True marks a text that jaccard_duplicate
# matched to an earlier, unflagged text.
duplicated = jaccard_duplicate(lem_texts)

  0%|          | 0/1001 [00:00<?, ?it/s]

In [158]:
# Export the rows that were not flagged as duplicates to
# <parent of work_path>/train/full_data.csv.
output_path = work_path.absolute().parent.joinpath('train/full_data.csv')
# to_csv does not create missing directories, so make sure the target exists.
output_path.parent.mkdir(parents=True, exist_ok=True)
# logical_not instead of `== False`: same element-wise result, clearer intent.
keep_mask = np.logical_not(duplicated)
total_df[keep_mask].to_csv(output_path, index=False)