In [2]:
import json
from glob import glob
from pathlib import Path
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from nltk import jaccard_distance
from pymystem3.mystem import Mystem
import numpy as np
from tqdm.notebook import tqdm

In [3]:
def save_jsonl(json_list, output_file_path):
    """Write the items of ``json_list`` to a JSON Lines file.

    The file at ``output_file_path`` is opened in write mode (created or
    truncated) with UTF-8 encoding. Every item is serialised with
    ``json.dumps(..., ensure_ascii=False)`` and written on its own line,
    so non-ASCII text is stored as-is.
    """
    with open(output_file_path, mode='w', encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in json_list
        )

def read_jsonl(read_file_path):
    """Load a JSON Lines file and return its records as a list.

    The file at ``read_file_path`` is read as UTF-8 and every non-blank
    line is decoded with ``json.loads``. Blank or whitespace-only lines
    (for example a trailing empty line) are skipped instead of raising
    ``json.JSONDecodeError``.

    Returns
    -------
    list
        One decoded object per non-blank line, in file order.
    """
    with open(read_file_path, encoding="utf-8") as f:
        # json.loads("") / json.loads("\n") raise, so filter empty lines out first.
        return [json.loads(line) for line in f if line.strip()]

In [4]:
# Base directory for all relative paths used in this notebook (the current working directory).
work_path = Path(".")

In [5]:
# Collect every *.jsonl file located one directory level below data/.
# glob() returns matches in arbitrary order; sorting keeps the file order, and
# therefore the "first occurrence wins" de-duplication done when the frames
# are concatenated, reproducible between runs and machines.
all_df_path = sorted(glob(str(work_path / 'data' / '*' / '*.jsonl')))

In [6]:
# Load every JSONL file into one frame, then drop rows that repeat a URL and,
# after that, rows that repeat the article text (the first occurrence is kept).
total_df = (
    pd.concat([pd.DataFrame(read_jsonl(jsonl_path)) for jsonl_path in all_df_path])
    .drop_duplicates(subset=['url'])
    .drop_duplicates(subset=['content'])
    .reset_index(drop=True)
)

In [7]:
# Number of articles (non-null ids) per topic before any label clean-up.
total_df.groupby('topic')[['id']].count()

Unnamed: 0_level_0,id
topic,Unnamed: 1_level_1
Бывший СССР,634
Забота о себе,4
Наука и техника,2711
Общество,625
Общество/Россия,5346
Силовые структуры,5438
Спорт,103
Туризм,121
Туризм/Путешествия,2907
Экономика,8026


In [8]:
# Fold the small / overlapping topic labels into their broader counterparts.
total_df['topic'] = total_df['topic'].replace({
    'Общество': 'Общество/Россия',
    'Забота о себе': 'Спорт',
    'Туризм': 'Туризм/Путешествия',
})

In [9]:
# Number of articles per topic after merging the topic labels.
total_df.groupby('topic')[['id']].count()

Unnamed: 0_level_0,id
topic,Unnamed: 1_level_1
Бывший СССР,634
Наука и техника,2711
Общество/Россия,5971
Силовые структуры,5438
Спорт,107
Туризм/Путешествия,3028
Экономика,8026


In [10]:
# Share of the whole data set that each topic accounts for.
total_df.groupby('topic')[['id']].count() / total_df.shape[0]

Unnamed: 0_level_0,id
topic,Unnamed: 1_level_1
Бывший СССР,0.024465
Наука и техника,0.104611
Общество/Россия,0.230407
Силовые структуры,0.20984
Спорт,0.004129
Туризм/Путешествия,0.116844
Экономика,0.309705


In [11]:
# Total number of articles after URL/content de-duplication.
len(total_df.index)

25915

In [17]:
# Distribution of article length measured in whitespace-separated words.
(
    total_df['content']
    .map(lambda text: len(text.split()))
    .describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])
)

count    25915.000000
mean       251.835346
std        277.684801
min          0.000000
5%          74.000000
25%        125.000000
50%        182.000000
75%        270.000000
95%        696.300000
max       8099.000000
Name: content, dtype: float64

In [18]:
# Keep only articles whose word count lies strictly between 74 and 696 —
# the 5% and 95% values reported by describe() above.
content_size = total_df['content'].map(lambda text: len(text.split()))
total_df = total_df.loc[content_size.gt(74) & content_size.lt(696)].reset_index(drop=True)


In [24]:
# Number of articles per topic after the length filter.
total_df.groupby('topic')[['id']].count()

Unnamed: 0_level_0,id
topic,Unnamed: 1_level_1
Бывший СССР,621
Наука и техника,2186
Общество/Россия,5484
Силовые структуры,4509
Спорт,105
Туризм/Путешествия,2876
Экономика,7528


In [23]:
# Share of the length-filtered data set that each topic accounts for.
total_df.groupby('topic')[['id']].count() / total_df.shape[0]

Unnamed: 0_level_0,id
topic,Unnamed: 1_level_1
Бывший СССР,0.026642
Наука и техника,0.093784
Общество/Россия,0.235274
Силовые структуры,0.193445
Спорт,0.004505
Туризм/Путешествия,0.123386
Экономика,0.322965


In [25]:
# Single Mystem (pymystem3) instance; its lemmatize() method is applied to every article text below.
lemmatizer = Mystem()

In [26]:
def list_jaccard_score(target, texts):
    """Return the Jaccard distance from ``target`` to every item of ``texts``.

    Parameters
    ----------
    target : iterable of hashable
        Tokens of the reference text; converted to a set.
    texts : iterable of iterables of hashable
        Token collections to compare against; each is converted to a set.

    Returns
    -------
    list
        One ``nltk.jaccard_distance`` value per item of ``texts``, in order.
    """
    # The reference set never changes, so build it once instead of per comparison.
    target_set = set(target)
    return [jaccard_distance(target_set, set(other)) for other in texts]

In [27]:
def jaccard_duplicate(seq_of_lem_text, threshold=0.15):
    """Flag texts that are near-duplicates of an earlier text in the sequence.

    Every text is reduced to the set of its tokens. Going through the texts
    in order, each text that has not been flagged yet is compared (by
    ``nltk.jaccard_distance``) with all texts after it, and every later text
    whose distance is ``<= threshold`` gets flagged. Texts that are already
    flagged are not used as a reference for further comparisons.

    Parameters
    ----------
    seq_of_lem_text : sequence of iterables of hashable
        Tokenised (lemmatised) texts; must support ``len()`` and iteration.
    threshold : float, default 0.15
        Largest Jaccard distance at which two texts are still treated as
        duplicates. The default keeps the previously hard-coded value.

    Returns
    -------
    numpy.ndarray of bool
        ``True`` at position ``i`` when text ``i`` duplicates an earlier,
        non-flagged text. Always boolean, including for empty input.
    """
    n_texts = len(seq_of_lem_text)
    # np.zeros(..., dtype=bool) stays boolean even when n_texts == 0,
    # unlike np.array([False] * 0), which yields a float64 array.
    is_duplicate = np.zeros(n_texts, dtype=bool)
    # Build every token set once rather than once per pairwise comparison.
    token_sets = [set(tokens) for tokens in seq_of_lem_text]
    for n, current in tqdm(enumerate(token_sets), total=n_texts):
        if is_duplicate[n]:
            continue
        distances = np.array(
            [jaccard_distance(current, other) for other in token_sets[n + 1:]]
        )
        # Logical OR keeps flags set by earlier reference texts.
        is_duplicate[n + 1:] |= distances <= threshold
    return is_duplicate
        

In [28]:
# Lemmatize every article; each entry of lem_texts is the lemmatize() result for one text.
texts = total_df.content
lem_texts = [
    lemmatizer.lemmatize(text)
    for text in tqdm(texts, total=len(texts))
]

  0%|          | 0/23309 [00:00<?, ?it/s]

In [29]:
# Boolean mask over the lemmatized texts: True marks a near-duplicate of an earlier text.
duplicated = jaccard_duplicate(seq_of_lem_text=lem_texts)

  0%|          | 0/23309 [00:00<?, ?it/s]

In [30]:
# Write the rows that were not flagged as near-duplicates to
# <parent of the working directory>/train/full_data.csv.
output_path = work_path.absolute().parent / 'train' / 'full_data.csv'
# to_csv does not create missing directories, so make sure the target one exists.
output_path.parent.mkdir(parents=True, exist_ok=True)
total_df[np.logical_not(duplicated)].to_csv(output_path, index=False)