In [1]:
%load_ext autoreload

In [2]:
import sys

In [3]:
%autoreload

import json
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import tqdm

  from tqdm.autonotebook import tqdm, trange
2024-09-30 05:15:33.132357: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-30 05:15:33.135967: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-30 05:15:33.174296: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-30 05:15:33.174317: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-30 05:15:33.174339: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable t

In [4]:
# Sentence-embedding model; author_pair_similarity reads it as the module-level global `model`.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [5]:
def author_pair_similarity(documents_ids, documents):
    """Compute the cosine similarity of every unordered pair of documents.

    The texts are embedded with the module-level ``model`` and compared with
    ``cosine_similarity``; only pairs whose second document comes later in the
    input than the first are reported, so each pair appears exactly once.

    Args:
        documents_ids: Identifiers of the documents, aligned with ``documents``.
        documents: Texts to embed, one per identifier.

    Returns:
        dict mapping ``(id_a, id_b)`` to the similarity of the two documents,
        where ``id_a`` precedes ``id_b`` in ``documents_ids``.
    """
    embeddings = model.encode(documents)
    sim_matrix = cosine_similarity(embeddings, dense_output=True)
    # Upper triangle only: skip the diagonal and the mirrored lower half.
    return {
        (first_id, second_id): sim_matrix[row, col]
        for row, first_id in enumerate(documents_ids)
        for col, second_id in enumerate(documents_ids)
        if col > row
    }

In [6]:
def extract_split_similarity_info(path, split, author_clm="authorIDs", max_sim=0.2, max_sample_size=10000, random_state=None):
    """Filter one split down to authors whose documents are all mutually dissimilar.

    Reads every ``<split>*.jsonl`` file directly under ``path`` (skipping files
    whose name contains ``filtered``), groups the concatenated documents by
    author, and scores every pair of documents of each author with
    ``author_pair_similarity``. An author is kept only if they have more than
    one document and their highest pairwise similarity is strictly below
    ``max_sim``.

    Side effects:
        * For each input file ``X.jsonl`` a sibling ``X_filtered.jsonl`` is
          written with the rows of kept authors, down-sampled to at most
          ``max_sample_size`` rows.
        * ``<split>_info.json`` is written under ``path`` with the per-author
          similarity table.
        * An ``authorID`` column is added to each loaded frame.

    Args:
        path: Directory holding the split's ``.jsonl`` files.
        split: File-name prefix of the split (e.g. ``'dev'``).
        author_clm: Column holding a sequence of author IDs; its first element
            is used as the document's author.
        max_sim: Exclusive upper bound on an author's maximum pairwise
            document similarity.
        max_sample_size: Maximum number of rows written per filtered file.
        random_state: Forwarded to ``DataFrame.sample`` so the down-sampling
            can be made reproducible; ``None`` keeps it unseeded.

    Returns:
        DataFrame with columns ``authorID``, ``pairwise_sims`` and ``max_sim``
        for the kept authors.

    Raises:
        ValueError: If no input file matches the split under ``path``.
    """
    root = Path(path)
    # Use the `path` argument (not a notebook global) and test only the file
    # name, so a parent directory containing "filtered" cannot hide the inputs.
    df_paths = sorted(p for p in root.glob("{}*.jsonl".format(split)) if 'filtered' not in p.name)
    print(df_paths)
    if not df_paths:
        raise ValueError("No '{}*.jsonl' files found under {}".format(split, root))

    dfs = {p: pd.read_json(p, lines=True) for p in df_paths}
    for frame in dfs.values():
        frame['authorID'] = frame[author_clm].apply(lambda x: x[0])

    all_docs = pd.concat(list(dfs.values()))

    gdf = all_docs.groupby('authorID').agg({'documentID': list, 'fullText': list}).reset_index()
    # Copy so the column assignments below act on an independent frame
    # rather than on a slice of the grouped result.
    gdf = gdf[gdf.documentID.str.len() > 1].copy()

    pairwise_sims = [
        author_pair_similarity(doc_ids, texts)
        for doc_ids, texts in tqdm.tqdm(zip(gdf['documentID'], gdf['fullText']), total=len(gdf))
    ]
    gdf['pairwise_sims'] = pairwise_sims
    gdf['max_sim'] = gdf['pairwise_sims'].apply(lambda sims: max(sims.values()))

    # Honour the caller-supplied threshold instead of a hard-coded constant.
    info = gdf.loc[gdf['max_sim'] < max_sim, ['authorID', 'pairwise_sims', 'max_sim']]
    filtered_authors = info['authorID'].tolist()

    for p, frame in dfs.items():
        kept = frame[frame.authorID.isin(filtered_authors)]
        print('{} --> {}'.format(len(frame), len(kept)))
        if len(kept) > max_sample_size:
            kept = kept.sample(max_sample_size, random_state=random_state)
        # Only the file name is rewritten; directory components stay untouched.
        out_path = p.with_name(p.stem + '_filtered.jsonl')
        kept.to_json(out_path, orient='records', lines=True)

    info.to_json(root / (split + '_info.json'))
    return info

In [23]:
# Filter the gmane splits.
# Alternative ds_path: '/mnt/swordfish-pool2/milad/hiatus-data/performers-data/tmp-data/'
ds_path = '/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/gmane/'
for split_name, sample_cap in (('dev', 5000), ('test', 5000), ('train', 10000)):
    # df_info ends up holding the table of the last split processed ('train').
    df_info = extract_split_similarity_info(ds_path, split_name, max_sample_size=sample_cap)

[PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/gmane/dev_candidates.jsonl'), PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/gmane/dev_queries.jsonl')]


33538it [03:01, 184.52it/s]


33538 --> 13582
33538 --> 13582
[PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/gmane/test_candidates.jsonl'), PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/gmane/test_queries.jsonl')]


65102it [05:52, 184.68it/s]


65102 --> 26435
65103 --> 26435
[PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/gmane/train_candidates.jsonl'), PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/gmane/train_queries.jsonl')]


558885it [50:15, 185.34it/s]


558926 --> 225898
558918 --> 225898


In [25]:
# Filter the bookcorpus splits.
# Alternative ds_path: '/mnt/swordfish-pool2/milad/hiatus-data/performers-data/tmp-data/'
ds_path = '/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/bookcorpus/'
for split_name, sample_cap in (('dev', 5000), ('test', 5000), ('train', 10000)):
    # df_info ends up holding the table of the last split processed ('train').
    df_info = extract_split_similarity_info(ds_path, split_name, max_sample_size=sample_cap)

[PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/bookcorpus/dev_candidates.jsonl'), PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/bookcorpus/dev_queries.jsonl')]


3379it [00:31, 107.96it/s]


3379 --> 437
3379 --> 437
[PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/bookcorpus/test_candidates.jsonl'), PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/bookcorpus/test_queries.jsonl')]


6560it [00:53, 122.42it/s]


6560 --> 891
6560 --> 891
[PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/bookcorpus/train_candidates.jsonl'), PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/bookcorpus/train_queries.jsonl')]


56319it [07:40, 122.18it/s]


56319 --> 7469
56319 --> 7469


In [26]:
# Filter the realnews splits.
# Alternative ds_path: '/mnt/swordfish-pool2/milad/hiatus-data/performers-data/tmp-data/'
ds_path = '/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/realnews'
for split_name, sample_cap in (('dev', 5000), ('test', 5000), ('train', 10000)):
    # df_info ends up holding the table of the last split processed ('train').
    df_info = extract_split_similarity_info(ds_path, split_name, max_sample_size=sample_cap)

[PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/realnews/dev_candidates.jsonl'), PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/realnews/dev_queries.jsonl')]


10861it [01:45, 103.29it/s]


10861 --> 2847
10861 --> 2847
[PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/realnews/test_candidates.jsonl'), PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/realnews/test_queries.jsonl')]


21085it [03:25, 102.75it/s]


21085 --> 5476
21085 --> 5476
[PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/realnews/train_candidates.jsonl'), PosixPath('/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/realnews/train_queries.jsonl')]


181024it [29:26, 102.49it/s]


181024 --> 46617
181024 --> 46617


In [None]:
# Filter the ao3 splits using the function's default sample cap.
# Alternative ds_path: '/mnt/swordfish-pool2/milad/hiatus-data/performers-data/tmp-data/'
ds_path = '/burg/old_dsi/users/ma4608/hiatus_performers_data/sadiri/ao3/'
for split_name in ('dev', 'test', 'train'):
    # df_info ends up holding the table of the last split processed ('train').
    df_info = extract_split_similarity_info(ds_path, split_name)