In [3]:
!pip install --user sympy



In [4]:
#!/usr/bin/env python3

import pandas as pd
import numpy as np
import torch
import sympy
import os
import logging
from tqdm import tqdm
tqdm.pandas(miniters=100000, mininterval=60, maxinterval=600)

logging.basicConfig(level=logging.INFO)

  from .autonotebook import tqdm as notebook_tqdm


In [133]:
def get_primes(lower_bound=100, upper_bound=1000, step=10000) :
    for i in range(lower_bound,upper_bound) :
        yield list(sympy.sieve.primerange(i*step,(i+1)*step)) #500000, 750000))
#     yield sympy.sieve.primerange(500000, 750000)

In [65]:
def index_to_int(index):
    # index = [str(i) if i in str_nb else str(ord(i) - 97) for i in index]
    # return int(''.join(index))
    if type(index) == list:
        return [int(i) for i in index]
    else :
        return int(index)

In [66]:
def get_factor(index):
    assert primes != []
    factors = []
    for prime in primes:
        if index % prime == 0:
            factors.append(prime)
    for f in factors:
        primes.remove(f)
    return factors

In [67]:
def reduce_index(index, smallest_prime):
    if type(index) == int :
        return index % smallest_prime
    elif type(index) == list :
        return [i%smallest_prime for i in index]

In [5]:
ids = []
for i in tqdm(range(5)):
    logging.info(f"Load ../../../data-set_pre_processed/fold-{i}/corpus_train.json")
    ids += pd.read_json(f"../../../data-set_pre_processed/fold-{i}/corpus_train.json",
                        dtype={'id': str})["id"].to_list()

  0%|          | 0/5 [00:00<?, ?it/s]INFO:root:Load ../../../data-set_pre_processed/fold-0/corpus_train.json
 20%|██        | 1/5 [00:05<00:23,  5.83s/it]INFO:root:Load ../../../data-set_pre_processed/fold-1/corpus_train.json
 40%|████      | 2/5 [00:11<00:16,  5.63s/it]INFO:root:Load ../../../data-set_pre_processed/fold-2/corpus_train.json
 60%|██████    | 3/5 [00:16<00:11,  5.54s/it]INFO:root:Load ../../../data-set_pre_processed/fold-3/corpus_train.json
 80%|████████  | 4/5 [00:22<00:05,  5.49s/it]INFO:root:Load ../../../data-set_pre_processed/fold-4/corpus_train.json
100%|██████████| 5/5 [00:27<00:00,  5.52s/it]


In [6]:
ids = [int(i) for i in ids]
logging.info(f"len(ids) = {len(ids)}")

INFO:root:len(ids) = 5154505


In [11]:
for i in tqdm(ids) :
    if i % 2147483647 == 0 :
        print(i)
smallest_prime = 2147483647

100%|██████████| 5154505/5154505 [00:01<00:00, 2929116.64it/s]


In [134]:
# def main():
smallest_prime=None
if smallest_prime is None:
    primes_gen = get_primes()
    smallest_prime = -1
    j = 0
    for primes in primes_gen :
        logging.info(f"primes : {primes[0]} ---> {primes[-1]}")
        if j < 0 :
            break
        j += 1
        for prime in tqdm(primes) :
            remainder = ids % prime
            if any(remainder == 0) :
                continue
            else :
                smallest_prime = int(prime)
                logging.info(f"smallest_prime : {smallest_prime}")
                j = -1
                break
        if j < 0 :
            break

if smallest_prime == -1 :
    exit(1)

INFO:root:primes : 1000003 ---> 1009997
  1%|          | 8/753 [00:04<07:13,  1.72it/s]INFO:root:smallest_prime : 1000133
  1%|          | 8/753 [00:05<08:10,  1.52it/s]


In [14]:
files_name = []
for suffix_name in [
                  "corpus_train.json",
                  "articles_train.json",
                  "sections_train.json"
                  ] :
    files_name += [f"../../../data-set_pre_processed/fold-{i}/{suffix_name}" for i in range(5)]

In [15]:
for file_name in files_name :
    logging.info(f"Processing {file_name}...")
    df = pd.read_json(file_name)
    if "corpus" in file_name :
        # logging.info(r"sanity check : all id in df are in ids (list)")
        # assert np.array_equal(df["id"].apply(int).to_numpy(), ids[:len(df)])
        df["new_id"] = df["id"].apply(lambda x : int(x) % smallest_prime)
        logging.info(r"sanity check : all id % smallest_prime != 0")
        assert all(df["new_id"] != 0)
    else :
        df["new_id"] = df["id"].apply(lambda x_list : [int(x) % smallest_prime for x in x_list])
        logging.info(r"sanity check : all id % smallest_prime != 0")
        assert all(df["new_id"].apply(lambda x_list : all([x != 0 for x in x_list])))
    logging.info(f"Sanity check passed.")
    logging.info(f"save file as {file_name}.new.json.")
#     df.to_json(file_name+".new.json", indent=True)
    logging.info("saved.\n\n")
logging.info(f"DONE.\n\n")

INFO:root:Processing ../../../data-set_pre_processed/fold-0/articles_train.json...
INFO:root:sanity check : all id % smallest_prime != 0
INFO:root:Sanity check passed.
INFO:root:save file as ../../../data-set_pre_processed/fold-0/articles_train.json.new.json.
INFO:root:saved.


INFO:root:Processing ../../../data-set_pre_processed/fold-1/articles_train.json...
INFO:root:sanity check : all id % smallest_prime != 0
INFO:root:Sanity check passed.
INFO:root:save file as ../../../data-set_pre_processed/fold-1/articles_train.json.new.json.
INFO:root:saved.


INFO:root:Processing ../../../data-set_pre_processed/fold-2/articles_train.json...
INFO:root:sanity check : all id % smallest_prime != 0
INFO:root:Sanity check passed.
INFO:root:save file as ../../../data-set_pre_processed/fold-2/articles_train.json.new.json.
INFO:root:saved.


INFO:root:Processing ../../../data-set_pre_processed/fold-3/articles_train.json...
INFO:root:sanity check : all id % smallest_prime != 0
INFO:root:Sanity check pas

In [197]:
df["id"].apply(int)[0]

False

In [205]:
str(ids[0]

SyntaxError: unexpected EOF while parsing (<ipython-input-205-4684b922c7ce>, line 1)