In [1]:
import shutil
import struct
from collections import defaultdict
from pathlib import Path

import lmdb
import numpy as np
import torch.utils.data
from tqdm import tqdm

import pickle
import os.path

In [2]:
def __get_feat_mapper(path, num_feats=31, min_threshold=4):
    """Build per-column ``value -> index`` mappings from a comma-separated file.

    The first line of the file is skipped. Every following line is split on
    ``','``; lines that do not have exactly ``num_feats + 1`` columns are
    ignored. Column 0 is never counted, columns ``1..num_feats`` are.

    Args:
        path: Path of the text file to read.
        num_feats: Number of feature columns expected after column 0.
        min_threshold: Minimum number of occurrences a value needs in its
            column to receive an index of its own.

    Returns:
        A ``(feat_mapper, defaults)`` tuple where

        * ``feat_mapper[i]`` maps each kept raw value of column ``i`` to an
          integer index; index 0 is the most frequent value.
        * ``defaults[i]`` is ``len(feat_mapper[i])``, i.e. the first index
          not assigned to any kept value (presumably used for rare or unseen
          values -- confirm against the caller).
    """
    # feat_cnts[column index][raw value] -> number of occurrences.
    feat_cnts = defaultdict(lambda: defaultdict(int))
    with open(path) as f:
        f.readline()  # skip the first line
        pbar = tqdm(f, mininterval=1, smoothing=0.1)
        pbar.set_description('Create tpmn dataset cache: counting features')
        for line in pbar:
            values = line.rstrip('\n').split(',')
            if len(values) != num_feats + 1:
                continue  # row with an unexpected number of columns
            for i in range(1, num_feats + 1):
                feat_cnts[i][values[i]] += 1

    # Rank the kept values by descending count before numbering them.
    # Collecting them in a set instead would make the assigned indices depend
    # on string hash randomisation, i.e. change from one interpreter run to
    # the next. sorted() is stable, so ties keep first-seen order.
    feat_mapper = {
        i: {
            feat: idx
            for idx, feat in enumerate(
                feat
                for feat, c in sorted(cnt.items(), key=lambda item: item[1], reverse=True)
                if c >= min_threshold
            )
        }
        for i, cnt in feat_cnts.items()
    }
    defaults = {i: len(mapping) for i, mapping in feat_mapper.items()}

    return feat_mapper, defaults

In [3]:
# Input file opened by the feature-counting code in this notebook; being a
# relative path, it is resolved against the kernel's working directory.
path = '../../dataset/tpmn_june_10_sample'

In [4]:
NUM_FEATS = 31  # number of feature columns expected after column 0

# Tally how often every raw value occurs in each feature column:
# feat_cnts[column index][raw value] -> number of occurrences.
feat_cnts = defaultdict(lambda: defaultdict(int))
with open(path) as f:
    f.readline()  # the first line is consumed without being counted
    pbar = tqdm(f, mininterval=1, smoothing=0.1)
    pbar.set_description('Create tpmn dataset cache: counting features')
    for line in pbar:
        values = line.rstrip('\n').split(',')
        # Only rows with exactly NUM_FEATS + 1 columns contribute.
        if len(values) == NUM_FEATS + 1:
            # Column 0 is skipped; feature columns are numbered from 1.
            for i, feat in enumerate(values[1:], start=1):
                feat_cnts[i][feat] += 1

Create tpmn dataset cache: counting features: : 15000000it [01:46, 140925.81it/s]


In [5]:
min_threshold = 4  # minimum occurrences for a value to get its own index

# Per column: the values seen at least `min_threshold` times, most frequent
# first. The sort is stable, so equal counts keep their first-seen order.
feat_mapper = {
    i: [feat for feat, c in sorted(cnt.items(), key=lambda kv: -kv[1]) if c >= min_threshold]
    for i, cnt in feat_cnts.items()
}

# Per column: value -> position in the ranked list above.
feat_mapper2 = {
    i: dict(zip(feats, range(len(feats))))
    for i, feats in feat_mapper.items()
}

# Per column: number of kept values, i.e. the first index not assigned above.
defaults = {i: len(index_of) for i, index_of in feat_mapper2.items()}

In [7]:
# Display the per-column value -> index mapping (index 0 = most frequent value).
feat_mapper2

{1: {'320x50': 0,
  'native': 1,
  '320x480': 2,
  '300x250': 3,
  '320x100': 4,
  '728x90': 5,
  '480x320': 6,
  '768x1024': 7,
  '300x50': 8,
  '160x600': 9,
  '300x600': 10},
 2: {'com.cashwalk.cashwalk': 0,
  'com.timespread.Timetable2': 1,
  'com.funple.app.zzal.reward': 2,
  'kr.co.angtalk.one': 3,
  'com.easybrain.sudoku.android': 4,
  'com.lottemembers.android': 5,
  'com.avatye.aos.haru': 6,
  'me.timecash': 7,
  'com.andr.evine.who': 8,
  'com.dydbook.bible': 9,
  'com.easybrain.block.puzzle.games': 10,
  'com.firstscreen.weather': 11,
  'com.freeapp.androidapp': 12,
  'devian.tubemate.v3': 13,
  'com.pixel.art.coloring.color.number': 14,
  'com.BitofGame.MiniGolfRetro': 15,
  'com.designkeyboard.keyboard': 16,
  'com.yourtube.cash': 17,
  'com.interpark.notitome': 18,
  'com.demo.haru.music': 19,
  'net.cashpop.plus': 20,
  'com.wafour.wapiceng': 21,
  'com.wzdworks.themekeyboard': 22,
  'net.digsso': 23,
  'com.ntouch.game.gostop2': 24,
  'com.europosit.pixelcoloring': 25,


In [25]:
# Display feat_mapper.
# NOTE(review): the saved output below is a dict of dicts (value -> index),
# while the cell that assigns feat_mapper binds it to a dict of lists, and the
# execution counts are out of order -- this output appears to come from a
# different kernel state; confirm with Restart & Run All.
feat_mapper

{1: {'160x600': 0,
  '300x250': 1,
  '300x50': 2,
  '320x100': 3,
  '728x90': 4,
  '768x1024': 5,
  '480x320': 6,
  '320x480': 7,
  'native': 8,
  '320x50': 9,
  '300x600': 10},
 2: {'com.merpati.psikotes': 0,
  'cinta.indonesia.al.quran.digital': 1,
  'com.test.inteligencia': 2,
  '337346574': 3,
  'com.gpakorea.anyenglish2': 4,
  'com.pregnancycalc.pregnancy': 5,
  'com.soludens.movieview': 6,
  'com.cootek.smartinputv5.skin.keyboard_theme_bright_purple_dynamic_stars_keyboard': 7,
  'com.eapps.a14th': 8,
  '336435697': 9,
  'com.duitcair.pinjamancepatsyariah': 10,
  'appsforall.videotomp3': 11,
  'com.andromo.dev844351.app974799': 12,
  'com.atulyapps.profilevisitor': 13,
  'com.andromo.dev747430.app975459': 14,
  'handasoft.mobile.fortune.tradition': 15,
  'com.mobadu.BlockyFarmRacing': 16,
  'com.valas2020.kurs': 17,
  'com.cootek.smartinputv5.skin.keyboard_theme_neon_sweet_love_keyboard': 18,
  'com.doctor.robot.rescue.mission': 19,
  'zone.snakewormguide': 20,
  'com.cootek.smart

In [19]:
# Display feat_mapper.
# NOTE(review): this cell repeats the display made under In [25] and its saved
# output is likewise a dict of dicts, which does not match the dict of lists
# assigned to feat_mapper in this notebook -- likely stale output; confirm
# with Restart & Run All.
feat_mapper

{1: {'160x600': 0,
  '300x250': 1,
  '300x50': 2,
  '320x100': 3,
  '728x90': 4,
  '768x1024': 5,
  '480x320': 6,
  '320x480': 7,
  'native': 8,
  '320x50': 9,
  '300x600': 10},
 2: {'com.merpati.psikotes': 0,
  'cinta.indonesia.al.quran.digital': 1,
  'com.test.inteligencia': 2,
  '337346574': 3,
  'com.gpakorea.anyenglish2': 4,
  'com.pregnancycalc.pregnancy': 5,
  'com.soludens.movieview': 6,
  'com.cootek.smartinputv5.skin.keyboard_theme_bright_purple_dynamic_stars_keyboard': 7,
  'com.eapps.a14th': 8,
  '336435697': 9,
  'com.duitcair.pinjamancepatsyariah': 10,
  'appsforall.videotomp3': 11,
  'com.andromo.dev844351.app974799': 12,
  'com.atulyapps.profilevisitor': 13,
  'com.andromo.dev747430.app975459': 14,
  'handasoft.mobile.fortune.tradition': 15,
  'com.mobadu.BlockyFarmRacing': 16,
  'com.valas2020.kurs': 17,
  'com.cootek.smartinputv5.skin.keyboard_theme_neon_sweet_love_keyboard': 18,
  'com.doctor.robot.rescue.mission': 19,
  'zone.snakewormguide': 20,
  'com.cootek.smart