In [1]:
import os
import gc
import time
import json
import pickle
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import csr_matrix
from contextlib import contextmanager
import sys
sys.path.append('../../../code/utils')
from perf_utils import get_memory_str, get_memory_bytes, format_memory_diff, format_secs

In [2]:
# Directory that feature_path() resolves the per-feature raw data files against.
FEATURE_DIR = '../../../data/split/preliminary_contest_data/byUserFeatureName/'
# Directory that vocab_path() resolves the pickled vocabulary files against.
VOCAB_DIR = '../../../data/vocabulary/preliminary_contest_data/'


def feature_path(feat_name):
    """Return the path of the raw data file for the user feature `feat_name`.

    The file name follows the pattern
    ``userFeature.[featureName='<feat_name>'].data`` and is joined onto the
    module-level ``FEATURE_DIR``.
    """
    template = "userFeature.[featureName='{}'].data"
    return os.path.join(FEATURE_DIR, template.format(feat_name))


def vocab_path(feat_name="all"):
    """Return the path of a pickled vocabulary file under ``VOCAB_DIR``.

    With the default ``"all"`` the combined file ``userFeature.pkl`` is
    addressed; any other value selects the per-feature file
    ``userFeature.[featureName='<feat_name>'].pkl``.
    """
    filename = (
        "userFeature.pkl"
        if feat_name == "all"
        else "userFeature.[featureName='{}'].pkl".format(feat_name)
    )
    return os.path.join(VOCAB_DIR, filename)

In [3]:
def load_pickle(filepath):
    """Read the pickle file at `filepath` and return the deserialized object.

    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(filepath, "rb") as handle:
        return pickle.load(handle)


def save_as_pickle(obj, filepath):
    """Pickle `obj` into the file at `filepath`, replacing any existing content."""
    with open(filepath, "wb") as handle:
        pickle.Pickler(handle).dump(obj)

In [4]:
def load_feature(feat_name, **kw):
    """Load the raw data file of one user feature into a DataFrame.

    The file located by ``feature_path(feat_name)`` is parsed with
    ``pd.read_csv``. Unless overridden through `kw`, fields are separated by
    ``'|'`` and the `feat_name` column is read as ``str``. All remaining
    keyword arguments are forwarded to ``pd.read_csv`` unchanged.
    """
    read_options = {'sep': '|', 'dtype': {feat_name: str}}
    read_options.update(kw)  # caller-supplied options win over the defaults
    return pd.read_csv(feature_path(feat_name), **read_options)

def load_vocab(feat_name='all'):
    """Return the unpickled vocabulary stored at ``vocab_path(feat_name)``."""
    return load_pickle(vocab_path(feat_name))

In [5]:
def get_time_str():
    """Return the current UTC time of day formatted as ``HH:MM:SS``."""
    utc_now = time.gmtime()
    return time.strftime("%H:%M:%S", utc_now)

In [6]:
@contextmanager
def profiler(task_name, verbose_memory=True, verbose_time=True):
    """Context manager that prints a one-line report once its block finishes.

    The report has the form ``[HH:MM:SS] Finish <task_name>.`` optionally
    followed by the change in ``get_memory_bytes()`` across the block
    (`verbose_memory`) and the elapsed wall-clock time (`verbose_time`).

    Nothing is printed when the block raises, because the code after the
    ``yield`` is not guarded by ``try``/``finally``.
    """
    started_at = time.time()
    memory_before = get_memory_bytes()
    yield
    elapsed = time.time() - started_at
    memory_change = get_memory_bytes() - memory_before

    parts = ["[{}] Finish {}.".format(get_time_str(), task_name)]
    if verbose_memory:
        parts.append(" △M: {}.".format(format_memory_diff(memory_change)))
    if verbose_time:
        parts.append(" △T: {}.".format(format_secs(elapsed)))
    print("".join(parts))

In [7]:
def count_vectorize(series, vocab):
    """Build a sparse token-count matrix for whitespace-separated documents.

    A specialised, faster stand-in for ``CountVectorizer`` with a fixed
    vocabulary. It skips all validation and assumes every token of every
    document is present in `vocab`.

    Parameters
    ----------
    series : pandas.Series (or any iterable) of str
        One document per element; tokens are separated by whitespace.
        Missing values must be filled beforehand (``split`` is called on
        every element).
    vocab : sequence of str
        Vocabulary; the position of a word is its column index.

    Returns
    -------
    cnt_vec : scipy.sparse.csr_matrix, dtype int8
        Matrix of shape ``(len(series), len(vocab))``. Every token occurrence
        is stored as an explicit entry of 1; a token repeated inside one
        document is kept as duplicate entries (call ``sum_duplicates()`` on
        the result if canonical form is required).
    vocab_map : dict
        Mapping word -> column index used to build `cnt_vec`.

    Raises
    ------
    KeyError
        If a document contains a token that is not in `vocab`.
    """
    # Word -> column index.
    vocab_map = {val: i for i, val in enumerate(vocab)}

    # One list of column indices per document, in document order.
    rows = [[vocab_map[val] for val in doc.split()] for doc in series]

    # CSR row pointer: indptr[k] is the offset of row k inside `indices`,
    # so it starts at 0 and ends at the total number of tokens. Built as an
    # integer array (the previous float64 pointer relied on implicit casting).
    indptr = np.zeros(len(rows) + 1, dtype=np.int64)
    if rows:
        np.cumsum([len(row) for row in rows], out=indptr[1:])

    # Flatten all rows into the CSR column-index array.
    indices = np.fromiter(itertools.chain.from_iterable(rows),
                          dtype=np.int64, count=int(indptr[-1]))
    data = np.ones(len(indices), dtype=np.int8)  # every stored entry is a single occurrence

    # Pass the shape explicitly: otherwise the column count is inferred from
    # the largest index seen and shrinks whenever trailing vocabulary entries
    # never occur, so the matrix would no longer line up with `vocab`.
    cnt_vec = csr_matrix((data, indices, indptr),
                         shape=(len(rows), len(vocab)),
                         dtype=np.int8)

    # The mapping is returned intact (it used to be cleared right before the
    # return, handing callers an empty dict). Local intermediates are released
    # automatically when the function returns.
    return cnt_vec, vocab_map

In [8]:
# Output directory for the pickled count matrices; created if missing.
cnt_dir = '../../../data/nlp_count/preliminary_contest_data/byUserFeatureName/'
os.makedirs(cnt_dir, exist_ok=True)

# Output directory for the pickled TF-IDF matrices; created if missing.
tfidf_dir = '../../../data/nlp_tfidf/preliminary_contest_data/byUserFeatureName/'
os.makedirs(tfidf_dir, exist_ok=True)

In [9]:
def tokenizer(string):
    """Split `string` on runs of whitespace and return the list of tokens.

    Passed to the sklearn vectorizers so that they tokenize by whitespace
    instead of using their default token pattern.
    """
    tokens = string.split()
    return tokens


# Names of the user features to vectorize; each one has its own raw data file
# (see feature_path) and its own pickled vocabulary (see vocab_path).
feat_names = ["age", "gender", "marriageStatus", "education", "consumptionAbility", "LBS",
              "interest1", "interest2", "interest3", "interest4", "interest5",
              "kw1", "kw2", "kw3", "topic1", "topic2", "topic3", "appIdInstall",
              "appIdAction", "ct", "os", "carrier", "house"]

# For every feature: load it, build the count matrix twice (sklearn and the
# hand-rolled count_vectorize) to cross-check them, persist the sklearn count
# matrix, then build and persist the TF-IDF matrix. Large objects are deleted
# inside the profiled blocks so that the reported memory deltas include the release.
print("Memory usage at this moment: {}".format(get_memory_str()))
for i, feat_name in enumerate(feat_names):
    print("-" * 80)
    print("Processing '{}'...".format(feat_name))
    with profiler("loading and preprocessing"):
        df = load_feature(feat_name)  # pd.DataFrame; the feature column is read as str
        vocab = load_vocab(feat_name)  # list of vocabulary words
        docs = df[feat_name]  # pd.Series of whitespace-separated tokens
        if docs.isnull().sum() > 0:
            # Missing values get a dedicated placeholder token that is also added
            # to the vocabulary. Keep it lowercase: CountVectorizer lowercases
            # documents by default, so a token such as "[NaN]" would not match.
            fill_value = "[nan]"
            vocab += [fill_value]
            docs = docs.fillna(fill_value)
        
    with profiler("count vectorizing (ngram=1) [sklearn]"):
        # Reference implementation: fixed vocabulary, whitespace tokenizer, int8 counts.
        cnt_vectorizer = CountVectorizer(vocabulary=vocab, 
                                         tokenizer=tokenizer,
                                         dtype=np.int8)
        cnt_vec_sk = cnt_vectorizer.fit_transform(docs)
        checksum_sk = cnt_vec_sk.getnnz()  # number of stored entries, used for the cross-check
        
    with profiler("count vectorizing (ngram=1) [Janzen's]"):
        # Hand-rolled alternative (see count_vectorize). Author's notes: it is about
        # 60% faster than sklearn but only applicable to this data layout; it does not
        # use multiprocessing or Cython yet, and 4-way multiprocessing is expected to at
        # least double the speed. The memory deltas reported for this step inside
        # Jupyter were considered unreliable by the author.
        cnt_vec_jz, cnt_dict = count_vectorize(docs, vocab)
        checksum_jz = cnt_vec_jz.getnnz()
        
    # Sanity check: both implementations must store the same number of entries.
    assert checksum_jz == checksum_sk
    
    with profiler("saving count vectors", verbose_time=False):
        cnt_file = "userFeature.[featureName='{}'].pkl".format(feat_name)
        cnt_path = os.path.join(cnt_dir, cnt_file)
        # Alternative (disabled): persist the hand-rolled result instead.
        # save_as_pickle((cnt_dict, cnt_vec_jz), cnt_path)
        save_as_pickle((cnt_vectorizer.vocabulary_, cnt_vec_sk), cnt_path) # save mapping as well for further analysis
        if i == 0:
            # The uid column is written once only, from the first feature's file.
            # NOTE(review): this assumes every feature file lists users in the same order - confirm.
            uid_file = "uid.pkl"
            uid_path = os.path.join(cnt_dir, uid_file)
            save_as_pickle(df['uid'].values, uid_path)  # save uid for further analysis
        # Release the count matrices before the TF-IDF step allocates its own.
        del cnt_vec_jz
        del cnt_vec_sk
        del cnt_dict
        del cnt_vectorizer
        gc.collect()

    with profiler("TFIDF vectorizing (ngram=1) transformation"):
        # Same vocabulary and tokenizer as the count step, float32 output.
        tfidf_vectorizer = TfidfVectorizer(vocabulary=vocab, 
                                           tokenizer=tokenizer,
                                           dtype=np.float32)
        tfidf_vec = tfidf_vectorizer.fit_transform(docs)
        

    with profiler("saving TFIDF vectors", verbose_time=False):
        tfidf_file = "userFeature.[featureName='{}'].pkl".format(feat_name)
        tfidf_path = os.path.join(tfidf_dir, tfidf_file)
        save_as_pickle((tfidf_vectorizer.vocabulary_, tfidf_vectorizer.idf_, tfidf_vec), tfidf_path)  # save mapping and idf as well
        if i == 0:
            # Same uid array as above, duplicated next to the TF-IDF files.
            uid_file = "uid.pkl"
            uid_path = os.path.join(tfidf_dir, uid_file)
            save_as_pickle(df['uid'].values, uid_path)  # save uid for further analysis
        del tfidf_vec
        del tfidf_vectorizer
        gc.collect()

    with profiler("cleaning", verbose_time=False):
        # Drop the per-feature inputs so the next iteration starts from a clean slate.
        del [[docs, df]]
        del vocab
        gc.collect()

Memory usage at this moment: 105.88MB
--------------------------------------------------------------------------------
Processing 'age'...
[16:27:58] Finish loading and preprocessing. △M: +148.58MB. △T: 2.2 seconds.
[16:28:22] Finish count vectorizing (ngram=1) [sklearn]. △M: +83.95MB. △T: 24.8 seconds.
[16:28:35] Finish count vectorizing (ngram=1) [Janzen's]. △M: +157.99MB. △T: 12.5 seconds.
[16:28:35] Finish saving count vectors. △M: -240.17MB.
[16:29:00] Finish TFIDF vectorizing (ngram=1) transformation. △M: +148.05MB. △T: 24.9 seconds.
[16:29:01] Finish saving TFIDF vectors. △M: -147.8MB.
[16:29:01] Finish cleaning. △M: -147.81MB.
--------------------------------------------------------------------------------
Processing 'gender'...
[16:29:03] Finish loading and preprocessing. △M: +146.83MB. △T: 2.0 seconds.
[16:29:28] Finish count vectorizing (ngram=1) [sklearn]. △M: +83.16MB. △T: 24.6 seconds.
[16:29:40] Finish count vectorizing (ngram=1) [Janzen's]. △M: +158.05MB. △T: 12.4 secon

[16:55:04] Finish count vectorizing (ngram=1) [Janzen's]. △M: +172.94MB. △T: 16.4 seconds.
[16:55:05] Finish saving count vectors. △M: -267.18MB.
[16:55:36] Finish TFIDF vectorizing (ngram=1) transformation. △M: +171.58MB. △T: 31.4 seconds.
[16:55:37] Finish saving TFIDF vectors. △M: -171.57MB.
[16:55:37] Finish cleaning. △M: -250.97MB.
--------------------------------------------------------------------------------
Processing 'topic1'...
[16:55:47] Finish loading and preprocessing. △M: +878.09MB. △T: 9.9 seconds.
[16:56:35] Finish count vectorizing (ngram=1) [sklearn]. △M: +254.57MB. △T: 48.6 seconds.
[16:57:06] Finish count vectorizing (ngram=1) [Janzen's]. △M: +463.91MB. △T: 30.5 seconds.
[16:57:07] Finish saving count vectors. △M: -716.78MB.
[16:57:53] Finish TFIDF vectorizing (ngram=1) transformation. △M: +555.59MB. △T: 46.2 seconds.
[16:57:55] Finish saving TFIDF vectors. △M: -555.46MB.
[16:57:55] Finish cleaning. △M: -876.47MB.
---------------------------------------------------

In [10]:
# Force a final garbage collection, then report the memory usage after all
# features have been processed (compare with the figure printed before the loop).
gc.collect()
print("Memory usage at this moment: {}".format(get_memory_str()))

Memory usage at this moment: 81.35MB
