In [1]:
import os
import gc
import time
import json
import pickle
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import csr_matrix
from contextlib import contextmanager
import sys
sys.path.append('../code/utils')
sys.path.append('../code/pipeline')
sys.path.append('../code')
import data_utils as du
import perf_utils as pu
import config

In [None]:
# Directory containing one data file per user feature; joined with the file
# name in `feature_path`.
FEATURE_DIR = '../data/split/preliminary_contest_data/byUserFeatureName/'
# Directory containing the pickled vocabularies; joined with the file name in
# `vocab_path`.
VOCAB_DIR = '../data/vocabulary/preliminary_contest_data/'


def feature_path(feat_name):
    """Return the path of the data file that stores a single user feature.

    Parameters
    ----------
    feat_name : str
        Name of the user feature; it is substituted into the file name.

    Returns
    -------
    str
        ``FEATURE_DIR`` joined with
        ``userFeature.[featureName='<feat_name>'].data``.
    """
    template = "userFeature.[featureName='{}'].data"
    return os.path.join(FEATURE_DIR, template.format(feat_name))


def vocab_path(feat_name="all"):
    """Return the path of a pickled vocabulary file under ``VOCAB_DIR``.

    Parameters
    ----------
    feat_name : str, default "all"
        Feature whose vocabulary is wanted. The special value ``"all"``
        selects the combined file ``userFeature.pkl``.

    Returns
    -------
    str
        Path to ``userFeature.pkl`` or to
        ``userFeature.[featureName='<feat_name>'].pkl``.
    """
    # The combined vocabulary has a fixed name; handle it first.
    if feat_name == "all":
        return os.path.join(VOCAB_DIR, "userFeature.pkl")
    return os.path.join(
        VOCAB_DIR,
        "userFeature.[featureName='{}'].pkl".format(feat_name),
    )


def load_feature(feat_name, **kw):
    """Read the data file of one user feature into a DataFrame.

    Parameters
    ----------
    feat_name : str
        Feature to load; its file is located with ``feature_path``.
    **kw
        Extra keyword arguments forwarded to ``pd.read_csv``. ``sep``
        defaults to ``'|'`` and ``dtype`` defaults to reading the
        ``feat_name`` column as ``str``; both can be overridden.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` returns for the feature file.
    """
    # Fill in the defaults only where the caller did not supply a value.
    kw.setdefault('sep', '|')
    kw.setdefault('dtype', {feat_name: str})
    return pd.read_csv(feature_path(feat_name), **kw)


def load_vocab(feat_name='all'):
    """Load a pickled vocabulary with ``du.load_pickle``.

    Parameters
    ----------
    feat_name : str, default 'all'
        Feature whose vocabulary to load; resolved to a file by
        ``vocab_path``.

    Returns
    -------
    object
        The unpickled content of the vocabulary file.
    """
    return du.load_pickle(vocab_path(feat_name))

In [None]:
def tokenizer(string):
    """Split ``string`` on runs of whitespace and return the tokens as a list.

    Used as the ``tokenizer`` callable of the sklearn vectorizers so that
    tokens are taken verbatim instead of being extracted by a regex.
    """
    tokens = string.split()
    return tokens


# Output directories: one for raw count matrices, one for TF-IDF matrices.
cnt_dir = '../data/nlp_count/preliminary_contest_data/byUserFeatureName/'
tfidf_dir = '../data/nlp_tfidf/preliminary_contest_data/byUserFeatureName/'
os.makedirs(cnt_dir, exist_ok=True)
os.makedirs(tfidf_dir, exist_ok=True)

# For every user feature: load its column and vocabulary, vectorize the column
# against that fixed vocabulary (counts first, then TF-IDF) and pickle the
# results. Each stage is wrapped in `pu.profiler`, which produces the
# "Finish ..." log lines.
for i, feat_name in enumerate(config.USER_FEAT_NAMES):
    print("-" * 80)
    print("Processing '{}'...".format(feat_name))
    with pu.profiler("loading and preprocessing"):
        df = load_feature(feat_name)  # pd.DataFrame
        vocab = load_vocab(feat_name)  # list
        docs = df[feat_name]  # pd.Series
        if docs.isnull().any():
            # Missing rows get a dedicated placeholder token so they end up in
            # a column of their own. Keep the token lowercase: the vectorizers
            # below lowercase every document by default, so "[NaN]" would
            # never match its vocabulary entry.
            fill_value = "[nan]"
            # sklearn raises "Duplicate term in vocabulary" for a list that
            # contains the same term twice, so append only when absent.
            if fill_value not in vocab:
                vocab += [fill_value]
            docs = docs.fillna(fill_value)

    with pu.profiler("count vectorizing (ngram=1) [sklearn]"):
        # token_pattern=None: the regex is ignored when a custom tokenizer is
        # supplied; passing None avoids sklearn's unused-parameter warning.
        # NOTE(review): int8 counts wrap above 127 occurrences of a token in
        # one row; presumably tokens repeat rarely per user -- confirm.
        cnt_vectorizer = CountVectorizer(vocabulary=vocab, tokenizer=tokenizer,
                                         token_pattern=None, dtype=np.int8)
        cnt_vec_sk = cnt_vectorizer.fit_transform(docs)

    with pu.profiler("saving count vectors"):
        cnt_file = "userFeature.[featureName='{}'].pkl".format(feat_name)
        cnt_path = os.path.join(cnt_dir, cnt_file)
        # Save the token -> column mapping as well for further analysis.
        du.save_pickle((cnt_vectorizer.vocabulary_, cnt_vec_sk), cnt_path)
        if i == 0:
            # The uid column is written once, from the first feature only.
            # NOTE(review): this assumes every feature file lists the users in
            # the same order -- confirm against the splitting step.
            uid_file = "uid.pkl"
            uid_path = os.path.join(cnt_dir, uid_file)
            du.save_pickle(df['uid'].values, uid_path)  # save uid for further analysis
        # Release the count matrix before the TF-IDF pass allocates its own.
        del cnt_vec_sk
        del cnt_vectorizer
        gc.collect()

    with pu.profiler("TFIDF vectorizing (ngram=1) transformation"):
        tfidf_vectorizer = TfidfVectorizer(vocabulary=vocab, tokenizer=tokenizer,
                                           token_pattern=None, dtype=np.float32)
        tfidf_vec = tfidf_vectorizer.fit_transform(docs)

    with pu.profiler("saving TFIDF vectors"):
        tfidf_file = "userFeature.[featureName='{}'].pkl".format(feat_name)
        tfidf_path = os.path.join(tfidf_dir, tfidf_file)
        # Save the token -> column mapping and the fitted idf weights as well.
        du.save_pickle((tfidf_vectorizer.vocabulary_, tfidf_vectorizer.idf_, tfidf_vec), tfidf_path)
        if i == 0:
            uid_file = "uid.pkl"
            uid_path = os.path.join(tfidf_dir, uid_file)
            du.save_pickle(df['uid'].values, uid_path)  # save uid for further analysis
        del tfidf_vec
        del tfidf_vectorizer
        gc.collect()

    with pu.profiler("cleaning"):
        # Drop the per-feature inputs so memory is returned before the next
        # (possibly much larger) feature is loaded.
        del docs, df
        del vocab
        gc.collect()

--------------------------------------------------------------------------------
Processing 'age'...
[09:38:10] Finish loading and preprocessing. △M: +175.23MB. △T: 2.4 seconds.
[09:38:34] Finish count vectorizing (ngram=1) [sklearn]. △M: +97.93MB. △T: 23.3 seconds.
[09:38:34] Finish saving count vectors. △M: -71.11MB. △T: 0.2 seconds.
[09:38:58] Finish TFIDF vectorizing (ngram=1) transformation. △M: +147.6MB. △T: 24.1 seconds.
[09:38:58] Finish saving TFIDF vectors. △M: -158.35MB. △T: 0.2 seconds.
[09:38:58] Finish cleaning. △M: -174.25MB. △T: 0.1 seconds.
--------------------------------------------------------------------------------
Processing 'gender'...
[09:39:01] Finish loading and preprocessing. △M: +158.39MB. △T: 2.3 seconds.
[09:39:32] Finish count vectorizing (ngram=1) [sklearn]. △M: +114.29MB. △T: 31.4 seconds.
[09:39:32] Finish saving count vectors. △M: -87.13MB. △T: 0.1 seconds.
[09:40:14] Finish TFIDF vectorizing (ngram=1) transformation. △M: +147.1MB. △T: 41.7 seconds.


[11:25:46] Finish saving count vectors. △M: -298.11MB. △T: 0.4 seconds.
[11:26:51] Finish TFIDF vectorizing (ngram=1) transformation. △M: +655.18MB. △T: 1.1 minutes.
[11:26:52] Finish saving TFIDF vectors. △M: -654.94MB. △T: 0.7 seconds.
[11:26:52] Finish cleaning. △M: -1.01GB. △T: 0.5 seconds.
--------------------------------------------------------------------------------
Processing 'topic2'...
[11:27:03] Finish loading and preprocessing. △M: +833.25MB. △T: 10.9 seconds.
[11:28:09] Finish count vectorizing (ngram=1) [sklearn]. △M: +318.19MB. △T: 1.1 minutes.
[11:28:09] Finish saving count vectors. △M: -306.87MB. △T: 0.4 seconds.
[11:29:21] Finish TFIDF vectorizing (ngram=1) transformation. △M: +691.6MB. △T: 1.2 minutes.
[11:29:21] Finish saving TFIDF vectors. △M: -676.51MB. △T: 0.7 seconds.
[11:29:22] Finish cleaning. △M: -846.39MB. △T: 0.4 seconds.
--------------------------------------------------------------------------------
Processing 'topic3'...
[11:29:25] Finish loading and pr