In [37]:
import pandas as pd
import numpy as np
import os
import dotenv
import gc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import stop_words
from scipy import sparse

# --- Notebook configuration -------------------------------------------------
# Dimensionality of the LSA (truncated SVD) projection.
LSA_SIZE = 32
# Dataset splits to process; used to build the '<split>_prep.snappy.parquet' names.
types = ['train', 'test']
# Free-text columns that get vectorized.
cols = ['title', 'description']

# Load environment variables (GCP_BUCKET is read from os.environ below) from the local .env file.
dotenv.load_dotenv('.env')

In [2]:
for key in types:
  path = './data/%s_prep.snappy.parquet'%key
  !gsutil cp gs://{os.environ['GCP_BUCKET']}/{key}_prep.snappy.parquet {path}

Copying gs://kaggle-195720-avito-demand-prediction/train_prep.snappy.parquet...
/ [1 files][465.7 MiB/465.7 MiB]                                                
Operation completed over 1 objects/465.7 MiB.                                    
Copying gs://kaggle-195720-avito-demand-prediction/test_prep.snappy.parquet...
\ [1 files][163.8 MiB/163.8 MiB]                                                
Operation completed over 1 objects/163.8 MiB.                                    


In [3]:
# Build and fit the LSA pipeline: TF-IDF over uni-/bigrams (capped at 20000 terms)
# followed by a truncated SVD down to LSA_SIZE components.
# NOTE(review): the pipeline is fit on the train `description` column only, yet it is
# applied to every column in `cols` (including `title`) below -- confirm this is intended.
lsa = Pipeline([
  ('tfidf', TfidfVectorizer(max_features=20000, ngram_range=(1,2))),
  # TruncatedSVD defaults to a randomized solver; pin the seed so that a
  # Restart & Run All reproduces the same components and exported features.
  ('compress', TruncatedSVD(n_components=LSA_SIZE, random_state=0)),
])
# Only the description column is read to keep memory usage down; missing text becomes 'N/A'.
train = pd.read_parquet('./data/train_prep.snappy.parquet', columns=['description']).fillna('N/A')
lsa.fit(train.description)
# Release the raw text before the transform cells load data again.
del train
gc.collect()

7

In [4]:
for col in cols:
  for t in types:
    df = pd.read_parquet('./data/%s_prep.snappy.parquet'%t, columns=[col]).fillna('N/A')
    df = pd.DataFrame(lsa.transform(df[col]), columns=['%s_lsa_%d'%(col,i) for i in range(LSA_SIZE)])
    df.to_parquet('./data/%s_%s_lsa.snappy.parquet'%(t,col))
    !gsutil cp ./data/{t}_{col}_lsa.snappy.parquet gs://{os.environ['GCP_BUCKET']}/{t}_{col}_lsa.snappy.parquet
    del df
    gc.collect()

Copying file://./data/train_title_lsa.snappy.parquet [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/286.1 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

|
Operation completed over 1 objects/286.1 MiB.                                    
Copying file://./data/test_title_lsa.snappy.parquet

In [31]:
# TODO: build tfidf vectorizer with supp data, pick min_df, max_df by examining distribution
# Fit a standalone TF-IDF vocabulary (uni- and bigrams, at most 20000 terms) on the
# train descriptions; min_df / max_df prune very rare and near-ubiquitous terms.
vectorizer = TfidfVectorizer(
  ngram_range=(1,2),
  max_features=20000,
  min_df=300,
  max_df=0.95,
)
train_text = pd.read_parquet('./data/train_prep.snappy.parquet', columns=['description']).fillna('N/A')
vectorizer.fit(train_text['description'])

# Report how many terms survived the pruning.
print(len(vectorizer.get_feature_names()))
# Drop the raw text before the transform cell reloads the data.
del train_text
gc.collect()

7

In [38]:
for col in cols:
  for t in types:
    df = pd.read_parquet('./data/%s_prep.snappy.parquet'%t, columns=[col]).fillna('N/A')
    path = 'data/%s_%s_tfidf.npz'%(t,col)
    sparse.save_npz(path, vectorizer.transform(df[col]))
    !gsutil cp {path} gs://{os.environ['GCP_BUCKET']}/{path}
    del df
    gc.collect()

Copying file://data/train_title_tfidf.npz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/33.8 MiB.                                     
Copying file://data/test_title_tfidf.npz [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/11.3 MiB.                                     
Copying file://data/train_description_tfidf.npz [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, co

In [40]:
import pickle

# Persist the fitted vectorizer's feature names next to the saved .npz matrices.
# The context manager guarantees the file is flushed and closed; the previous
# bare open() left the handle to be closed whenever the garbage collector got to it.
with open('./data/tfidf_feature_names.pl', 'wb') as fh:
  pickle.dump(vectorizer.get_feature_names(), fh)

In [41]:
# List everything in the local data directory (long format, human-readable sizes,
# newest first) to check which artifacts were produced.
!ls -laht data

total 1.9G
-rw-r--r-- 1 root root 530K Jun 23 04:12 tfidf_feature_names.pl
drwxr-xr-x 2 root root 4.0K Jun 23 04:12 .
-rw-r--r-- 1 root root 107M Jun 23 03:47 test_description_tfidf.npz
-rw-r--r-- 1 root root 296M Jun 23 03:46 train_description_tfidf.npz
-rw-r--r-- 1 root root  12M Jun 23 03:44 test_title_tfidf.npz
-rw-r--r-- 1 root root  34M Jun 23 03:44 train_title_tfidf.npz
-rw-r--r-- 1 root root 134M Jun 23 02:54 test_description_lsa.snappy.parquet
-rw-r--r-- 1 root root 359M Jun 23 02:53 train_description_lsa.snappy.parquet
-rw-r--r-- 1 root root  84M Jun 23 02:52 test_title_lsa.snappy.parquet
-rw-r--r-- 1 root root 287M Jun 23 02:51 train_title_lsa.snappy.parquet
-rw-r--r-- 1 root root 164M Jun 23 02:47 test_prep.snappy.parquet
-rw-r--r-- 1 root root 466M Jun 23 02:47 train_prep.snappy.parquet
drwxr-xr-x 1 root root 4.0K Jun 23 02:47 ..


In [42]:
# Synchronize the local data directory into the data/ prefix of the bucket named
# by the GCP_BUCKET environment variable.
!gsutil rsync data gs://{os.environ['GCP_BUCKET']}/data

Building synchronization state...
Starting synchronization...
Copying file://data/test_description_lsa.snappy.parquet [Content-Type=application/octet-stream]...
Computing MD5 for file://data/test_description_tfidf.npz...
Copying mtime from src to dst for gs://kaggle-195720-avito-demand-prediction/data/test_description_tfidf.npz
Copying file://data/test_prep.snappy.parquet [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing ch