In [1]:
# Fetch the zipped train and test CSVs into ./data. No competition is named on the
# command line; the recorded output shows the CLI resolved it to avito-demand-prediction
# (presumably from the local kaggle configuration — confirm before running elsewhere).
!kaggle competitions download -f train.csv.zip --path ./data
!kaggle competitions download -f test.csv.zip --path ./data

Using competition: avito-demand-prediction
train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
Using competition: avito-demand-prediction
test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [2]:
import pandas as pd
import numpy as np
import os
import dotenv
from tqdm import tqdm 

# Read the local .env file via python-dotenv; the call returned True in the recorded run.
# NOTE(review): presumably this is what provides GCP_BUCKET, which the upload cell reads
# from os.environ — confirm.
dotenv.load_dotenv('.env')

True

In [3]:
# Names of the dataset splits; the later cells loop over this list.
types = ['train', 'test']

# Read each zipped CSV from ./data into a DataFrame keyed by its split name.
dfs = {}
for split in types:
  dfs[split] = pd.read_csv('./data/%s.csv.zip'%split)

In [5]:
# Peek at the image value stored on the row labelled 0 of the training split.
dfs['train'].loc[0, 'image']

'd10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c7679f17c333c959b19'

In [5]:
def deal_cat(deal_probability):
  """Bucket a deal probability into one of four ordinal classes.

  Returns 0 when the value equals zero, 1 when it is below 0.6,
  2 when it is below 0.9 and 3 for everything else.
  """
  if deal_probability == 0.:
    return 0
  # Upper bounds are tried in ascending order; the first one the value
  # falls under decides the class.
  for bucket, upper_bound in ((1, 0.6), (2, 0.9)):
    if deal_probability < upper_bound:
      return bucket
  return 3
# Attach the ordinal class derived from deal_probability as a new training column.
train_df = dfs['train']
train_df['deal_cat'] = train_df['deal_probability'].map(deal_cat)

In [6]:
# Encode a missing image_top_1 as one extra class placed just past the largest code
# observed in ANY split. Computing max()+1 separately per split would let train and
# test label "missing" with different integers whenever their maxima differ.
missing_image_code = pd.concat([dfs[key].image_top_1 for key in dfs]).max() + 1
for key in dfs:
  dfs[key].image_top_1 = dfs[key].image_top_1.fillna(missing_image_code).astype(int)

In [7]:
# Parse each date column once per split and derive its calendar parts as new columns.
for t in types:
  frame = dfs[t]
  for col in ['activation_date']:
    timestamps = pd.to_datetime(frame[col])
    frame[col] = timestamps
    calendar = timestamps.dt
    frame[col+'_weekday'] = calendar.weekday
    frame[col+'_year'] = calendar.year
    frame[col+'_month'] = calendar.month
    frame[col+'_day'] = calendar.day

In [8]:
from pandas.api.types import CategoricalDtype

# Category to code transform, TODO: use sklearn.preprocessing.LabelEncoder instead, only fit with training data
for cat_col in ['item_id', 'region', 'city', 'parent_category_name', 'category_name', 'user_type', 'param_1', 'param_2', 'param_3', 'activation_date_month', 'activation_date_day', 'activation_date_weekday']:
  print(cat_col)
  # Build one vocabulary over every split so a value gets the same code in train and test.
  # pd.concat replaces the Series.append chain, which was removed in pandas 2.0.
  # Missing values become the explicit 'N/A' category instead of code -1.
  categories = pd.concat([dfs[t][cat_col] for t in types]).fillna('N/A').unique()
  cat_dtype = CategoricalDtype(categories)

  for t in types:
    dfs[t][cat_col+'_code'] = dfs[t][cat_col].fillna('N/A').astype(cat_dtype).cat.codes

item_id
region
city
parent_category_name
category_name
user_type
param_1
param_2
param_3
activation_date_month
activation_date_day
activation_date_weekday


In [9]:
# Default category for long tail category

# A value needs at least this many occurrences (train and test combined) to get its own code.
min_count = 2

for col in ['user_id', 'item_seq_number']:
  # Pool the column over every split; pd.concat replaces the Series.append chain,
  # which was removed in pandas 2.0.
  pooled = pd.concat([dfs[t][col] for t in types]).fillna('N/A')

  # Occurrences per distinct value, ordered by the value itself (this order fixes the codes).
  counts = pooled.groupby(pooled).size()
  print((counts >= 2).sum(), (counts >= 3).sum(), (counts >= 4).sum(), len(counts))

  categories = counts[counts >= min_count].index
  frequent_dtype = CategoricalDtype(categories)

  for t in types:
    # Values outside `categories` get categorical code -1, so after the shift they all
    # share code 0, the default bucket for the long tail.
    dfs[t][col+'_code'] = dfs[t][col].fillna('N/A').astype(frequent_dtype).cat.codes+1 # +1 for keras embedding

322803 166410 102006 1009909
14955 10021 7660 33947


In [10]:
# TODO: pull request for sklearn standard scaler to ignore nan
# Standardise log(1 + price) in BOTH splits with the mean and std of the training prices,
# ignoring missing values. Train and test therefore share one scale, and no test
# statistics enter the features.
price = np.log1p(dfs['train']['price'])
mean_price = np.nanmean(price)
std_price = np.nanstd(price)

for t in types:
  dfs[t]['price_std'] = (np.log1p(dfs[t]['price']) - mean_price) / std_price
  # Record which rows had no usable price before imputing them with 0, which is the
  # training mean on the standardised scale.
  dfs[t]['price_isnull'] = dfs[t]['price_std'].isnull().astype(float)
  dfs[t]['price_std'] = dfs[t]['price_std'].fillna(0.)

In [11]:
# Simple size features for the free-text columns; a missing text counts as the empty string.
for t in types:
  frame = dfs[t]
  for col in ['title', 'description']:
    text = frame[col].fillna('')
    frame[col+'_length'] = text.map(len)
    # Splitting on ' ' yields one more piece than there are spaces.
    frame[col+'_space_count'] = text.map(lambda s: s.count(' ') + 1)

In [12]:
for key in dfs:
  path = './data/%s_prep.snappy.parquet'%key
  dfs[key].to_parquet(path)

!gsutil rsync data gs://{os.environ['GCP_BUCKET']}/data

Building synchronization state...
Starting synchronization...
Copying file://data/test_prep.snappy.parquet [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file://data/train_prep.snappy.parquet [Content-Type=application/octet-stream]...
|
Operation completed over 2 objects/629.5 MiB.                                    


In [13]:
# Preview only the first rows: rendering the whole prepared frame is slow and bloats
# the saved notebook.
dfs['train'].head(10)

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,...,activation_date_day_code,activation_date_weekday_code,user_id_code,item_seq_number_code,price_std,price_isnull,title_length,title_space_count,description_length,description_space_count
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),...,0,0,0,2,-0.687649,0.0,21,3,58,7
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,...,1,1,0,19,0.054935,0.0,17,3,41,7
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,...,2,2,184005,9,0.161042,0.0,14,2,99,16
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,...,3,3,241304,286,-0.059450,0.0,10,1,22,3
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110,"ВАЗ 2110, 2003",...,4,4,0,3,1.010479,0.0,14,3,24,4
5,51e0962387f7,bbfad0b1ad0a,Татарстан,Чистополь,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Авто люлька,...,0,0,0,9,-0.253431,0.0,11,2,19,3
6,c4f260a2b48a,08f469d2e6f7,Нижегородская область,Нижний Новгород,Для дома и дачи,Ремонт и строительство,Сантехника и сауна,,,Водонагреватель 100 литров нержавейка плоский,...,5,4,11290,125,0.534205,0.0,45,5,141,21
7,6b71309d6a8a,fef86baa002c,Пермский край,Пермь,Личные вещи,"Одежда, обувь, аксессуары",Женская одежда,Джинсы,26,Бойфренды colins,...,3,3,321579,61,-0.605506,0.0,16,2,30,4
8,c5b969cb63a2,055825270190,Оренбургская область,Оренбург,Личные вещи,"Одежда, обувь, аксессуары",Женская одежда,Платья и юбки,> 50 (XXL),Платье,...,6,5,6690,85,-0.605506,0.0,6,1,24,5
9,b1570962e68c,f9e8f831d94c,Нижегородская область,Нижний Новгород,Личные вещи,Детская одежда и обувь,Для девочек,Обувь,25,Полу ботиночки замш натур.Бамбини,...,7,6,315093,136,-0.687649,0.0,33,4,65,11
