[View in Colaboratory](https://colab.research.google.com/github/Hoiy/kaggle-avito-demand-prediction/blob/master/prep.ipynb)

In [1]:
# Download the train and test archives into ./data with the Kaggle CLI.
# No competition is named on the command line; the captured output shows the CLI
# resolving it to avito-demand-prediction (presumably from the local kaggle config - confirm).
!kaggle competitions download -f train.csv.zip --path ./data
!kaggle competitions download -f test.csv.zip --path ./data

Using competition: avito-demand-prediction
train.csv.zip: Downloaded 308MB of 308MB to ./data
Using competition: avito-demand-prediction
test.csv.zip: Downloaded 107MB of 107MB to ./data


In [2]:
import pandas as pd
import numpy as np
import os
import dotenv
import fastText
from tqdm import tqdm 

# Load settings from the local .env file. The export cell at the end of the notebook
# reads GCP_BUCKET from os.environ (presumably defined in that file - confirm).
dotenv.load_dotenv('.env')

True

In [0]:
# Names of the data splits; every later cell iterates over this list
types = ['train', 'test']

# Read each zipped CSV into a DataFrame, keyed by split name
dfs = {}
for t in types:
  dfs[t] = pd.read_csv('./data/%s.csv.zip'%t)

In [4]:
# Show the first rows of the training frame as a quick check of the loaded columns
dfs['train'].head()

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,description,price,item_seq_number,activation_date,user_type,image,image_top_1,deal_probability
0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,Кокоби(кокон для сна),"Кокон для сна малыша,пользовались меньше месяц...",400.0,2,2017-03-28,Private,d10c7e016e03247a3bf2d13348fe959fe6f436c1caf64c...,1008.0,0.12789
1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,Стойка для Одежды,"Стойка для одежды, под вешалки. С бутика.",3000.0,19,2017-03-26,Private,79c9392cc51a9c81c6eb91eceb8e552171db39d7142700...,692.0,0.0
2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,Philips bluray,"В хорошем состоянии, домашний кинотеатр с blu ...",4000.0,9,2017-03-20,Private,b7f250ee3f39e1fedd77c141f273703f4a9be59db4b48a...,3032.0,0.43177
3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,Автокресло,Продам кресло от0-25кг,2200.0,286,2017-03-25,Company,e6ef97e0725637ea84e3d203e82dadb43ed3cc0a1c8413...,796.0,0.80323
4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,"ВАЗ 2110, 2003",Все вопросы по телефону.,40000.0,3,2017-03-16,Private,54a687a3a0fc1d68aed99bdaaf551c5c70b761b16fd0a2...,2264.0,0.20797


In [0]:
def deal_cat(deal_probability):
  """Bucket a deal probability into one of four ordinal classes.

  Returns 0 when the value is exactly zero, 1 when it is below 0.6,
  2 when it is below 0.9, and 3 for anything else (every comparison
  failing, e.g. for NaN, also ends up in class 3).
  """
  if deal_probability == 0.:
    return 0
  # Walk the upper bounds in ascending order; the first bound the value is under wins
  for label, upper_bound in ((1, 0.6), (2, 0.9)):
    if deal_probability < upper_bound:
      return label
  return 3
# Add the bucketed target as a new column; only the training frame gets it here
dfs['train']['deal_cat'] = dfs['train']['deal_probability'].map(deal_cat)

In [0]:
# Replace missing image_top_1 values with a dedicated integer code so the column can be cast to int.
# The code is computed once over all frames: a per-frame max()+1 would assign different
# "missing" codes to train and test whenever their maxima differ.
image_top_1_missing = max(dfs[key]['image_top_1'].max() for key in dfs) + 1
for key in dfs:
  dfs[key]['image_top_1'] = dfs[key]['image_top_1'].fillna(image_top_1_missing).astype(int)

In [0]:
# expand date columns
for col in ['activation_date']:
  for t in types:
    dfs[t][col] = pd.to_datetime(dfs[t][col])
    dfs[t][col+'_weekday'] = dfs[t][col].dt.weekday
    dfs[t][col+'_year'] = dfs[t][col].dt.year
    dfs[t][col+'_month'] = dfs[t][col].dt.month
    dfs[t][col+'_day'] = dfs[t][col].dt.day

In [8]:
from pandas.api.types import CategoricalDtype

# Category-to-code transform: each listed column gets a companion <column>_code column of integer codes.
# The vocabulary is built from train and test together so both frames share one code space;
# missing values are replaced by the literal category 'N/A' before encoding.
# TODO: use sklearn.preprocessing.LabelEncoder instead, fitted with the training data only
for cat_col in ['item_id', 'region', 'city', 'parent_category_name', 'category_name', 'user_type', 'param_1', 'param_2', 'param_3', 'activation_date_month', 'activation_date_day', 'activation_date_weekday']:
  print(cat_col)
  # A single pd.concat replaces growing an empty Series with Series.append, which was
  # removed in pandas 2.0. unique() keeps first-appearance order, so codes are stable.
  categories = pd.concat([dfs[t][cat_col] for t in types]).fillna('N/A').unique()

  for t in types:
    dfs[t][cat_col+'_code'] = dfs[t][cat_col].fillna('N/A').astype(CategoricalDtype(categories)).cat.codes

item_id
region
city
parent_category_name
category_name
user_type
param_1
param_2
param_3
activation_date_month
activation_date_day
activation_date_weekday


In [9]:
# Default category for long-tail values:
# a value that occurs fewer than twice across train+test is not given its own category.
# Such values come out of .cat.codes as -1, so after the +1 shift they all share code 0
# and every retained value has a code >= 1.

for col in ['user_id', 'item_seq_number']:
  # A single pd.concat replaces growing an empty Series with Series.append, which was
  # removed in pandas 2.0.
  combined = pd.concat([dfs[t][col] for t in types], ignore_index=True).fillna('N/A')

  # Occurrences per distinct value; groupby sorts the keys, which fixes the code order below
  counts = combined.groupby(combined).size()
  # How many distinct values survive a minimum count of 2 / 3 / 4, versus the total
  print((counts >= 2).sum(), (counts >= 3).sum(), (counts >= 4).sum(), len(counts))

  categories = counts[counts >= 2].index

  for t in types:
    dfs[t][col+'_code'] = dfs[t][col].fillna('N/A').astype(CategoricalDtype(categories)).cat.codes+1 # +1 for keras embedding

322803 166410 102006 1009909
14955 10021 7660 33947


In [0]:
# TODO: pull request for sklearn StandardScaler to ignore NaN
# For now we standardise log(price + 1) by hand: the mean and std are computed on the training data only (NaNs masked out) and the SAME values are then applied to both train and test
from scipy.stats import zscore

# Statistics of log(price + 1) on the training frame, with NaN entries masked out
log_price_train = np.log(dfs['train']['price']+1)
log_price_masked = np.ma.array(log_price_train, mask=np.isnan(log_price_train))
mean_price = log_price_masked.mean()
std_price = log_price_masked.std()

for t in types:
  frame = dfs[t]
  # Standardise every frame with the training mean/std computed above
  scaled = (np.log(frame['price']+1) - mean_price) / std_price
  # Record which rows had no usable value before the NaNs are overwritten with 0
  missing = scaled.isnull()
  frame['price_std'] = scaled.fillna(0.)
  frame['price_isnull'] = missing.astype(float)

In [11]:
for key in dfs:
  path = './data/%s_prep.snappy.parquet'%key
  dfs[key].to_parquet(path)
  !gsutil cp {path} gs://{os.environ['GCP_BUCKET']}/{key}_prep.snappy.parquet

Copying file://./data/train_prep.snappy.parquet [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/459.8 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [1 files][459.8 MiB/459.8 MiB]                                                
Operation completed over 1 objects/459.8 MiB.              