In [1]:
import os
import pickle
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
%matplotlib inline

In [2]:
# Read the training CSV straight out of its zip archive (nothing is extracted to disk).
with ZipFile('../data/train.csv.zip') as archive, archive.open('train.csv') as csv_file:
    train_df = pd.read_csv(csv_file)

In [3]:
# Read the test CSV straight out of its zip archive (nothing is extracted to disk).
with ZipFile('../data/test.csv.zip') as archive, archive.open('test.csv') as csv_file:
    test_df = pd.read_csv(csv_file)

In [15]:
# Stack train on top of test so every encoder is fitted on the values of both splits.
# The original row labels are kept (no ignore_index), so all later train/test
# splitting is done by position, not by label.
joint_df = pd.concat([train_df, test_df])

In [24]:
# Record the split sizes. The train rows come first in joint_df, so train_len
# doubles as the row offset at which the test rows start.
train_len, test_len = len(train_df), len(test_df)

In [6]:
# Drop the per-split frames to free memory; after this cell only joint_df
# and the recorded lengths remain.
del train_df
del test_df

In [11]:
from sklearn.preprocessing import LabelEncoder

In [8]:
# List the columns available in the combined train + test frame.
joint_df.columns

Index(['activation_date', 'category_name', 'city', 'deal_probability',
       'description', 'image', 'image_top_1', 'item_id', 'item_seq_number',
       'param_1', 'param_2', 'param_3', 'parent_category_name', 'price',
       'region', 'title', 'user_id', 'user_type'],
      dtype='object')

In [21]:
from scipy.sparse import load_npz, save_npz, csr_matrix

# Encoding base features

## Categorical

In [13]:
def export_encoder(encoder, feature_name):
    """Pickle a fitted encoder to ./cat_features/<feature_name>_encoder.pkl.

    Parameters
    ----------
    encoder : object
        Any picklable object; this notebook passes fitted LabelEncoder instances.
    feature_name : str
        Feature name used to build the output file name.

    Notes
    -----
    The output directory is created when missing, so the notebook also runs
    from a fresh checkout where ./cat_features does not exist yet.
    """
    out_dir = './cat_features'
    os.makedirs(out_dir, exist_ok=True)
    with open('{}/{}_encoder.pkl'.format(out_dir, feature_name), 'wb') as f:
        pickle.dump(encoder, f)

In [14]:
def export_feature(feature, feature_name, train_rows=None):
    """Split a categorical feature matrix by row into train/test and save both as .npz.

    Files are written to ./cat_features/train/<feature_name>.npz and
    ./cat_features/test/<feature_name>.npz.

    Parameters
    ----------
    feature : scipy.sparse matrix
        Row-sliceable sparse matrix whose first ``train_rows`` rows are the
        training rows and whose remaining rows are the test rows.
    feature_name : str
        Feature name used to build the output file names.
    train_rows : int, optional
        Number of leading rows that belong to the training split. Defaults to
        the notebook-level ``train_len`` so existing two-argument calls keep
        working unchanged.

    Notes
    -----
    Output directories are created when missing.
    """
    if train_rows is None:
        train_rows = train_len  # notebook global recorded when the splits were loaded
    splits = (('train', feature[:train_rows]), ('test', feature[train_rows:]))
    for split_name, rows in splits:
        out_dir = './cat_features/{}'.format(split_name)
        os.makedirs(out_dir, exist_ok=True)
        with open('{}/{}.npz'.format(out_dir, feature_name), 'wb') as f:
            save_npz(f, rows)

In [12]:
# Columns that are label-encoded and exported as categorical features.
categorical = [
    'category_name',
    'city',
    'image_top_1',
    'param_1',
    'param_2',
    'param_3',
    'parent_category_name',
    'region',
    'user_type',
    'user_id',
]

In [13]:
# Label-encode every categorical column over train + test together, then persist
# both the fitted encoder and the encoded column.
for feature_name in categorical:
    encoder = LabelEncoder()
    # Cast to str first so the encoder sees a single type; missing values are
    # expected to become their own string label.
    codes = encoder.fit_transform(joint_df[feature_name].astype(str))
    # csr_matrix turns the 1-D code array into a single row; transpose to one row per record.
    feature = csr_matrix(codes).T
    export_encoder(encoder, feature_name)
    export_feature(feature, feature_name)

## Numerical

In [20]:
def export_num_feature(feature, feature_name, train_rows=None):
    """Split a numerical feature matrix by row into train/test and save both as .npz.

    Files are written to ./num_features/train/<feature_name>.npz and
    ./num_features/test/<feature_name>.npz.

    Parameters
    ----------
    feature : scipy.sparse matrix
        Row-sliceable sparse matrix whose first ``train_rows`` rows are the
        training rows and whose remaining rows are the test rows.
    feature_name : str
        Feature name used to build the output file names.
    train_rows : int, optional
        Number of leading rows that belong to the training split. Defaults to
        the notebook-level ``train_len`` so existing two-argument calls keep
        working unchanged.

    Notes
    -----
    Output directories are created when missing.
    """
    if train_rows is None:
        train_rows = train_len  # notebook global recorded when the splits were loaded
    splits = (('train', feature[:train_rows]), ('test', feature[train_rows:]))
    for split_name, rows in splits:
        out_dir = './num_features/{}'.format(split_name)
        os.makedirs(out_dir, exist_ok=True)
        with open('{}/{}.npz'.format(out_dir, feature_name), 'wb') as f:
            save_npz(f, rows)

In [18]:
# Columns exported as-is as numerical features.
numerical = [
    'item_seq_number',
    'price',
]

In [19]:
# Export each raw numerical column unchanged as a single-column sparse matrix.
for feature_name in numerical:
    column = joint_df[feature_name]
    # csr_matrix yields a single row for 1-D input; transpose to one row per record.
    feature = csr_matrix(column).T
    export_num_feature(feature, feature_name)

# Generating other simple features

## Categorical

### image_is_null

In [20]:
# Flag rows whose 'image' value is missing, label-encoded like the other categoricals.
encoder = LabelEncoder()
image_missing = joint_df['image'].isnull()
# The boolean flag is cast to str before encoding, matching how the base
# categorical columns are fed to LabelEncoder.
feature = csr_matrix(encoder.fit_transform(image_missing.astype(str))).T
export_encoder(encoder, 'image_is_null')
export_feature(feature, 'image_is_null')

### region_city

In [15]:
# Combined region + city key, label-encoded as one categorical feature.
encoder = LabelEncoder()
# Cast both parts to str before concatenating, as the other encoded columns do.
# Without the cast a missing region or city would make the whole key NaN, and
# LabelEncoder cannot order a column that mixes strings and floats. When nothing
# is missing the resulting keys are identical to plain concatenation.
feature = joint_df['region'].astype(str) + '_' + joint_df['city'].astype(str)
feature = csr_matrix(encoder.fit_transform(feature)).T
export_encoder(encoder, 'region_city')
export_feature(feature, 'region_city')

## Numerical

### title_length_chars

In [17]:
# Number of characters in each title, exported as a single-column numerical feature.
title_lengths = joint_df['title'].str.len()
feature = csr_matrix(title_lengths).T
export_num_feature(feature, 'title_length_chars')

### description_length_chars

In [18]:
# Number of characters in each description, exported as a single-column numerical feature.
# NOTE(review): pandas .str.len() propagates missing values, so rows without a
# description presumably end up as NaN here — confirm downstream consumers expect that.
description_lengths = joint_df['description'].str.len()
feature = csr_matrix(description_lengths).T
export_num_feature(feature, 'description_length_chars')

# Mean encoding

In [13]:
categorical = ['category_name', 'city', 'image_top_1', 'param_1', 'param_2', 'param_3',
               'parent_category_name', 'region', 'user_type']

In [16]:
# Mean (target) encoding: per-category mean and standard deviation of
# deal_probability, computed on the training rows and mapped onto every row.
# train_df was deleted earlier in the notebook, so the training rows are taken
# positionally from joint_df: pd.concat placed them first and train_len records
# how many there are.
# NOTE(review): the statistics use the full training set with no out-of-fold
# split, so each training row contributes to its own encoding — confirm intended.
train_part = joint_df.iloc[:train_len]
mean_features = pd.DataFrame()
for c in categorical:
    gp = train_part.groupby(c)['deal_probability']
    # Categories that never occur in the training rows map to NaN.
    mean_features[c + '_deal_probability_avg'] = joint_df[c].map(gp.mean())
    # std is additionally NaN for categories with a single training row.
    mean_features[c + '_deal_probability_std'] = joint_df[c].map(gp.std())

In [19]:
# Replace missing statistics with the out-of-range sentinel -999
# (the encoded values themselves are means/stds of deal_probability).
mean_features = mean_features.fillna(value=-999)

In [22]:
# Convert the frame to a plain NumPy array.
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0; .values
# returns the same array and is available on old and new pandas alike.
mean_features = mean_features.values

In [23]:
# Wrap the array as a CSR matrix so it can be row-sliced and written with
# save_npz by export_num_feature, like the other exported features.
mean_features = csr_matrix(mean_features)

In [25]:
# Sanity check: expect one row per joint_df row and two columns (avg, std)
# per mean-encoded categorical.
mean_features.shape

(2011862, 18)

In [26]:
# Persist all mean-encoding columns together as one numerical feature block.
export_num_feature(
    feature=mean_features,
    feature_name='mean_encoded_categorical',
)