In [1]:
import pickle
import pandas as pd
import numpy as np
from zipfile import ZipFile
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read train.csv straight out of its zip archive, without extracting it to disk first.
with ZipFile('../data/train.csv.zip') as archive, archive.open('train.csv') as csv_file:
    train_df = pd.read_csv(csv_file)

In [3]:
# Read test.csv straight out of its zip archive, without extracting it to disk first.
with ZipFile('../data/test.csv.zip') as archive, archive.open('test.csv') as csv_file:
    test_df = pd.read_csv(csv_file)

In [4]:
# Stack the train rows on top of the test rows so features are built over both at once.
# The original index labels of each frame are kept (no re-indexing is requested).
joint_df = pd.concat([train_df, test_df], axis=0)

In [5]:
# Record how many rows each source frame contributed to the combined frame.
train_len, test_len = train_df.shape[0], test_df.shape[0]

In [6]:
# The separate frames are no longer referenced below; unbind both names.
del train_df
del test_df

In [8]:
# Show the column labels of the combined frame as the cell output.
joint_df.columns

Index(['activation_date', 'category_name', 'city', 'deal_probability',
       'description', 'image', 'image_top_1', 'item_id', 'item_seq_number',
       'param_1', 'param_2', 'param_3', 'parent_category_name', 'price',
       'region', 'title', 'user_id', 'user_type'],
      dtype='object')

In [32]:
def export_num_feature(feature, feature_name, train_size=None, base_dir='./num_features'):
    """Split a row-aligned sparse feature matrix into train/test parts and save each as ``.npz``.

    Parameters
    ----------
    feature : scipy sparse matrix
        Matrix whose first ``train_size`` rows belong to the train set and whose
        remaining rows belong to the test set. It must support row slicing and be
        accepted by ``save_npz``.
    feature_name : str
        File stem. The outputs are ``<base_dir>/train/<feature_name>.npz`` and
        ``<base_dir>/test/<feature_name>.npz``.
    train_size : int, optional
        Number of leading rows that form the train part. When omitted, the
        notebook-level ``train_len`` is used.
    base_dir : str, optional
        Root directory for the exported files. Defaults to ``'./num_features'``.

    Notes
    -----
    ``save_npz`` is looked up at call time, so it must have been imported from
    ``scipy.sparse`` before this function is called.
    """
    import os  # local import keeps this cell runnable without editing the import cell

    if train_size is None:
        # Fall back to the split point computed when the frames were concatenated.
        train_size = train_len

    parts = (
        ('train', slice(None, train_size)),
        ('test', slice(train_size, None)),
    )
    for subset, rows in parts:
        subset_dir = os.path.join(base_dir, subset)
        # Create the target directory on demand; opening a file inside a missing
        # directory would otherwise raise FileNotFoundError.
        os.makedirs(subset_dir, exist_ok=True)
        with open(os.path.join(subset_dir, '{}.npz'.format(feature_name)), 'wb') as f:
            save_npz(f, feature[rows])

In [10]:
# Build a combined "<region>_<city>" key so the city is qualified by its region.
joint_df['region_city'] = joint_df['region'].add('_').add(joint_df['city'])

In [11]:
# Remove the two source columns now that the combined 'region_city' column exists.
del joint_df['region']
del joint_df['city']

In [28]:
from scipy.sparse import load_npz, save_npz, csr_matrix, hstack

In [25]:
# Columns to be one-hot encoded, in the order their blocks are later stacked.
categorical_feature_names = [
    'category_name',
    'region_city',
    'image_top_1',
    'param_1',
    'param_2',
    'parent_category_name',
    'user_type',
]

In [26]:
# One-hot encode each categorical column (with an extra indicator column for NaN)
# and keep every encoded block as a sparse CSR matrix.
categorical_features = []
for feature_name in categorical_feature_names:
    print(feature_name)
    dummies = pd.get_dummies(joint_df[feature_name], dummy_na=True)
    # `DataFrame.as_matrix()` was removed in pandas 1.0; `.values` works on old and new versions.
    # The explicit cast keeps the blocks uint8 even on pandas versions whose dummies default to bool;
    # `copy=False` avoids an extra dense copy when the data is already uint8.
    categorical_features.append(csr_matrix(dummies.values.astype(np.uint8, copy=False)))
    # Release the dense intermediate before encoding the next column.
    del dummies
    print('Done')
    

category_name
Done
region_city
Done
image_top_1
Done
param_1
Done
param_2
Done
parent_category_name
Done
user_type
Done


In [29]:
# Join the per-column one-hot blocks side by side into one sparse matrix.
categorical_features = hstack(blocks=categorical_features)

In [35]:
# Switch to CSR storage so the matrix can be sliced by row when it is exported.
categorical_features = categorical_features.asformat('csr')

In [36]:
# Display the repr of the stacked matrix (shape, dtype, storage format) as the cell output.
categorical_features

<2011862x5601 sparse matrix of type '<class 'numpy.uint8'>'
	with 14083034 stored elements in Compressed Sparse Row format>

In [37]:
# Write the train and test parts of the one-hot matrix to disk under the name 'categorical_one_hot'.
export_num_feature(feature=categorical_features, feature_name='categorical_one_hot')

In [38]:
# Unbind the one-hot matrix after it has been exported, so it can be freed
# if nothing else still references it.
del categorical_features