In [5]:
import gzip
import pandas as pd
import json
import sys
import re
from nltk.tokenize import word_tokenize

from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.externals import joblib

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input
from keras.utils import np_utils

import numpy as np

# set several constants for dealing with large amounts of data
batch_size = 500000
vector_size = 200
hidden_layer_size = 512

# set up dimensionality reduction for large word vectors
svd = TruncatedSVD(vector_size)
normalizer = Normalizer(copy=False)
pipe_it_up = make_pipeline(svd, normalizer)

In [6]:
# set up supporting functions

# preprocess sentences and remove noise making words
def preprocess(s, tokenize_words=False):
    s = s.lower()
    s = s.replace('-', ' ')
    s = ''.join(x for x in s if x not in [',', '®', ':', '+', '%', '#'])
    s = s.replace('mm', ' mm')
    s = re.sub(r'\([^)]*\)', '', s)
    s = re.sub(r'\[[^)]*\]', '', s)
    s = s.replace('/', ' ')
    s = s.replace('gold edition', 'gold_edition')
    s = s.replace('premium edition', 'premium_edition')
    s = s.replace('standard edition', 'standard_edition')
    s = s.replace('feet', ' feet')
    s = s.strip()
    if tokenize_words:
        stoplist = ['&', 'a', 'and', 'the', 'for', 'of', 'to', 'in', 'into']
        s = [word for word in s.split() if word not in stoplist]
        s = word_tokenize(' '.join(s))
    return s

def normalize_in_list(row, type='cat'):
    tmp = []
    for x in row:
        if type == 'cat':
            tmp.append(preprocess(x, tokenize_words=False))
        elif type == 'item':
            tmp.append(preprocess(x, tokenize_words=True))
    return tmp

def flatten_lists(l):
    return [item for sublist in l for item in sublist]

In [7]:
# set up neural network to predict classifications
def functional_model(input_shape, aux_shape, output_shape):
    name_input = Input(shape=(input_shape,), name='name_input')
    aux_input = Input(shape=(input_shape,), name='aux_input')
    x = merge([name_input, aux_input], mode='concat')
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.3)(x)
    output = Dense(output_shape, activation='softmax')(x)
    model = Model(input=[name_input, aux_input], output=output)
    return model

def create_model(input_shape, output_shape):
    model = Sequential()
    model.add(Dense(hidden_layer_size, input_dim=input_shape, activation='relu'))
    model.add(Dense(output_shape, activation='relu'))
    return model
    
def train_model(model, n_version):
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(input_data, expected, batch_size=128, shuffle=True,
              nb_epoch=50,  verbose=1, 
              validation_split=0.25)
    model.save('./models/categoy_predictor' + str(version) + '.h5')

# fill in empty data
def fill_data(product_dict):
    fields = {'categories': [],
              'salesRank': {},
              'title': '',
              'related': {},
              'brand': ''}
    if 'categories' in product_dict.keys():
        product_dict['categories'] = product_dict['categories'][0]
    
    missing_keys = [key for key in fields.keys() if key not in product_dict.keys()]
    
    if 'salesRank' not in missing_keys and 'categories' in missing_keys:
        product_dict['categories'] = product_dict['salesRank'].keys()
        missing_keys.remove('categories')
    
    for key in missing_keys:
        product_dict[key] = fields[key]
    
    return product_dict

# convert batch of json lines into a pandas dataframe to work with
def to_df(first_line, f, batch_size):
    df = {}
    line = first_line
    index = 0
    
    while (index < batch_size and line):
        if ('{' in line):
            if line[-1] == ',':
                line = line[:-1]
            df[index] = fill_data(json.loads(line))
            index += 1
            
        line = f.readline().decode().strip()
     
    # make into a pandas dataframe
    df = pd.DataFrame.from_dict(df, orient='index')
    df.drop('imUrl', 1)
    df.drop('price', 1)
    df['title_n'] = df['title'].apply(lambda x: ' '.join(normalize_in_list([x], type='item')[0]))
    
    return (df, line)    

# embed a new batch of product names
def embed(vectorizer, df):
    X = vectorizer.transform(df.title_n)
    return pipe_it_up.fit_transform(X).astype('float32')

In [8]:
# GET VOCABULARY FOR PRODUCT NAMES
classes = set([])
titles = []
brands = set([])

with gzip.open('productMeta.txt.gz', 'rb') as f:
    index = 0
    line = f.readline().decode().strip()
    while line:
        if ('{' in line):
            if (line[-1] == ','):
                line = line[:-1]
            product_json = json.loads(line)
            # get titles
#             if 'title' in product_json.keys():
#                 titles.append(product_json['title'])
            
            # get categories
            if 'categories' in product_json.keys():
                classes = classes.union(set(product_json['categories'][0]))
            elif 'salesRank' in product_json.keys():
                classes = classes.union(set(product_json['salesRank'].keys()))
                    
            # brands
            if 'brand' in product_json.keys():
                brands = brands.union(set([product_json['brand']]))
            
            index += 1
            
        line = f.readline().decode().strip()

        
##########
# titles_map = map(lambda x: ' '.join(normalize_in_list([x], type='item')[0]), titles)

# vectorizer = TfidfVectorizer(max_df=0.5, 
#                              max_features=None, 
#                              stop_words='english', 
#                              use_idf=True)
# vectorizer.fit(titles_map)


KeyboardInterrupt: 

In [6]:
index = 0
classes = list(classes)
brands = list(brands)

with gzip.open('productMeta.txt.gz', 'rb') as f:
    line = f.readline().decode().strip()
    while line:
        # use 250,000 as the batch size
        df, line = to_df(line, f, batch_size)
        
        X_aux = np.zeros((len(df.index), len(brands)), dtype=bool)
        for i, brand in enumberate(df.brand):
            X_aux[i, brands.index(brand)] = True
        
        X = embed(vectorizer, df)
        Y = np.zeros((len(df.index), len(classes)), dtype=bool)
        for i, cats in enumerate(df.categories):
            for cat in cats:
                Y[i, classes.index(cat)] = True
        index += 1
        
        model = functional_model(X.shape[1], X_aux.shape[1], Y.shape[1])
        #model = create_model(X.shape[1], Y.shape[1])
        train_model(model, index)

ValueError: "Children's Music" is not in list

In [4]:
#joblib.dump(vectorizer, 'vectorizer.pkl') 

# load classes
classes = []
with open('categories.txt', 'r') as f:
    classes = f.read().split('|')
vectorizer = joblib.load('vectorizer.pkl')

In [None]:
with open('categories.txt', 'r') as f:
    f.write('|'.join(classes))

In [9]:
# brands embeddings
print(len(brands))
print(brands)

81795


In [35]:
df.head(3)

Unnamed: 0,categories,imUrl,salesRank,title,asin,related,price,brand
0,[Books],http://ecx.images-amazon.com/images/I/51MKP0T4...,{'Books': 6334800},"The Crucible: Performed by Stuart Pankin, Jero...",1048791,,,
1,"[Movies & TV, Movies]",http://g-ecx.images-amazon.com/images/G/01/x-s...,{'Movies & TV': 376041},"Everyday Italian (with Giada de Laurentiis), V...",143561,"{'buy_after_viewing': ['B0036FO6SI', 'B000KL8O...",12.99,
2,[Amazon Fashion],http://ecx.images-amazon.com/images/I/31mCncNu...,{'Clothing': 1233557},Purple Sequin Tiny Dancer Tutu Ballet Dance Fa...,37214,"{'also_viewed': ['B00JO8II76', 'B00DGN4R1Q', '...",6.99,Big Dreams
