In [1]:
import gzip
import pandas as pd
import copy
import re

from keras.layers import Input, Dense, Dropout
from keras.models import Model
from nltk.tokenize import word_tokenize
from keras.utils import np_utils

from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

Using Theano backend.


In [2]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [4]:
def preprocess(s, tokenize_words=False):
    s = s.lower()
    s = s.replace('-', ' ')
    s = ''.join(x for x in s if x not in [',', '®', ':', '+', '%', '#'])
    s = s.replace('mm', ' mm')
    s = re.sub(r'\([^)]*\)', '', s)
    s = re.sub(r'\[[^)]*\]', '', s)
    s = s.replace('/', ' ')
    s = s.replace('gold edition', 'gold_edition')
    s = s.replace('premium edition', 'premium_edition')
    s = s.replace('standard edition', 'standard_edition')
    s = s.replace('feet', ' feet')
    s = s.strip()
    if tokenize_words:
        stoplist = ['&', 'a', 'and', 'the', 'for', 'of', 'to', 'in', 'into']
        s = [word for word in s.split() if word not in stoplist]
        s = word_tokenize(' '.join(s))
    return s

def normalize_in_list(row, type='cat'):
    tmp = []
    for x in row:
        if type == 'cat':
            tmp.append(preprocess(x, tokenize_words=False))
        if type == 'item':
            tmp.append(preprocess(x, tokenize_words=True))
    return tmp

def flatten_lists(l):
    return [item for sublist in l for item in sublist]

In [7]:
df = {}

with open('amazon-meta.txt', 'r') as f:
    line = f.readline()
    asin = ''
    product = {
        'asin': '',
        'title': '',
        'group': '',
        'similar': [],
        'categories': []
    }
    index = 0
    
    while line:
        if ('ASIN: ' in line[:6]):
            product['asin'] = line[6:].strip()
            product['title'] = ''
            product['group'] = ''
            product['similar']
        elif ('  title: ' == line[:9]):
            product['title'] = line[9:].strip()
        elif ('  group: ' == line[:9]):
            product['group'] = line[9:].strip()
        elif ('  similar: ' == line[:11]):
            product['similar'] = line.strip().split()[2:]
        elif ('  categories: ' in line):
            line = f.readline()
            while ('|' in line):
                product['categories'].append(re.sub(r'\[.*?\]', '', line.strip()).split('|')[1:])
                line = f.readline()
            df[index] = product
            
            product = {
                'asin': '',
                'title': '',
                'group': '',
                'similar': [],
                'categories': []
            }
            
            index += 1
            
        line = f.readline()

df = pd.DataFrame.from_dict(df, orient='index')
df['title_n'] = df['title'].apply(lambda x: normalize_in_list([x], type='item')[0])

In [23]:
df.head(3)

Unnamed: 0,group,similar,asin,title,categories,title_n
0,Book,"['0804215715', '156101074X', '0687023955', '06...",827229534,Patterns of Preaching: A Sermon Sampler,"[['Books', 'Subjects', 'Religion & Spiritualit...","['patterns', 'preaching', 'sermon', 'sampler']"
1,Book,"['0738700827', '1567184960', '1567182836', '07...",738700797,Candlemas: Feast of Flames,"[['Books', 'Subjects', 'Religion & Spiritualit...","['candlemas', 'feast', 'flames']"
2,Book,[],486287785,World War II Allied Fighter Planes Trading Cards,"[['Books', 'Subjects', 'Home & Garden', 'Craft...","['world', 'war', 'ii', 'allied', 'fighter', 'p..."


In [21]:
df.to_csv('amazon-metadata.csv', mode = 'w', index=False)

In [22]:
df = pd.read_csv('amazon-metadata.csv')

In [65]:
vectorizer = TfidfVectorizer(max_df=0.5, 
                             max_features=None, 
                             stop_words='english', 
                             use_idf=True)
X = vectorizer.fit_transform(df.title_n)

svd = TruncatedSVD(200)
normalizer = Normalizer(copy=False)
pipe_it_up = make_pipeline(svd, normalizer)
X2 = pipe_it_up.fit_transform(X).astype('float32')

tags = list(df.group.unique())
Y = np_utils.to_categorical(df.group.apply(lambda x: tags.index(x)), len(tags)).astype(bool)

In [67]:
inputs = Input(shape=(X2.shape[1],))
groups = Dense(Y.shape[1], activation='relu')(inputs)

model = Model(input=inputs, output=groups)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X2, Y, batch_size=128, shuffle=True,
          nb_epoch=50,  verbose=1, 
          validation_split=0.25)

Train on 332121 samples, validate on 110708 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
 63360/332121 [====>.........................] - ETA: 3s - loss: nan - acc: 0.7265

KeyboardInterrupt: 