In [2]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

Using TensorFlow backend.


In [3]:
# Load the raw car-review table; the source file is pipe-delimited.
df = pd.read_csv('car_review_source.csv', sep='|')

In [4]:
# Number of leading rows of `df` used for training; every remaining row is
# held out as the test set.
train_size = 55721

# Fail fast if the CSV no longer matches the hard-coded split point: an
# out-of-range value would otherwise silently yield an empty train or test
# slice and only surface later as a confusing shape error.
if not 0 < train_size < len(df):
    raise ValueError(
        "train_size=%d must be between 1 and %d (len(df) - 1)"
        % (train_size, len(df) - 1))

print("Train size: %d" % train_size)
print("Test size: %d" % (len(df) - train_size))

Train size: 55721
Test size: 1000


In [5]:
# Positional split: the first `train_size` rows are the training set and the
# remaining rows are the test set. Review text is the input, Factory the label.
reviews = df["Review"]
factories = df["Factory"]

train_posts, test_posts = reviews.iloc[:train_size], reviews.iloc[train_size:]
train_tags, test_tags = factories.iloc[:train_size], factories.iloc[train_size:]

In [6]:
# Vocabulary cap handed to the tokenizer; it is also the model's input width.
max_words = 1000

# Word-level (not character-level) tokenizer limited to `max_words` entries.
tokenize = text.Tokenizer(char_level=False, num_words=max_words)

In [7]:
# Build the vocabulary from the training reviews only, so nothing about the
# test reviews influences the features.
tokenize.fit_on_texts(train_posts)

# Vectorise both splits with the same fitted tokenizer: one row per review.
x_train, x_test = (
    tokenize.texts_to_matrix(posts) for posts in (train_posts, test_posts)
)

In [8]:
# Map each Factory name to an integer class id. The mapping is learned from
# the training labels and then applied unchanged to the test labels.
encoder = LabelEncoder().fit(train_tags)
y_train, y_test = encoder.transform(train_tags), encoder.transform(test_tags)

In [10]:
# Sanity-check the feature and label array dimensions for both splits.
for caption, array in (('x_train shape:', x_train),
                       ('x_test shape:', x_test),
                       ('y_train shape:', y_train),
                       ('y_test shape:', y_test)):
    print(caption, array.shape)

x_train shape: (55721, 1000)
x_test shape: (1000, 1000)
y_train shape: (55721,)
y_test shape: (1000,)


In [11]:
# Show the first training review followed by its label, one per line.
print(train_posts[0], train_tags[0], sep='\n')

 This is my second Stratus. This time I took all the options.  The car is a reliable car, it is comfortable, the sound system is great, and it is so fun to drive with the V6 and the 17" wheels. It is pretty spacious inside, and the trunk space is generous (particularly with the fold down rear seat).  It's not a Mercedes or BMW, but it's a good car for the $19K +/- I paid ($26K sticker).
Dodge


In [12]:
# Training hyper-parameters: mini-batch size and number of passes over the data.
batch_size, epochs = 32, 10

In [13]:
from datetime import datetime
import numpy as np
from os.path import exists, join

In [14]:
# Directory that receives the embedding metadata file written by this notebook.
logdir = "logs/"

In [15]:
# Class names indexed by encoded label id, taken straight from the fitted
# encoder so that brands[class_id] is by construction the label the encoder
# assigned to that id. A hand-maintained list would silently mislabel the
# embeddings if the data ever gained, lost, or renamed a make.
brands = encoder.classes_.tolist()

In [16]:
# Write one "<class id>\t<brand name>" row per test sample, below a header
# row, to <logdir>/metadata.tsv.

# The log directory may not exist yet on a fresh run, and open() does not
# create missing parent directories.
os.makedirs(logdir, exist_ok=True)

# `with` guarantees the file is flushed and closed even if a write fails.
with open(join(logdir, 'metadata.tsv'), 'w') as metadata_file:
    metadata_file.write('Class\tName\n')
    # Recompute the integer ids from the encoder rather than reading y_test:
    # a later cell replaces y_test with a one-hot matrix, which would break
    # this cell if it were re-run afterwards.
    for class_id in encoder.transform(test_tags):
        metadata_file.write('{}\t{}\n'.format(class_id, brands[class_id]))

In [17]:
# One output unit per class known to the encoder. Because the encoder was
# fitted on the training labels, this equals max(train id) + 1.
num_classes = len(encoder.classes_)

# One-hot encode the targets for categorical cross-entropy. The ids are
# re-derived from the encoder instead of from y_train / y_test so this cell
# is idempotent: re-running it no longer one-hot encodes already-encoded data.
y_train = utils.to_categorical(encoder.transform(train_tags), num_classes)
y_test = utils.to_categorical(encoder.transform(test_tags), num_classes)

In [18]:
from keras.callbacks import TensorBoard

In [19]:
# TensorBoard callback that logs the outputs of the 'Brand' layer for the
# test inputs once per epoch.
# log_dir is passed explicitly so the callback writes into the same directory
# where this notebook saved metadata.tsv; before, the two only coincided
# because the callback's default happened to match `logdir`.
# NOTE(review): 'metadata.tsv' is presumably resolved relative to log_dir by
# the projector - confirm against the Keras TensorBoard documentation.
tensorboard = TensorBoard(log_dir=logdir,
                          batch_size=batch_size,
                          embeddings_freq=1,
                          embeddings_layer_names=['Brand'],
                          embeddings_metadata='metadata.tsv',
                          embeddings_data=x_test)

In [20]:
# Build the model: a single hidden layer feeding a softmax classifier.
model = Sequential([
    # Hidden layer: 512 units over the max_words-wide input. It is named
    # 'Brand' so the TensorBoard callback can find it by name.
    Dense(512, input_shape=(max_words,), name='Brand'),
    Activation('relu'),
    # Drop half of the hidden activations during training.
    Dropout(0.5),
    # Output layer: one unit per class, normalised by softmax.
    Dense(num_classes),
    Activation('softmax'),
])

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

W0416 13:31:44.338338 139637095679744 deprecation.py:506] From /usr/local/lib/python3.5/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [21]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Brand (Dense)                (None, 512)               512512    
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 12)                6156      
_________________________________________________________________
activation_2 (Activation)    (None, 12)                0         
Total params: 518,668
Trainable params: 518,668
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Train the classifier. 10% of the training rows are held out for validation,
# and the TensorBoard callback logs during the run. The call is left as the
# cell's final expression so the returned History object is displayed.
model.fit(
    x_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[tensorboard],
    verbose=1,
)

Train on 50148 samples, validate on 5573 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


W0416 13:34:42.343876 139637095679744 deprecation.py:323] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/training/saver.py:965: remove_checkpoint (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to delete files with this prefix.


Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7eff5e911ef0>

In [23]:
# Evaluate on the held-out test split. `score` holds the loss first, then the
# metrics passed to compile(), so index 1 is the accuracy.
score = model.evaluate(
    x_test,
    y_test,
    batch_size=batch_size,
    verbose=1,
)
print('Test accuracy:', score[1])

Test accuracy: 0.622
