In [112]:
# This script is to build a multi classification model using neural links
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns 
from keras import layers, models

In [113]:
# Read the data
data_starbucks = pd.read_csv("out\processed_data_starbucks.csv", encoding = 'cp1252')
data_starbucks.head()

Unnamed: 0.1,Unnamed: 0,Beverage_category,Beverage,Beverage_prep,Calories,Total Fat (g),Trans Fat (g),Saturated Fat (g),Sodium (mg),Total Carbohydrates (g),Cholesterol (mg),Dietary Fibre (g),Sugars (g),Protein (g),Vitamin A (% DV),Vitamin C (% DV),Calcium (% DV),Iron (% DV),Caffeine (mg)
0,0,Coffee,Brewed Coffee,Short,3,0.1,0.0,0.0,0,5,0,0,0,0.3,0.0,0.0,0.0,0.0,175.0
1,1,Coffee,Brewed Coffee,Tall,4,0.1,0.0,0.0,0,10,0,0,0,0.5,0.0,0.0,0.0,0.0,260.0
2,2,Coffee,Brewed Coffee,Grande,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0.0,0.0,0.0,0.0,330.0
3,3,Coffee,Brewed Coffee,Venti,5,0.1,0.0,0.0,0,10,0,0,0,1.0,0.0,0.0,0.02,0.0,410.0
4,4,Classic Espresso Drinks,CaffÃƒÂ¨ Latte,Short Nonfat Milk,70,0.1,0.1,0.0,5,75,10,0,9,6.0,0.1,0.0,0.2,0.0,75.0


In [114]:
# Split in features and labels
X = data_starbucks.iloc[:,4:].values # Values important as a way to get ready the inputs for the model
y = data_starbucks["Beverage_category"].values
#y = y.reshape(-1, 1)

In [115]:
print(data_starbucks["Beverage_category"].unique())

['Coffee' 'Classic Espresso Drinks' 'Signature Espresso Drinks'
 'TazoÃ‚Â® Tea Drinks' 'Shaken Iced Beverages' 'Smoothies'
 'FrappuccinoÃ‚Â® Blended Coffee' 'FrappuccinoÃ‚Â® Light Blended Coffee'
 'FrappuccinoÃ‚Â® Blended CrÃƒÂ¨me']


In [116]:
y = pd.DataFrame(y)
y.replace(['Coffee', 'Classic Espresso Drinks', "Signature Espresso Drinks", "TazoÃ‚Â® Tea Drinks", "Shaken Iced Beverages", 
           "Smoothies", "FrappuccinoÃ‚Â® Blended Coffee", "FrappuccinoÃ‚Â® Light Blended Coffee", "FrappuccinoÃ‚Â® Blended CrÃƒÂ¨me"], 
          [0, 1, 2, 3, 4, 5, 6, 7, 8], inplace=True)
y

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,1
...,...
236,8
237,8
238,8
239,8


In [120]:
from keras.utils.np_utils import to_categorical
y_onehot = to_categorical(y)
y_onehot[0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [136]:
from sklearn.model_selection import train_test_split

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size = 0.2, random_state = 0)

# Split Train between train and validate / 49 is the 20% of the balanced dataset
X_val = X_train[:49]
partial_x_train = X_train[49:]

y_val = y_train[:49]
partial_y_train = y_train[49:]

In [150]:
# initial neurallink + Dropout to avoid overfitting
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(15,))) # related with datatrain shape
model.add(layers.Dropout(0.2))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(9, activation='softmax'))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])


In [165]:
# Training the model
proccesing = model.fit(partial_x_train,
                   partial_y_train,
                   epochs=20,
                   batch_size=50,
                   validation_data=(X_val,y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [166]:
# Evaluate with test
model.evaluate(X_test, y_test)



[0.8193354606628418, 0.7551020383834839]

In [167]:
# Predict
predictions = model.predict(X_test)



In [168]:
# La prediccion = 3 = TazoÃ‚Â® Tea Drinks
print(np.argmax(predictions[0]))
print(y_test[0])


3
[0. 0. 0. 1. 0. 0. 0. 0. 0.]


In [182]:
# Manual Cross validation due to ram problems
k = 10
num_val_samples = len(partial_x_train) // k
num_epoch = 20
all_history = []
for i in range(k):
    print("Fold " , i)
    val_data = partial_x_train[i*num_val_samples: (i+1) * num_val_samples] 
    val_targets = partial_y_train[i*num_val_samples: (i+1) * num_val_samples]
    
    partial_train_data = np.concatenate( 
    [partial_x_train[:i * num_val_samples],
     partial_x_train[(i+1) * num_val_samples:]],
     axis= 0 
    )
    
    partial_train_targets = np.concatenate(
    [partial_y_train[:i * num_val_samples],
     partial_y_train[(i+1) * num_val_samples:]],
     axis= 0   
    )  
    
    history = model.fit(partial_train_data, partial_train_targets, epochs=num_epoch, batch_size =50,
                        validation_data = (val_data, val_targets),
                        verbose=0)
    all_history.append(history.history['val_acc'])

Fold  0
Fold  1
Fold  2
Fold  3
Fold  4
Fold  5
Fold  6
Fold  7
Fold  8
Fold  9


In [189]:
# Mean per line to cross validation
all_acc_avg = pd.DataFrame(all_history).mean(axis=0) 
all_acc_avg

0     0.928571
1     0.914286
2     0.935714
3     0.921429
4     0.935714
5     0.935714
6     0.914286
7     0.914286
8     0.921429
9     0.907143
10    0.907143
11    0.921429
12    0.907143
13    0.900000
14    0.900000
15    0.907143
16    0.914286
17    0.907143
18    0.900000
19    0.892857
dtype: float64

In [190]:
# I made a "manual" cross validation, due to problems with ram and KerasClassifier. Nevertheless the code is below this box.

### Conclusion ###
#It was possible to make a multiclass classification through the use of a neural network with a great accuracy

In [169]:
# Before cross validation
from keras.wrappers.scikit_learn import KerasClassifier
classifier = KerasClassifier(build_fn=model, epochs=20, batch_size=50)

  classifier = KerasClassifier(build_fn=model, epochs=20, batch_size=50)


In [173]:
# GPU connection
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[]

In [176]:
 # Save model
from keras.models import load_model
model.save('model.h5')
classifier_final = load_model('model.h5')

In [178]:
# Cross (There are problem with the RAM)
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

kfold_validacion=KFold(n_splits=10, shuffle=True)
results = cross_val_score(classifier, partial_x_train, partial_y_train, cv=kfold_validacion)
#print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

#results = cross_validate(classifier, partial_x_train, partial_y_train, cv=kfold_validacion, return_train_score=True, scoring="accuracy")
# print(results)
# print(results.mean())

INFO:tensorflow:Assets written to: ram://5ba6f372-9195-4bb7-8f14-95272e9e7015/assets


FileNotFoundError: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for ram://a1dd5ffb-5239-40c4-8dce-c766762b0d44/variables/variables
 You may be trying to load on a different device from the computational device. Consider setting the `experimental_io_device` option in `tf.saved_model.LoadOptions` to the io_device such as '/job:localhost'.