In [None]:
# TensorFlow / Keras; the three aliases below shorten Keras helpers used later in the notebook.
import tensorflow as tf

imdb = tf.keras.datasets.imdb
to_categorical = tf.keras.utils.to_categorical
sequence = tf.keras.preprocessing.sequence

# Plotly (interactive figures).
import plotly
import plotly.graph_objs as go
import plotly.express as px

# Matplotlib (static figures).
from matplotlib import pyplot

import numpy

# scikit-learn: synthetic dataset generators and the train/validation splitter.
from sklearn.datasets import make_circles, make_blobs
from sklearn.model_selection import train_test_split

from pandas import DataFrame
import pandas as pd

# Silence every Python warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Ask TensorFlow to grow GPU memory on demand on every visible GPU,
# then report how many physical and logical GPUs are available.
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(f"{len(gpus)} Physical GPUs, {len(logical_gpus)} Logical GPUs")
    except RuntimeError as err:
        # Report the failure instead of aborting the notebook.
        print(err)

In [None]:
# top_words caps the vocabulary passed to the IMDB loader;
# max_words is the fixed review length used when padding later on.
top_words = 5000
max_words = 500
(x_train, y_train), (x_val, y_val) = imdb.load_data(num_words=top_words)

tuple(split.shape for split in (x_train, y_train, x_val, y_val))

In [None]:
# Fetch the IMDB word index once and preview it (entry count plus the first
# ten items) instead of echoing the complete dictionary into the cell output.
word_index = imdb.get_word_index()
len(word_index), dict(list(word_index.items())[:10])

In [None]:
# Stack the training and validation reviews, then count the distinct
# word ids that occur across all of them.
x = numpy.concatenate([x_train, x_val])

print("Number of words:", numpy.unique(numpy.hstack(x)).size)

In [None]:
# Summarise review lengths (mean and standard deviation), draw a boxplot
# of the lengths and save the figure to disk.
print("Review length: ")
result = list(map(len, x))
print("Mean %.2f words (%f)" % (numpy.mean(result), numpy.std(result)))

pyplot.boxplot(result)

pyplot.savefig('review_length.png', dpi=300)

In [None]:
# Bring every review in both splits to a common length of max_words.
x_train, x_val = (
    sequence.pad_sequences(split, maxlen=max_words)
    for split in (x_train, x_val)
)

x_train.shape, y_train.shape

In [None]:
# Binary classifier on the padded reviews: one 4-unit ReLU hidden layer
# followed by a single sigmoid output unit.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(4, activation='relu', input_dim=max_words),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 200 epochs with batches of 128, passing the validation split
# to Keras; the returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=128,
    epochs=200,
    verbose=2,
)


In [None]:
# Loss curves from `history`: training loss in blue, validation loss in red.
h1 = go.Scatter(
    y=history.history["loss"],
    name="loss",
    mode="lines",
    line={"color": "blue", "width": 2},
)
h2 = go.Scatter(
    y=history.history["val_loss"],
    name="val_loss",
    mode="lines",
    line={"color": "red", "width": 2},
)

data = [h1, h2]
layout1 = go.Layout(
    title="Loss",
    xaxis={"title": "Epochs"},
    yaxis={"title": ""},
)
fig1 = go.Figure(data=data, layout=layout1)
plotly.offline.iplot(fig1, filename="Underfit")


In [None]:
# Accuracy curves from `history`: training accuracy in blue, validation accuracy in red.
h1 = go.Scatter(
    y=history.history["accuracy"],
    name="acc",
    mode="lines",
    line={"color": "blue", "width": 2},
)
h2 = go.Scatter(
    y=history.history["val_accuracy"],
    name="val_acc",
    mode="lines",
    line={"color": "red", "width": 2},
)

data = [h1, h2]
layout1 = go.Layout(
    title="Accuracy",
    xaxis={"title": "Epochs"},
    yaxis={"title": ""},
)
fig1 = go.Figure(data=data, layout=layout1)
plotly.offline.iplot(fig1, filename="Underfit")


# **Overfit Learning Curve**

In [None]:
# 500 noisy two-class "circles" samples; random_state fixes the generated points.
x, y = make_circles(
    n_samples=500,
    noise=0.2,
    random_state=1,
)

In [None]:
# Shuffle and split the samples 50/50 into training and validation sets.
# random_state is fixed so the same split (and therefore comparable learning
# curves) is produced on every run; change the seed to draw a different split.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.5, shuffle=True, random_state=1
)
x_train.shape, y_train.shape, x_val.shape, y_val.shape

In [None]:
# Put the training features and labels side by side in one frame; the label
# column is cast to str so it holds strings rather than numbers.
x_train_pd = pd.DataFrame(data=x_train, columns=["x", "y"])
y_train_pd = pd.DataFrame(data=y_train, columns=["class"])

df = pd.concat([x_train_pd, y_train_pd], axis="columns").astype({"class": str})

In [None]:
# Scatter plot of the training points, coloured by class label.
fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="class",
)
fig.show()

In [None]:
# Binary classifier for the 2-feature data: two ReLU hidden layers
# (60 and 30 units) followed by a single sigmoid output unit.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(60, activation='relu', input_dim=2),
    tf.keras.layers.Dense(30, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Train for 500 epochs, passing the validation split to Keras;
# the returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=500,
    verbose=1,
)


In [None]:
# Loss curves from `history`: training loss in blue, validation loss in red.
h1 = go.Scatter(
    y=history.history["loss"],
    name="loss",
    mode="lines",
    line={"color": "blue", "width": 2},
)
h2 = go.Scatter(
    y=history.history["val_loss"],
    name="val_loss",
    mode="lines",
    line={"color": "red", "width": 2},
)

data = [h1, h2]
layout1 = go.Layout(
    title="Loss",
    xaxis={"title": "Epochs"},
    yaxis={"title": ""},
)
fig1 = go.Figure(data=data, layout=layout1)
plotly.offline.iplot(fig1, filename="Overfit")

In [None]:
# Accuracy curves from `history`: training accuracy in blue, validation accuracy in red.
h1 = go.Scatter(
    y=history.history["accuracy"],
    name="acc",
    mode="lines",
    line={"color": "blue", "width": 2},
)
h2 = go.Scatter(
    y=history.history["val_accuracy"],
    name="val_acc",
    mode="lines",
    line={"color": "red", "width": 2},
)

data = [h1, h2]
layout1 = go.Layout(
    title="Accuracy",
    xaxis={"title": "Epochs"},
    yaxis={"title": ""},
)
fig1 = go.Figure(data=data, layout=layout1)
plotly.offline.iplot(fig1, filename="Overfit")

# **Good Fit Learning Curve**

In [None]:
# 3000 two-feature samples drawn around 3 centers (std 2); random_state fixes the generated points.
x, y = make_blobs(
    n_samples=3000,
    n_features=2,
    centers=3,
    cluster_std=2,
    random_state=2,
)

In [None]:
# Shuffle and hold out 40% of the samples for validation.
# random_state is fixed so the same split (and therefore comparable learning
# curves) is produced on every run; change the seed to draw a different split.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.4, shuffle=True, random_state=2
)

x_train.shape, x_val.shape, y_train.shape, y_val.shape

In [None]:
# Put the training features and labels side by side in one frame; the label
# column is cast to str so it holds strings rather than numbers.
x_train_pd = pd.DataFrame(data=x_train, columns=["x", "y"])
y_train_pd = pd.DataFrame(data=y_train, columns=["class"])

df = pd.concat([x_train_pd, y_train_pd], axis="columns").astype({"class": str})

In [None]:
# Scatter plot of the training points, coloured by class label.
fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="class",
)
fig.show()

In [None]:
# One-hot encode the integer labels. num_classes is pinned to 3 (the data has
# 3 blob centers and the network a 3-unit softmax) so each target always has
# 3 columns, even if a split happens to lack the highest label.
y_train = to_categorical(y_train, num_classes=3)
y_val = to_categorical(y_val, num_classes=3)

In [None]:
# 3-class classifier for the 2-feature data: one 50-unit ReLU hidden layer
# (he_uniform initialisation) followed by a 3-unit softmax output.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(
        50,
        activation='relu',
        kernel_initializer='he_uniform',
        input_dim=2,
    ),
    tf.keras.layers.Dense(3, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 500 epochs, passing the validation split to Keras;
# the returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=500,
    verbose=1,
)

In [None]:
# Loss curves from `history`: training loss in blue, validation loss in red.
h1 = go.Scatter(
    y=history.history["loss"],
    name="loss",
    mode="lines",
    line={"color": "blue", "width": 2},
)
h2 = go.Scatter(
    y=history.history["val_loss"],
    name="val_loss",
    mode="lines",
    line={"color": "red", "width": 2},
)

data = [h1, h2]
layout1 = go.Layout(
    title="Loss",
    # Capitalised to match the x-axis title used by every other plot in this notebook.
    xaxis={"title": "Epochs"},
    yaxis={"title": ""},
)
fig1 = go.Figure(data=data, layout=layout1)
plotly.offline.iplot(fig1, filename="Good Fit")

In [None]:
# Accuracy curves from `history`: training accuracy in blue, validation accuracy in red.
h1 = go.Scatter(
    y=history.history["accuracy"],
    name="acc",
    mode="lines",
    line={"color": "blue", "width": 2},
)
h2 = go.Scatter(
    y=history.history["val_accuracy"],
    name="val_acc",
    mode="lines",
    line={"color": "red", "width": 2},
)

data = [h1, h2]
layout1 = go.Layout(
    title="Accuracy",
    xaxis={"title": "Epochs"},
    yaxis={"title": ""},
)
fig1 = go.Figure(data=data, layout=layout1)
plotly.offline.iplot(fig1, filename="Good Fit")

# **Unrepresentative Train Dataset**

In [None]:
# Only 100 two-feature samples drawn around 3 centers (std 2); random_state fixes the generated points.
x, y = make_blobs(
    n_samples=100,
    n_features=2,
    centers=3,
    cluster_std=2,
    random_state=2,
)

In [None]:
# Shuffle and split the samples 50/50 into training and validation sets.
# random_state is fixed so the same split (and therefore comparable learning
# curves) is produced on every run; change the seed to draw a different split.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.5, shuffle=True, random_state=2
)

x_train.shape, x_val.shape, y_train.shape, y_val.shape

In [None]:
# Put the training features and labels side by side in one frame; the label
# column is cast to str so it holds strings rather than numbers.
x_train_pd = pd.DataFrame(data=x_train, columns=["x", "y"])
y_train_pd = pd.DataFrame(data=y_train, columns=["class"])

df = pd.concat([x_train_pd, y_train_pd], axis="columns").astype({"class": str})

In [None]:
# Scatter plot of the training points, coloured by class label.
fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="class",
)
fig.show()

In [None]:
# One-hot encode the integer labels. num_classes is pinned to 3 (the data has
# 3 blob centers and the network a 3-unit softmax) so each target always has
# 3 columns, even if a split happens to lack the highest label.
y_train = to_categorical(y_train, num_classes=3)
y_val = to_categorical(y_val, num_classes=3)

In [None]:
# 3-class classifier for the 2-feature data: one 50-unit ReLU hidden layer
# (he_uniform initialisation) followed by a 3-unit softmax output.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(
        50,
        activation='relu',
        kernel_initializer='he_uniform',
        input_dim=2,
    ),
    tf.keras.layers.Dense(3, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 200 epochs, passing the validation split to Keras;
# the returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=200,
    verbose=1,
)

In [None]:
# Loss curves from `history`: training loss in blue, validation loss in red.
h1 = go.Scatter(
    y=history.history["loss"],
    name="loss",
    mode="lines",
    line={"color": "blue", "width": 2},
)
h2 = go.Scatter(
    y=history.history["val_loss"],
    name="val_loss",
    mode="lines",
    line={"color": "red", "width": 2},
)

data = [h1, h2]
layout1 = go.Layout(
    title="Loss",
    xaxis={"title": "Epochs"},
    yaxis={"title": ""},
)
fig1 = go.Figure(data=data, layout=layout1)
plotly.offline.iplot(fig1, filename="Unrepresentative Train Dataset")

In [None]:
# Accuracy curves from `history`: training accuracy in blue, validation accuracy in red.
h1 = go.Scatter(
    y=history.history["accuracy"],
    name="acc",
    mode="lines",
    line={"color": "blue", "width": 2},
)
h2 = go.Scatter(
    y=history.history["val_accuracy"],
    name="val_acc",
    mode="lines",
    line={"color": "red", "width": 2},
)

data = [h1, h2]
layout1 = go.Layout(
    title="Accuracy",
    xaxis={"title": "Epochs"},
    yaxis={"title": ""},
)
fig1 = go.Figure(data=data, layout=layout1)
plotly.offline.iplot(fig1, filename="Unrepresentative Train Dataset")

# **Unrepresentative Validation Dataset**

In [None]:
# 500 two-feature samples drawn around 3 centers with a wide spread (std 10);
# random_state fixes the generated points.
x, y = make_blobs(
    n_samples=500,
    n_features=2,
    centers=3,
    cluster_std=10,
    random_state=2,
)

In [None]:
# Shuffle and hold out only 5% of the samples for validation.
# random_state is fixed so the same split (and therefore comparable learning
# curves) is produced on every run; change the seed to draw a different split.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.05, shuffle=True, random_state=2
)

x_train.shape, x_val.shape, y_train.shape, y_val.shape

In [None]:
# Put the training features and labels side by side in one frame; the label
# column is cast to str so it holds strings rather than numbers.
x_train_pd = pd.DataFrame(data=x_train, columns=["x", "y"])
y_train_pd = pd.DataFrame(data=y_train, columns=["class"])

df = pd.concat([x_train_pd, y_train_pd], axis="columns").astype({"class": str})

In [None]:
# Scatter plot of the training points, coloured by class label.
fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="class",
)
fig.show()

**สถานการณ์ที่ 1 (Validation Dataset น้อย และไม่สามารถเป็นตัวแทนของ Dataset ทั้งหมดได้)**

In [None]:
# One-hot encode the integer labels. num_classes is pinned to 3 (the data has
# 3 blob centers and the network a 3-unit softmax) so each target always has
# 3 columns, even if the small validation split happens to lack the highest label.
y_train = to_categorical(y_train, num_classes=3)
y_val = to_categorical(y_val, num_classes=3)

In [None]:
# 3-class classifier for the 2-feature data: one 50-unit ReLU hidden layer
# (he_uniform initialisation) followed by a 3-unit softmax output.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(
        50,
        activation='relu',
        kernel_initializer='he_uniform',
        input_dim=2,
    ),
    tf.keras.layers.Dense(3, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 200 epochs, passing the validation split to Keras;
# the returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=200,
    verbose=1,
)

In [None]:
# Loss curves from `history`: training loss in blue, validation loss in red.
h1 = go.Scatter(
    y=history.history["loss"],
    name="loss",
    mode="lines",
    line={"color": "blue", "width": 2},
)
h2 = go.Scatter(
    y=history.history["val_loss"],
    name="val_loss",
    mode="lines",
    line={"color": "red", "width": 2},
)

data = [h1, h2]
layout1 = go.Layout(
    title="Loss",
    xaxis={"title": "Epochs"},
    yaxis={"title": ""},
)
fig1 = go.Figure(data=data, layout=layout1)
plotly.offline.iplot(fig1, filename="Unrepresentative Validation Dataset")

In [None]:
# Accuracy curves from `history`: training accuracy in blue, validation accuracy in red.
h1 = go.Scatter(
    y=history.history["accuracy"],
    name="acc",
    mode="lines",
    line={"color": "blue", "width": 2},
)
h2 = go.Scatter(
    y=history.history["val_accuracy"],
    name="val_acc",
    mode="lines",
    line={"color": "red", "width": 2},
)

data = [h1, h2]
layout1 = go.Layout(
    title="Accuracy",
    xaxis={"title": "Epochs"},
    yaxis={"title": ""},
)
fig1 = go.Figure(data=data, layout=layout1)
plotly.offline.iplot(fig1, filename="Unrepresentative Validation Dataset")

**สถานการณ์ที่ 2 (Validation Dataset น้อย และง่ายเกินไป)**

In [None]:
# 400 two-feature samples drawn around 3 centers (std 2); random_state fixes the generated points.
x, y = make_blobs(
    n_samples=400,
    n_features=2,
    centers=3,
    cluster_std=2,
    random_state=2,
)

In [None]:
# Shuffle and hold out only 3% of the samples for validation.
# random_state is fixed so the same split (and therefore comparable learning
# curves) is produced on every run; change the seed to draw a different split.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.03, shuffle=True, random_state=2
)

x_train.shape, x_val.shape, y_train.shape, y_val.shape

In [None]:
# Put the training features and labels side by side in one frame; the label
# column is cast to str so it holds strings rather than numbers.
x_train_pd = pd.DataFrame(data=x_train, columns=["x", "y"])
y_train_pd = pd.DataFrame(data=y_train, columns=["class"])

df = pd.concat([x_train_pd, y_train_pd], axis="columns").astype({"class": str})

In [None]:
# Scatter plot of the training points, coloured by class label.
fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="class",
)
fig.show()

In [None]:
# One-hot encode the integer labels. num_classes is pinned to 3 (the data has
# 3 blob centers and the network a 3-unit softmax): with a 3% validation split
# the highest label can be missing from y_val, and the inferred class count
# would then give a target with fewer than 3 columns.
y_train = to_categorical(y_train, num_classes=3)
y_val = to_categorical(y_val, num_classes=3)

In [None]:
# 3-class classifier for the 2-feature data: one 50-unit ReLU hidden layer
# (he_uniform initialisation) followed by a 3-unit softmax output.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(
        50,
        activation='relu',
        kernel_initializer='he_uniform',
        input_dim=2,
    ),
    tf.keras.layers.Dense(3, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 200 epochs, passing the validation split to Keras;
# the returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=200,
    verbose=1,
)

In [None]:
# Loss curves from `history`: training loss in blue, validation loss in red.
h1 = go.Scatter(
    y=history.history["loss"],
    name="loss",
    mode="lines",
    line={"color": "blue", "width": 2},
)
h2 = go.Scatter(
    y=history.history["val_loss"],
    name="val_loss",
    mode="lines",
    line={"color": "red", "width": 2},
)

data = [h1, h2]
layout1 = go.Layout(
    title="Loss",
    xaxis={"title": "Epochs"},
    yaxis={"title": ""},
)
fig1 = go.Figure(data=data, layout=layout1)
plotly.offline.iplot(fig1, filename="Unrepresentative Validation Dataset")

In [None]:
# Accuracy curves from `history`: training accuracy in blue, validation accuracy in red.
h1 = go.Scatter(
    y=history.history["accuracy"],
    name="acc",
    mode="lines",
    line={"color": "blue", "width": 2},
)
h2 = go.Scatter(
    y=history.history["val_accuracy"],
    name="val_acc",
    mode="lines",
    line={"color": "red", "width": 2},
)

data = [h1, h2]
layout1 = go.Layout(
    title="Accuracy",
    xaxis={"title": "Epochs"},
    yaxis={"title": ""},
)
fig1 = go.Figure(data=data, layout=layout1)
plotly.offline.iplot(fig1, filename="Unrepresentative Validation Dataset")